xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 6fffb01e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/intel-svm.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 
49 #include "../irq_remapping.h"
50 #include "../iommu-sva-lib.h"
51 #include "pasid.h"
52 #include "cap_audit.h"
53 
54 #define ROOT_SIZE		VTD_PAGE_SIZE
55 #define CONTEXT_SIZE		VTD_PAGE_SIZE
56 
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 
62 #define IOAPIC_RANGE_START	(0xfee00000)
63 #define IOAPIC_RANGE_END	(0xfeefffff)
64 #define IOVA_START_ADDR		(0x1000)
65 
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
77 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
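/*
 * For example, with gaw == 48 (4-level paging): __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48) is (1ULL << 48) - VTD_PAGE_SIZE,
 * i.e. the base address of the last 4KiB page in the 48-bit IOVA space.
 */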
79 
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN		(1)
82 
83 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
84 
85 /* page table handling */
86 #define LEVEL_STRIDE		(9)
87 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
88 
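/*
 * AGAW (adjusted guest address width) encodes the number of page-table
 * levels: level = agaw + 2 and width = 30 + agaw * LEVEL_STRIDE bits.
 * E.g. agaw == 2 means a 4-level table covering a 48-bit address space.
 */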
89 static inline int agaw_to_level(int agaw)
90 {
91 	return agaw + 2;
92 }
93 
94 static inline int agaw_to_width(int agaw)
95 {
96 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
97 }
98 
99 static inline int width_to_agaw(int width)
100 {
101 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
102 }
103 
104 static inline unsigned int level_to_offset_bits(int level)
105 {
106 	return (level - 1) * LEVEL_STRIDE;
107 }
108 
109 static inline int pfn_level_offset(u64 pfn, int level)
110 {
111 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
112 }
113 
114 static inline u64 level_mask(int level)
115 {
116 	return -1ULL << level_to_offset_bits(level);
117 }
118 
119 static inline u64 level_size(int level)
120 {
121 	return 1ULL << level_to_offset_bits(level);
122 }
123 
124 static inline u64 align_to_level(u64 pfn, int level)
125 {
126 	return (pfn + level_size(level) - 1) & level_mask(level);
127 }
128 
129 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
130 {
131 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
132 }
133 
134 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
135    are never going to work. */
136 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
137 {
138 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
139 }
140 
141 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
142 {
143 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
144 }
145 static inline unsigned long page_to_dma_pfn(struct page *pg)
146 {
147 	return mm_to_dma_pfn(page_to_pfn(pg));
148 }
149 static inline unsigned long virt_to_dma_pfn(void *p)
150 {
151 	return page_to_dma_pfn(virt_to_page(p));
152 }
153 
154 /* global iommu list, set NULL for ignored DMAR units */
155 static struct intel_iommu **g_iommus;
156 
157 static void __init check_tylersburg_isoch(void);
158 static int rwbf_quirk;
159 static inline struct device_domain_info *
160 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
161 
162 /*
163  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
164  * (used when the kernel is launched with TXT).
165  */
166 static int force_on = 0;
167 static int intel_iommu_tboot_noforce;
168 static int no_platform_optin;
169 
170 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
171 
172 /*
173  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
174  * if marked present.
175  */
176 static phys_addr_t root_entry_lctp(struct root_entry *re)
177 {
178 	if (!(re->lo & 1))
179 		return 0;
180 
181 	return re->lo & VTD_PAGE_MASK;
182 }
183 
184 /*
185  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
186  * if marked present.
187  */
188 static phys_addr_t root_entry_uctp(struct root_entry *re)
189 {
190 	if (!(re->hi & 1))
191 		return 0;
192 
193 	return re->hi & VTD_PAGE_MASK;
194 }
195 
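/*
 * Bit usage of the legacy-mode context entry as manipulated by the helpers
 * below: lo bit 0 = present, lo bit 1 = fault processing disable (cleared
 * by context_set_fault_enable()), lo bits 3:2 = translation type, lo bit 11
 * = PASID enable, lo bits 63:12 = context/page-table address root; hi bits
 * 2:0 = address width, hi bit 3 = software "copied from old kernel" marker,
 * hi bits 23:8 = domain id.
 */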
196 static inline void context_clear_pasid_enable(struct context_entry *context)
197 {
198 	context->lo &= ~(1ULL << 11);
199 }
200 
201 static inline bool context_pasid_enabled(struct context_entry *context)
202 {
203 	return !!(context->lo & (1ULL << 11));
204 }
205 
206 static inline void context_set_copied(struct context_entry *context)
207 {
208 	context->hi |= (1ull << 3);
209 }
210 
211 static inline bool context_copied(struct context_entry *context)
212 {
213 	return !!(context->hi & (1ULL << 3));
214 }
215 
216 static inline bool __context_present(struct context_entry *context)
217 {
218 	return (context->lo & 1);
219 }
220 
221 bool context_present(struct context_entry *context)
222 {
223 	return context_pasid_enabled(context) ?
224 	     __context_present(context) :
225 	     __context_present(context) && !context_copied(context);
226 }
227 
228 static inline void context_set_present(struct context_entry *context)
229 {
230 	context->lo |= 1;
231 }
232 
233 static inline void context_set_fault_enable(struct context_entry *context)
234 {
235 	context->lo &= (((u64)-1) << 2) | 1;
236 }
237 
238 static inline void context_set_translation_type(struct context_entry *context,
239 						unsigned long value)
240 {
241 	context->lo &= (((u64)-1) << 4) | 3;
242 	context->lo |= (value & 3) << 2;
243 }
244 
245 static inline void context_set_address_root(struct context_entry *context,
246 					    unsigned long value)
247 {
248 	context->lo &= ~VTD_PAGE_MASK;
249 	context->lo |= value & VTD_PAGE_MASK;
250 }
251 
252 static inline void context_set_address_width(struct context_entry *context,
253 					     unsigned long value)
254 {
255 	context->hi |= value & 7;
256 }
257 
258 static inline void context_set_domain_id(struct context_entry *context,
259 					 unsigned long value)
260 {
261 	context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263 
264 static inline int context_domain_id(struct context_entry *c)
265 {
266 	return((c->hi >> 8) & 0xffff);
267 }
268 
269 static inline void context_clear_entry(struct context_entry *context)
270 {
271 	context->lo = 0;
272 	context->hi = 0;
273 }
274 
275 /*
276  * This domain is a static identity mapping domain.
277  *	1. This domain creates a static 1:1 mapping to all usable memory.
278  *	2. It maps to each iommu if successful.
279  *	3. Each iommu maps to this domain if successful.
280  */
281 static struct dmar_domain *si_domain;
282 static int hw_pass_through = 1;
283 
284 #define for_each_domain_iommu(idx, domain)			\
285 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
286 		if (domain->iommu_refcnt[idx])
287 
288 struct dmar_rmrr_unit {
289 	struct list_head list;		/* list of rmrr units	*/
290 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
291 	u64	base_address;		/* reserved base address*/
292 	u64	end_address;		/* reserved end address */
293 	struct dmar_dev_scope *devices;	/* target devices */
294 	int	devices_cnt;		/* target device count */
295 };
296 
297 struct dmar_atsr_unit {
298 	struct list_head list;		/* list of ATSR units */
299 	struct acpi_dmar_header *hdr;	/* ACPI header */
300 	struct dmar_dev_scope *devices;	/* target devices */
301 	int devices_cnt;		/* target device count */
302 	u8 include_all:1;		/* include all ports */
303 };
304 
305 struct dmar_satc_unit {
306 	struct list_head list;		/* list of SATC units */
307 	struct acpi_dmar_header *hdr;	/* ACPI header */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	struct intel_iommu *iommu;	/* the corresponding iommu */
310 	int devices_cnt;		/* target device count */
311 	u8 atc_required:1;		/* ATS is required */
312 };
313 
314 static LIST_HEAD(dmar_atsr_units);
315 static LIST_HEAD(dmar_rmrr_units);
316 static LIST_HEAD(dmar_satc_units);
317 
318 #define for_each_rmrr_units(rmrr) \
319 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
320 
321 /* number of IOMMUs in the system; used to size and index g_iommus */
322 static int g_num_of_iommus;
323 
324 static void domain_exit(struct dmar_domain *domain);
325 static void domain_remove_dev_info(struct dmar_domain *domain);
326 static void dmar_remove_one_dev_info(struct device *dev);
327 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
328 static int intel_iommu_attach_device(struct iommu_domain *domain,
329 				     struct device *dev);
330 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
331 					    dma_addr_t iova);
332 
333 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
334 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
335 
336 int intel_iommu_enabled = 0;
337 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
338 
339 static int dmar_map_gfx = 1;
340 static int intel_iommu_superpage = 1;
341 static int iommu_identity_mapping;
342 static int iommu_skip_te_disable;
343 
344 #define IDENTMAP_GFX		2
345 #define IDENTMAP_AZALIA		4
346 
347 int intel_iommu_gfx_mapped;
348 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
349 
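/*
 * Sentinel priv value marking a device whose domain attachment is deferred
 * (see attach_deferred()); get_domain_info() below returns NULL for it so
 * callers never dereference the marker.
 */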
350 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
351 struct device_domain_info *get_domain_info(struct device *dev)
352 {
353 	struct device_domain_info *info;
354 
355 	if (!dev)
356 		return NULL;
357 
358 	info = dev_iommu_priv_get(dev);
359 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
360 		return NULL;
361 
362 	return info;
363 }
364 
365 DEFINE_SPINLOCK(device_domain_lock);
366 static LIST_HEAD(device_domain_list);
367 
368 /*
369  * Iterate over elements in device_domain_list and call the specified
370  * callback @fn against each element.
371  */
372 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
373 				     void *data), void *data)
374 {
375 	int ret = 0;
376 	unsigned long flags;
377 	struct device_domain_info *info;
378 
379 	spin_lock_irqsave(&device_domain_lock, flags);
380 	list_for_each_entry(info, &device_domain_list, global) {
381 		ret = fn(info, data);
382 		if (ret) {
383 			spin_unlock_irqrestore(&device_domain_lock, flags);
384 			return ret;
385 		}
386 	}
387 	spin_unlock_irqrestore(&device_domain_lock, flags);
388 
389 	return 0;
390 }
391 
392 const struct iommu_ops intel_iommu_ops;
393 
394 static bool translation_pre_enabled(struct intel_iommu *iommu)
395 {
396 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
397 }
398 
399 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
400 {
401 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
402 }
403 
404 static void init_translation_status(struct intel_iommu *iommu)
405 {
406 	u32 gsts;
407 
408 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
409 	if (gsts & DMA_GSTS_TES)
410 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
411 }
412 
413 static int __init intel_iommu_setup(char *str)
414 {
415 	if (!str)
416 		return -EINVAL;
417 
418 	while (*str) {
419 		if (!strncmp(str, "on", 2)) {
420 			dmar_disabled = 0;
421 			pr_info("IOMMU enabled\n");
422 		} else if (!strncmp(str, "off", 3)) {
423 			dmar_disabled = 1;
424 			no_platform_optin = 1;
425 			pr_info("IOMMU disabled\n");
426 		} else if (!strncmp(str, "igfx_off", 8)) {
427 			dmar_map_gfx = 0;
428 			pr_info("Disable GFX device mapping\n");
429 		} else if (!strncmp(str, "forcedac", 8)) {
430 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
431 			iommu_dma_forcedac = true;
432 		} else if (!strncmp(str, "strict", 6)) {
433 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
434 			iommu_set_dma_strict();
435 		} else if (!strncmp(str, "sp_off", 6)) {
436 			pr_info("Disable supported super page\n");
437 			intel_iommu_superpage = 0;
438 		} else if (!strncmp(str, "sm_on", 5)) {
439 			pr_info("Enable scalable mode if hardware supports\n");
440 			intel_iommu_sm = 1;
441 		} else if (!strncmp(str, "sm_off", 6)) {
442 			pr_info("Scalable mode is disallowed\n");
443 			intel_iommu_sm = 0;
444 		} else if (!strncmp(str, "tboot_noforce", 13)) {
445 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
446 			intel_iommu_tboot_noforce = 1;
447 		} else {
448 			pr_notice("Unknown option - '%s'\n", str);
449 		}
450 
451 		str += strcspn(str, ",");
452 		while (*str == ',')
453 			str++;
454 	}
455 
456 	return 1;
457 }
458 __setup("intel_iommu=", intel_iommu_setup);
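/*
 * Example (illustrative): booting with "intel_iommu=on,sm_on" both enables
 * the IOMMU and requests scalable mode; options are comma-separated and
 * handled in order by intel_iommu_setup() above.
 */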
459 
460 static struct kmem_cache *iommu_domain_cache;
461 static struct kmem_cache *iommu_devinfo_cache;
462 
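/*
 * iommu->domains is a two-level table indexed by domain id: the top level
 * is indexed by did >> 8 and each (lazily allocated) second level holds
 * 256 dmar_domain pointers indexed by did & 0xff.
 */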
463 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
464 {
465 	struct dmar_domain **domains;
466 	int idx = did >> 8;
467 
468 	domains = iommu->domains[idx];
469 	if (!domains)
470 		return NULL;
471 
472 	return domains[did & 0xff];
473 }
474 
475 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
476 			     struct dmar_domain *domain)
477 {
478 	struct dmar_domain **domains;
479 	int idx = did >> 8;
480 
481 	if (!iommu->domains[idx]) {
482 		size_t size = 256 * sizeof(struct dmar_domain *);
483 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
484 	}
485 
486 	domains = iommu->domains[idx];
487 	if (WARN_ON(!domains))
488 		return;
489 	else
490 		domains[did & 0xff] = domain;
491 }
492 
493 void *alloc_pgtable_page(int node)
494 {
495 	struct page *page;
496 	void *vaddr = NULL;
497 
498 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
499 	if (page)
500 		vaddr = page_address(page);
501 	return vaddr;
502 }
503 
504 void free_pgtable_page(void *vaddr)
505 {
506 	free_page((unsigned long)vaddr);
507 }
508 
509 static inline void *alloc_domain_mem(void)
510 {
511 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
512 }
513 
514 static void free_domain_mem(void *vaddr)
515 {
516 	kmem_cache_free(iommu_domain_cache, vaddr);
517 }
518 
519 static inline void *alloc_devinfo_mem(void)
520 {
521 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
522 }
523 
524 static inline void free_devinfo_mem(void *vaddr)
525 {
526 	kmem_cache_free(iommu_devinfo_cache, vaddr);
527 }
528 
529 static inline int domain_type_is_si(struct dmar_domain *domain)
530 {
531 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
532 }
533 
534 static inline bool domain_use_first_level(struct dmar_domain *domain)
535 {
536 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
537 }
538 
539 static inline int domain_pfn_supported(struct dmar_domain *domain,
540 				       unsigned long pfn)
541 {
542 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
543 
544 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
545 }
546 
547 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
548 {
549 	unsigned long sagaw;
550 	int agaw;
551 
552 	sagaw = cap_sagaw(iommu->cap);
553 	for (agaw = width_to_agaw(max_gaw);
554 	     agaw >= 0; agaw--) {
555 		if (test_bit(agaw, &sagaw))
556 			break;
557 	}
558 
559 	return agaw;
560 }
561 
562 /*
563  * Calculate max SAGAW for each iommu.
564  */
565 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
566 {
567 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
568 }
569 
570 /*
571  * Calculate the agaw for each iommu.
572  * "SAGAW" may differ across iommus, so use a default agaw and fall back
573  * to a smaller supported agaw for iommus that don't support the default.
574  */
575 int iommu_calculate_agaw(struct intel_iommu *iommu)
576 {
577 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
578 }
579 
580 /* This function only returns a single iommu in a domain */
581 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
582 {
583 	int iommu_id;
584 
585 	/* si_domain and vm domain should not get here. */
586 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
587 		return NULL;
588 
589 	for_each_domain_iommu(iommu_id, domain)
590 		break;
591 
592 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
593 		return NULL;
594 
595 	return g_iommus[iommu_id];
596 }
597 
598 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
599 {
600 	return sm_supported(iommu) ?
601 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
602 }
603 
604 static void domain_update_iommu_coherency(struct dmar_domain *domain)
605 {
606 	struct dmar_drhd_unit *drhd;
607 	struct intel_iommu *iommu;
608 	bool found = false;
609 	int i;
610 
611 	domain->iommu_coherency = true;
612 
613 	for_each_domain_iommu(i, domain) {
614 		found = true;
615 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
616 			domain->iommu_coherency = false;
617 			break;
618 		}
619 	}
620 	if (found)
621 		return;
622 
623 	/* No hardware attached; use lowest common denominator */
624 	rcu_read_lock();
625 	for_each_active_iommu(iommu, drhd) {
626 		if (!iommu_paging_structure_coherency(iommu)) {
627 			domain->iommu_coherency = false;
628 			break;
629 		}
630 	}
631 	rcu_read_unlock();
632 }
633 
634 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
635 {
636 	struct dmar_drhd_unit *drhd;
637 	struct intel_iommu *iommu;
638 	bool ret = true;
639 
640 	rcu_read_lock();
641 	for_each_active_iommu(iommu, drhd) {
642 		if (iommu != skip) {
643 			/*
644 			 * If the hardware is operating in the scalable mode,
645 			 * the snooping control is always supported since we
646 			 * always set the PASID-table-entry.PGSNP bit if the domain
647 			 * is externally managed (UNMANAGED).
648 			 */
649 			if (!sm_supported(iommu) &&
650 			    !ecap_sc_support(iommu->ecap)) {
651 				ret = false;
652 				break;
653 			}
654 		}
655 	}
656 	rcu_read_unlock();
657 
658 	return ret;
659 }
660 
661 static int domain_update_iommu_superpage(struct dmar_domain *domain,
662 					 struct intel_iommu *skip)
663 {
664 	struct dmar_drhd_unit *drhd;
665 	struct intel_iommu *iommu;
666 	int mask = 0x3;
667 
668 	if (!intel_iommu_superpage)
669 		return 0;
670 
671 	/* set iommu_superpage to the smallest common denominator */
672 	rcu_read_lock();
673 	for_each_active_iommu(iommu, drhd) {
674 		if (iommu != skip) {
675 			if (domain && domain_use_first_level(domain)) {
676 				if (!cap_fl1gp_support(iommu->cap))
677 					mask = 0x1;
678 			} else {
679 				mask &= cap_super_page_val(iommu->cap);
680 			}
681 
682 			if (!mask)
683 				break;
684 		}
685 	}
686 	rcu_read_unlock();
687 
688 	return fls(mask);
689 }
690 
691 static int domain_update_device_node(struct dmar_domain *domain)
692 {
693 	struct device_domain_info *info;
694 	int nid = NUMA_NO_NODE;
695 
696 	assert_spin_locked(&device_domain_lock);
697 
698 	if (list_empty(&domain->devices))
699 		return NUMA_NO_NODE;
700 
701 	list_for_each_entry(info, &domain->devices, link) {
702 		if (!info->dev)
703 			continue;
704 
705 		/*
706 		 * There may be multiple device numa nodes, as devices within
707 		 * the same domain can sit behind different IOMMUs. There is no
708 		 * perfect answer in such a situation, so go with a first-come,
709 		 * first-served policy and use the first node we find.
710 		 */
711 		nid = dev_to_node(info->dev);
712 		if (nid != NUMA_NO_NODE)
713 			break;
714 	}
715 
716 	return nid;
717 }
718 
719 static void domain_update_iotlb(struct dmar_domain *domain);
720 
721 /* Return the super pagesize bitmap if supported. */
722 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
723 {
724 	unsigned long bitmap = 0;
725 
726 	/*
727 	 * 1-level super page supports page size of 2MiB, 2-level super page
728 	 * supports page size of both 2MiB and 1GiB.
729 	 */
730 	if (domain->iommu_superpage == 1)
731 		bitmap |= SZ_2M;
732 	else if (domain->iommu_superpage == 2)
733 		bitmap |= SZ_2M | SZ_1G;
734 
735 	return bitmap;
736 }
737 
738 /* Some capabilities may be different across iommus */
739 static void domain_update_iommu_cap(struct dmar_domain *domain)
740 {
741 	domain_update_iommu_coherency(domain);
742 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
743 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
744 
745 	/*
746 	 * If RHSA is missing, we should default to the device numa domain
747 	 * as a fallback.
748 	 */
749 	if (domain->nid == NUMA_NO_NODE)
750 		domain->nid = domain_update_device_node(domain);
751 
752 	/*
753 	 * First-level translation restricts the input-address to a
754 	 * canonical address (i.e., address bits 63:N have the same
755 	 * value as address bit [N-1], where N is 48-bits with 4-level
756 	 * paging and 57-bits with 5-level paging). Hence, skip bit
757 	 * [N-1].
758 	 */
759 	if (domain_use_first_level(domain))
760 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
761 	else
762 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
763 
764 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
765 	domain_update_iotlb(domain);
766 }
767 
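/*
 * Return the context entry for (bus, devfn), allocating the context table
 * page on demand when @alloc is set. In scalable mode each root entry has
 * two halves: root->lo covers devfn 0x00-0x7f and root->hi covers devfn
 * 0x80-0xff, and every device uses two consecutive 128-bit context_entry
 * slots (hence the devfn *= 2 below).
 */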
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 					 u8 devfn, int alloc)
770 {
771 	struct root_entry *root = &iommu->root_entry[bus];
772 	struct context_entry *context;
773 	u64 *entry;
774 
775 	entry = &root->lo;
776 	if (sm_supported(iommu)) {
777 		if (devfn >= 0x80) {
778 			devfn -= 0x80;
779 			entry = &root->hi;
780 		}
781 		devfn *= 2;
782 	}
783 	if (*entry & 1)
784 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 	else {
786 		unsigned long phy_addr;
787 		if (!alloc)
788 			return NULL;
789 
790 		context = alloc_pgtable_page(iommu->node);
791 		if (!context)
792 			return NULL;
793 
794 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 		phy_addr = virt_to_phys((void *)context);
796 		*entry = phy_addr | 1;
797 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
798 	}
799 	return &context[devfn];
800 }
801 
802 static bool attach_deferred(struct device *dev)
803 {
804 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805 }
806 
807 /**
808  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809  *				 sub-hierarchy of a candidate PCI-PCI bridge
810  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811  * @bridge: the candidate PCI-PCI bridge
812  *
813  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814  */
815 static bool
816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817 {
818 	struct pci_dev *pdev, *pbridge;
819 
820 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 		return false;
822 
823 	pdev = to_pci_dev(dev);
824 	pbridge = to_pci_dev(bridge);
825 
826 	if (pbridge->subordinate &&
827 	    pbridge->subordinate->number <= pdev->bus->number &&
828 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 		return true;
830 
831 	return false;
832 }
833 
834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835 {
836 	struct dmar_drhd_unit *drhd;
837 	u32 vtbar;
838 	int rc;
839 
840 	/* We know that this device on this chipset has its own IOMMU.
841 	 * If we find it under a different IOMMU, then the BIOS is lying
842 	 * to us. Hope that the IOMMU for this device is actually
843 	 * disabled, and it needs no translation...
844 	 */
845 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 	if (rc) {
847 		/* "can't" happen */
848 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 		return false;
850 	}
851 	vtbar &= 0xffff0000;
852 
853 	/* we know that this iommu should be at offset 0xa000 from vtbar */
854 	drhd = dmar_find_matched_drhd_unit(pdev);
855 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 		return true;
859 	}
860 
861 	return false;
862 }
863 
864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865 {
866 	if (!iommu || iommu->drhd->ignored)
867 		return true;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pdev = to_pci_dev(dev);
871 
872 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 		    quirk_ioat_snb_local_iommu(pdev))
875 			return true;
876 	}
877 
878 	return false;
879 }
880 
881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882 {
883 	struct dmar_drhd_unit *drhd = NULL;
884 	struct pci_dev *pdev = NULL;
885 	struct intel_iommu *iommu;
886 	struct device *tmp;
887 	u16 segment = 0;
888 	int i;
889 
890 	if (!dev)
891 		return NULL;
892 
893 	if (dev_is_pci(dev)) {
894 		struct pci_dev *pf_pdev;
895 
896 		pdev = pci_real_dma_dev(to_pci_dev(dev));
897 
898 		/* VFs aren't listed in scope tables; we need to look up
899 		 * the PF instead to find the IOMMU. */
900 		pf_pdev = pci_physfn(pdev);
901 		dev = &pf_pdev->dev;
902 		segment = pci_domain_nr(pdev->bus);
903 	} else if (has_acpi_companion(dev))
904 		dev = &ACPI_COMPANION(dev)->dev;
905 
906 	rcu_read_lock();
907 	for_each_iommu(iommu, drhd) {
908 		if (pdev && segment != drhd->segment)
909 			continue;
910 
911 		for_each_active_dev_scope(drhd->devices,
912 					  drhd->devices_cnt, i, tmp) {
913 			if (tmp == dev) {
914 				/* For a VF use its original BDF# not that of the PF
915 				 * which we used for the IOMMU lookup. Strictly speaking
916 				 * we could do this for all PCI devices; we only need to
917 				 * get the BDF# from the scope table for ACPI matches. */
918 				if (pdev && pdev->is_virtfn)
919 					goto got_pdev;
920 
921 				if (bus && devfn) {
922 					*bus = drhd->devices[i].bus;
923 					*devfn = drhd->devices[i].devfn;
924 				}
925 				goto out;
926 			}
927 
928 			if (is_downstream_to_pci_bridge(dev, tmp))
929 				goto got_pdev;
930 		}
931 
932 		if (pdev && drhd->include_all) {
933 		got_pdev:
934 			if (bus && devfn) {
935 				*bus = pdev->bus->number;
936 				*devfn = pdev->devfn;
937 			}
938 			goto out;
939 		}
940 	}
941 	iommu = NULL;
942  out:
943 	if (iommu_is_dummy(iommu, dev))
944 		iommu = NULL;
945 
946 	rcu_read_unlock();
947 
948 	return iommu;
949 }
950 
951 static void domain_flush_cache(struct dmar_domain *domain,
952 			       void *addr, int size)
953 {
954 	if (!domain->iommu_coherency)
955 		clflush_cache_range(addr, size);
956 }
957 
958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959 {
960 	struct context_entry *context;
961 	int ret = 0;
962 	unsigned long flags;
963 
964 	spin_lock_irqsave(&iommu->lock, flags);
965 	context = iommu_context_addr(iommu, bus, devfn, 0);
966 	if (context)
967 		ret = context_present(context);
968 	spin_unlock_irqrestore(&iommu->lock, flags);
969 	return ret;
970 }
971 
972 static void free_context_table(struct intel_iommu *iommu)
973 {
974 	int i;
975 	unsigned long flags;
976 	struct context_entry *context;
977 
978 	spin_lock_irqsave(&iommu->lock, flags);
979 	if (!iommu->root_entry) {
980 		goto out;
981 	}
982 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 		context = iommu_context_addr(iommu, i, 0, 0);
984 		if (context)
985 			free_pgtable_page(context);
986 
987 		if (!sm_supported(iommu))
988 			continue;
989 
990 		context = iommu_context_addr(iommu, i, 0x80, 0);
991 		if (context)
992 			free_pgtable_page(context);
993 
994 	}
995 	free_pgtable_page(iommu->root_entry);
996 	iommu->root_entry = NULL;
997 out:
998 	spin_unlock_irqrestore(&iommu->lock, flags);
999 }
1000 
1001 #ifdef CONFIG_DMAR_DEBUG
1002 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
1003 {
1004 	struct device_domain_info *info;
1005 	struct dma_pte *parent, *pte;
1006 	struct dmar_domain *domain;
1007 	int offset, level;
1008 
1009 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
1010 	if (!info || !info->domain) {
1011 		pr_info("device [%02x:%02x.%d] not probed\n",
1012 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1013 		return;
1014 	}
1015 
1016 	domain = info->domain;
1017 	level = agaw_to_level(domain->agaw);
1018 	parent = domain->pgd;
1019 	if (!parent) {
1020 		pr_info("no page table setup\n");
1021 		return;
1022 	}
1023 
1024 	while (1) {
1025 		offset = pfn_level_offset(pfn, level);
1026 		pte = &parent[offset];
1027 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
1028 			pr_info("PTE not present at level %d\n", level);
1029 			break;
1030 		}
1031 
1032 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
1033 
1034 		if (level == 1)
1035 			break;
1036 
1037 		parent = phys_to_virt(dma_pte_addr(pte));
1038 		level--;
1039 	}
1040 }
1041 
1042 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
1043 			  unsigned long long addr, u32 pasid)
1044 {
1045 	struct pasid_dir_entry *dir, *pde;
1046 	struct pasid_entry *entries, *pte;
1047 	struct context_entry *ctx_entry;
1048 	struct root_entry *rt_entry;
1049 	u8 devfn = source_id & 0xff;
1050 	u8 bus = source_id >> 8;
1051 	int i, dir_index, index;
1052 
1053 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
1054 
1055 	/* root entry dump */
1056 	rt_entry = &iommu->root_entry[bus];
1057 	if (!rt_entry) {
1058 		pr_info("root table entry is not present\n");
1059 		return;
1060 	}
1061 
1062 	if (sm_supported(iommu))
1063 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
1064 			rt_entry->hi, rt_entry->lo);
1065 	else
1066 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
1067 
1068 	/* context entry dump */
1069 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
1070 	if (!ctx_entry) {
1071 		pr_info("context table entry is not present\n");
1072 		return;
1073 	}
1074 
1075 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
1076 		ctx_entry->hi, ctx_entry->lo);
1077 
1078 	/* legacy mode does not require PASID entries */
1079 	if (!sm_supported(iommu))
1080 		goto pgtable_walk;
1081 
1082 	/* get the pointer to pasid directory entry */
1083 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
1084 	if (!dir) {
1085 		pr_info("pasid directory entry is not present\n");
1086 		return;
1087 	}
1088 	/* For request-without-pasid, get the pasid from context entry */
1089 	if (intel_iommu_sm && pasid == INVALID_IOASID)
1090 		pasid = PASID_RID2PASID;
1091 
1092 	dir_index = pasid >> PASID_PDE_SHIFT;
1093 	pde = &dir[dir_index];
1094 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
1095 
1096 	/* get the pointer to the pasid table entry */
1097 	entries = get_pasid_table_from_pde(pde);
1098 	if (!entries) {
1099 		pr_info("pasid table entry is not present\n");
1100 		return;
1101 	}
1102 	index = pasid & PASID_PTE_MASK;
1103 	pte = &entries[index];
1104 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1105 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1106 
1107 pgtable_walk:
1108 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1109 }
1110 #endif
1111 
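/*
 * Return the page-table entry for @pfn at *target_level, allocating any
 * missing intermediate tables on the way down. If *target_level is 0 the
 * walk stops at the first superpage or non-present entry without
 * allocating, and the level actually reached is passed back through
 * *target_level.
 */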
1112 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1113 				      unsigned long pfn, int *target_level)
1114 {
1115 	struct dma_pte *parent, *pte;
1116 	int level = agaw_to_level(domain->agaw);
1117 	int offset;
1118 
1119 	BUG_ON(!domain->pgd);
1120 
1121 	if (!domain_pfn_supported(domain, pfn))
1122 		/* Address beyond IOMMU's addressing capabilities. */
1123 		return NULL;
1124 
1125 	parent = domain->pgd;
1126 
1127 	while (1) {
1128 		void *tmp_page;
1129 
1130 		offset = pfn_level_offset(pfn, level);
1131 		pte = &parent[offset];
1132 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1133 			break;
1134 		if (level == *target_level)
1135 			break;
1136 
1137 		if (!dma_pte_present(pte)) {
1138 			uint64_t pteval;
1139 
1140 			tmp_page = alloc_pgtable_page(domain->nid);
1141 
1142 			if (!tmp_page)
1143 				return NULL;
1144 
1145 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1146 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1147 			if (domain_use_first_level(domain)) {
1148 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1149 				if (iommu_is_dma_domain(&domain->domain))
1150 					pteval |= DMA_FL_PTE_ACCESS;
1151 			}
1152 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1153 				/* Someone else set it while we were thinking; use theirs. */
1154 				free_pgtable_page(tmp_page);
1155 			else
1156 				domain_flush_cache(domain, pte, sizeof(*pte));
1157 		}
1158 		if (level == 1)
1159 			break;
1160 
1161 		parent = phys_to_virt(dma_pte_addr(pte));
1162 		level--;
1163 	}
1164 
1165 	if (!*target_level)
1166 		*target_level = level;
1167 
1168 	return pte;
1169 }
1170 
1171 /* return the address's pte at a specific level */
1172 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1173 					 unsigned long pfn,
1174 					 int level, int *large_page)
1175 {
1176 	struct dma_pte *parent, *pte;
1177 	int total = agaw_to_level(domain->agaw);
1178 	int offset;
1179 
1180 	parent = domain->pgd;
1181 	while (level <= total) {
1182 		offset = pfn_level_offset(pfn, total);
1183 		pte = &parent[offset];
1184 		if (level == total)
1185 			return pte;
1186 
1187 		if (!dma_pte_present(pte)) {
1188 			*large_page = total;
1189 			break;
1190 		}
1191 
1192 		if (dma_pte_superpage(pte)) {
1193 			*large_page = total;
1194 			return pte;
1195 		}
1196 
1197 		parent = phys_to_virt(dma_pte_addr(pte));
1198 		total--;
1199 	}
1200 	return NULL;
1201 }
1202 
1203 /* clear last level ptes; a tlb flush should follow */
1204 static void dma_pte_clear_range(struct dmar_domain *domain,
1205 				unsigned long start_pfn,
1206 				unsigned long last_pfn)
1207 {
1208 	unsigned int large_page;
1209 	struct dma_pte *first_pte, *pte;
1210 
1211 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1212 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1213 	BUG_ON(start_pfn > last_pfn);
1214 
1215 	/* we don't need lock here; nobody else touches the iova range */
1216 	do {
1217 		large_page = 1;
1218 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1219 		if (!pte) {
1220 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1221 			continue;
1222 		}
1223 		do {
1224 			dma_clear_pte(pte);
1225 			start_pfn += lvl_to_nr_pages(large_page);
1226 			pte++;
1227 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1228 
1229 		domain_flush_cache(domain, first_pte,
1230 				   (void *)pte - (void *)first_pte);
1231 
1232 	} while (start_pfn && start_pfn <= last_pfn);
1233 }
1234 
1235 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1236 			       int retain_level, struct dma_pte *pte,
1237 			       unsigned long pfn, unsigned long start_pfn,
1238 			       unsigned long last_pfn)
1239 {
1240 	pfn = max(start_pfn, pfn);
1241 	pte = &pte[pfn_level_offset(pfn, level)];
1242 
1243 	do {
1244 		unsigned long level_pfn;
1245 		struct dma_pte *level_pte;
1246 
1247 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1248 			goto next;
1249 
1250 		level_pfn = pfn & level_mask(level);
1251 		level_pte = phys_to_virt(dma_pte_addr(pte));
1252 
1253 		if (level > 2) {
1254 			dma_pte_free_level(domain, level - 1, retain_level,
1255 					   level_pte, level_pfn, start_pfn,
1256 					   last_pfn);
1257 		}
1258 
1259 		/*
1260 		 * Free the page table if we're below the level we want to
1261 		 * retain and the range covers the entire table.
1262 		 */
1263 		if (level < retain_level && !(start_pfn > level_pfn ||
1264 		      last_pfn < level_pfn + level_size(level) - 1)) {
1265 			dma_clear_pte(pte);
1266 			domain_flush_cache(domain, pte, sizeof(*pte));
1267 			free_pgtable_page(level_pte);
1268 		}
1269 next:
1270 		pfn += level_size(level);
1271 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1272 }
1273 
1274 /*
1275  * clear last level (leaf) ptes and free page table pages below the
1276  * level we wish to keep intact.
1277  */
1278 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1279 				   unsigned long start_pfn,
1280 				   unsigned long last_pfn,
1281 				   int retain_level)
1282 {
1283 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1284 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1285 	BUG_ON(start_pfn > last_pfn);
1286 
1287 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1288 
1289 	/* We don't need lock here; nobody else touches the iova range */
1290 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1291 			   domain->pgd, 0, start_pfn, last_pfn);
1292 
1293 	/* free pgd */
1294 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1295 		free_pgtable_page(domain->pgd);
1296 		domain->pgd = NULL;
1297 	}
1298 }
1299 
1300 /* When a page at a given level is being unlinked from its parent, we don't
1301    need to *modify* it at all. All we need to do is make a list of all the
1302    pages which can be freed just as soon as we've flushed the IOTLB and we
1303    know the hardware page-walk will no longer touch them.
1304    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1305    be freed. */
1306 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1307 					    int level, struct dma_pte *pte,
1308 					    struct page *freelist)
1309 {
1310 	struct page *pg;
1311 
1312 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1313 	pg->freelist = freelist;
1314 	freelist = pg;
1315 
1316 	if (level == 1)
1317 		return freelist;
1318 
1319 	pte = page_address(pg);
1320 	do {
1321 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1322 			freelist = dma_pte_list_pagetables(domain, level - 1,
1323 							   pte, freelist);
1324 		pte++;
1325 	} while (!first_pte_in_page(pte));
1326 
1327 	return freelist;
1328 }
1329 
1330 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1331 					struct dma_pte *pte, unsigned long pfn,
1332 					unsigned long start_pfn,
1333 					unsigned long last_pfn,
1334 					struct page *freelist)
1335 {
1336 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1337 
1338 	pfn = max(start_pfn, pfn);
1339 	pte = &pte[pfn_level_offset(pfn, level)];
1340 
1341 	do {
1342 		unsigned long level_pfn = pfn & level_mask(level);
1343 
1344 		if (!dma_pte_present(pte))
1345 			goto next;
1346 
1347 		/* If range covers entire pagetable, free it */
1348 		if (start_pfn <= level_pfn &&
1349 		    last_pfn >= level_pfn + level_size(level) - 1) {
1350 			/* These subordinate page tables are going away entirely. Don't
1351 			   bother to clear them; we're just going to *free* them. */
1352 			if (level > 1 && !dma_pte_superpage(pte))
1353 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1354 
1355 			dma_clear_pte(pte);
1356 			if (!first_pte)
1357 				first_pte = pte;
1358 			last_pte = pte;
1359 		} else if (level > 1) {
1360 			/* Recurse down into a level that isn't *entirely* obsolete */
1361 			freelist = dma_pte_clear_level(domain, level - 1,
1362 						       phys_to_virt(dma_pte_addr(pte)),
1363 						       level_pfn, start_pfn, last_pfn,
1364 						       freelist);
1365 		}
1366 next:
1367 		pfn = level_pfn + level_size(level);
1368 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1369 
1370 	if (first_pte)
1371 		domain_flush_cache(domain, first_pte,
1372 				   (void *)++last_pte - (void *)first_pte);
1373 
1374 	return freelist;
1375 }
1376 
1377 /* We can't just free the pages because the IOMMU may still be walking
1378    the page tables, and may have cached the intermediate levels. The
1379    pages can only be freed after the IOTLB flush has been done. */
1380 static struct page *domain_unmap(struct dmar_domain *domain,
1381 				 unsigned long start_pfn,
1382 				 unsigned long last_pfn,
1383 				 struct page *freelist)
1384 {
1385 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1386 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1387 	BUG_ON(start_pfn > last_pfn);
1388 
1389 	/* we don't need lock here; nobody else touches the iova range */
1390 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1391 				       domain->pgd, 0, start_pfn, last_pfn,
1392 				       freelist);
1393 
1394 	/* free pgd */
1395 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1396 		struct page *pgd_page = virt_to_page(domain->pgd);
1397 		pgd_page->freelist = freelist;
1398 		freelist = pgd_page;
1399 
1400 		domain->pgd = NULL;
1401 	}
1402 
1403 	return freelist;
1404 }
1405 
1406 static void dma_free_pagelist(struct page *freelist)
1407 {
1408 	struct page *pg;
1409 
1410 	while ((pg = freelist)) {
1411 		freelist = pg->freelist;
1412 		free_pgtable_page(page_address(pg));
1413 	}
1414 }
1415 
1416 /* iommu handling */
1417 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1418 {
1419 	struct root_entry *root;
1420 	unsigned long flags;
1421 
1422 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1423 	if (!root) {
1424 		pr_err("Allocating root entry for %s failed\n",
1425 			iommu->name);
1426 		return -ENOMEM;
1427 	}
1428 
1429 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1430 
1431 	spin_lock_irqsave(&iommu->lock, flags);
1432 	iommu->root_entry = root;
1433 	spin_unlock_irqrestore(&iommu->lock, flags);
1434 
1435 	return 0;
1436 }
1437 
1438 static void iommu_set_root_entry(struct intel_iommu *iommu)
1439 {
1440 	u64 addr;
1441 	u32 sts;
1442 	unsigned long flag;
1443 
1444 	addr = virt_to_phys(iommu->root_entry);
1445 	if (sm_supported(iommu))
1446 		addr |= DMA_RTADDR_SMT;
1447 
1448 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1449 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1450 
1451 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1452 
1453 	/* Make sure the hardware completes it */
1454 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1455 		      readl, (sts & DMA_GSTS_RTPS), sts);
1456 
1457 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1458 
1459 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1460 	if (sm_supported(iommu))
1461 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1462 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1463 }
1464 
1465 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1466 {
1467 	u32 val;
1468 	unsigned long flag;
1469 
1470 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1471 		return;
1472 
1473 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1474 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1475 
1476 	/* Make sure the hardware completes it */
1477 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1478 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1479 
1480 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1481 }
1482 
1483 /* return value determines if we need a write buffer flush */
1484 static void __iommu_flush_context(struct intel_iommu *iommu,
1485 				  u16 did, u16 source_id, u8 function_mask,
1486 				  u64 type)
1487 {
1488 	u64 val = 0;
1489 	unsigned long flag;
1490 
1491 	switch (type) {
1492 	case DMA_CCMD_GLOBAL_INVL:
1493 		val = DMA_CCMD_GLOBAL_INVL;
1494 		break;
1495 	case DMA_CCMD_DOMAIN_INVL:
1496 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1497 		break;
1498 	case DMA_CCMD_DEVICE_INVL:
1499 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1500 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1501 		break;
1502 	default:
1503 		BUG();
1504 	}
1505 	val |= DMA_CCMD_ICC;
1506 
1507 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1508 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1509 
1510 	/* Make sure the hardware completes it */
1511 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1512 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1513 
1514 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1515 }
1516 
1517 /* return value determines if we need a write buffer flush */
1518 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1519 				u64 addr, unsigned int size_order, u64 type)
1520 {
1521 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1522 	u64 val = 0, val_iva = 0;
1523 	unsigned long flag;
1524 
1525 	switch (type) {
1526 	case DMA_TLB_GLOBAL_FLUSH:
1527 		/* global flush doesn't need to set IVA_REG */
1528 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1529 		break;
1530 	case DMA_TLB_DSI_FLUSH:
1531 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1532 		break;
1533 	case DMA_TLB_PSI_FLUSH:
1534 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1535 		/* IH bit is passed in as part of address */
1536 		val_iva = size_order | addr;
1537 		break;
1538 	default:
1539 		BUG();
1540 	}
1541 	/* Note: set drain read/write */
1542 #if 0
1543 	/*
1544 	 * This is probably meant to be extra safe. It looks like we can
1545 	 * ignore it without any impact.
1546 	 */
1547 	if (cap_read_drain(iommu->cap))
1548 		val |= DMA_TLB_READ_DRAIN;
1549 #endif
1550 	if (cap_write_drain(iommu->cap))
1551 		val |= DMA_TLB_WRITE_DRAIN;
1552 
1553 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1554 	/* Note: Only uses first TLB reg currently */
1555 	if (val_iva)
1556 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1557 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1558 
1559 	/* Make sure the hardware completes it */
1560 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1561 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1562 
1563 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1564 
1565 	/* check IOTLB invalidation granularity */
1566 	if (DMA_TLB_IAIG(val) == 0)
1567 		pr_err("Flush IOTLB failed\n");
1568 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1569 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1570 			(unsigned long long)DMA_TLB_IIRG(type),
1571 			(unsigned long long)DMA_TLB_IAIG(val));
1572 }
1573 
1574 static struct device_domain_info *
1575 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1576 			 u8 bus, u8 devfn)
1577 {
1578 	struct device_domain_info *info;
1579 
1580 	assert_spin_locked(&device_domain_lock);
1581 
1582 	if (!iommu->qi)
1583 		return NULL;
1584 
1585 	list_for_each_entry(info, &domain->devices, link)
1586 		if (info->iommu == iommu && info->bus == bus &&
1587 		    info->devfn == devfn) {
1588 			if (info->ats_supported && info->dev)
1589 				return info;
1590 			break;
1591 		}
1592 
1593 	return NULL;
1594 }
1595 
1596 static void domain_update_iotlb(struct dmar_domain *domain)
1597 {
1598 	struct device_domain_info *info;
1599 	bool has_iotlb_device = false;
1600 
1601 	assert_spin_locked(&device_domain_lock);
1602 
1603 	list_for_each_entry(info, &domain->devices, link)
1604 		if (info->ats_enabled) {
1605 			has_iotlb_device = true;
1606 			break;
1607 		}
1608 
1609 	if (!has_iotlb_device) {
1610 		struct subdev_domain_info *sinfo;
1611 
1612 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1613 			info = get_domain_info(sinfo->pdev);
1614 			if (info && info->ats_enabled) {
1615 				has_iotlb_device = true;
1616 				break;
1617 			}
1618 		}
1619 	}
1620 
1621 	domain->has_iotlb_device = has_iotlb_device;
1622 }
1623 
1624 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1625 {
1626 	struct pci_dev *pdev;
1627 
1628 	assert_spin_locked(&device_domain_lock);
1629 
1630 	if (!info || !dev_is_pci(info->dev))
1631 		return;
1632 
1633 	pdev = to_pci_dev(info->dev);
1634 	/* For IOMMUs that support device IOTLB throttling (DIT), we assign
1635 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1636 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1637 	 * reserved, which should be set to 0.
1638 	 */
1639 	if (!ecap_dit(info->iommu->ecap))
1640 		info->pfsid = 0;
1641 	else {
1642 		struct pci_dev *pf_pdev;
1643 
1644 		/* pdev will be returned if device is not a vf */
1645 		pf_pdev = pci_physfn(pdev);
1646 		info->pfsid = pci_dev_id(pf_pdev);
1647 	}
1648 
1649 #ifdef CONFIG_INTEL_IOMMU_SVM
1650 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1651 	   the device if you enable PASID support after ATS support is
1652 	   undefined. So always enable PASID support on devices which
1653 	   have it, even if we can't yet know if we're ever going to
1654 	   use it. */
1655 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1656 		info->pasid_enabled = 1;
1657 
1658 	if (info->pri_supported &&
1659 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1660 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1661 		info->pri_enabled = 1;
1662 #endif
1663 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1664 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1665 		info->ats_enabled = 1;
1666 		domain_update_iotlb(info->domain);
1667 		info->ats_qdep = pci_ats_queue_depth(pdev);
1668 	}
1669 }
1670 
1671 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1672 {
1673 	struct pci_dev *pdev;
1674 
1675 	assert_spin_locked(&device_domain_lock);
1676 
1677 	if (!dev_is_pci(info->dev))
1678 		return;
1679 
1680 	pdev = to_pci_dev(info->dev);
1681 
1682 	if (info->ats_enabled) {
1683 		pci_disable_ats(pdev);
1684 		info->ats_enabled = 0;
1685 		domain_update_iotlb(info->domain);
1686 	}
1687 #ifdef CONFIG_INTEL_IOMMU_SVM
1688 	if (info->pri_enabled) {
1689 		pci_disable_pri(pdev);
1690 		info->pri_enabled = 0;
1691 	}
1692 	if (info->pasid_enabled) {
1693 		pci_disable_pasid(pdev);
1694 		info->pasid_enabled = 0;
1695 	}
1696 #endif
1697 }
1698 
1699 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1700 				    u64 addr, unsigned int mask)
1701 {
1702 	u16 sid, qdep;
1703 
1704 	if (!info || !info->ats_enabled)
1705 		return;
1706 
1707 	sid = info->bus << 8 | info->devfn;
1708 	qdep = info->ats_qdep;
1709 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1710 			   qdep, addr, mask);
1711 }
1712 
1713 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1714 				  u64 addr, unsigned mask)
1715 {
1716 	unsigned long flags;
1717 	struct device_domain_info *info;
1718 	struct subdev_domain_info *sinfo;
1719 
1720 	if (!domain->has_iotlb_device)
1721 		return;
1722 
1723 	spin_lock_irqsave(&device_domain_lock, flags);
1724 	list_for_each_entry(info, &domain->devices, link)
1725 		__iommu_flush_dev_iotlb(info, addr, mask);
1726 
1727 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1728 		info = get_domain_info(sinfo->pdev);
1729 		__iommu_flush_dev_iotlb(info, addr, mask);
1730 	}
1731 	spin_unlock_irqrestore(&device_domain_lock, flags);
1732 }
1733 
1734 static void domain_flush_piotlb(struct intel_iommu *iommu,
1735 				struct dmar_domain *domain,
1736 				u64 addr, unsigned long npages, bool ih)
1737 {
1738 	u16 did = domain->iommu_did[iommu->seq_id];
1739 
1740 	if (domain->default_pasid)
1741 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1742 				addr, npages, ih);
1743 
1744 	if (!list_empty(&domain->devices))
1745 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1746 }
1747 
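/*
 * For page-selective invalidation the address-mask order below is
 * ilog2(roundup_pow_of_two(pages)); e.g. pages == 3 gives mask == 2,
 * i.e. a naturally aligned 4-page invalidation.
 */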
1748 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1749 				  struct dmar_domain *domain,
1750 				  unsigned long pfn, unsigned int pages,
1751 				  int ih, int map)
1752 {
1753 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1754 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1755 	u16 did = domain->iommu_did[iommu->seq_id];
1756 
1757 	BUG_ON(pages == 0);
1758 
1759 	if (ih)
1760 		ih = 1 << 6;
1761 
1762 	if (domain_use_first_level(domain)) {
1763 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1764 	} else {
1765 		/*
1766 		 * Fall back to domain-selective flush if there is no PSI
1767 		 * support or the size is too big. PSI requires a power-of-two
1768 		 * number of pages, with the base address aligned to that size.
1769 		 */
1770 		if (!cap_pgsel_inv(iommu->cap) ||
1771 		    mask > cap_max_amask_val(iommu->cap))
1772 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1773 							DMA_TLB_DSI_FLUSH);
1774 		else
1775 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1776 							DMA_TLB_PSI_FLUSH);
1777 	}
1778 
1779 	/*
1780 	 * In caching mode, changes of pages from non-present to present require
1781 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1782 	 */
1783 	if (!cap_caching_mode(iommu->cap) || !map)
1784 		iommu_flush_dev_iotlb(domain, addr, mask);
1785 }
1786 
1787 /* Notification for newly created mappings */
1788 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1789 					struct dmar_domain *domain,
1790 					unsigned long pfn, unsigned int pages)
1791 {
1792 	/*
1793 	 * It's a non-present to present mapping. Only flush if we are in
1794 	 * caching mode and using second-level translation.
1795 	 */
1796 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1797 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1798 	else
1799 		iommu_flush_write_buffer(iommu);
1800 }
1801 
1802 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1803 {
1804 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1805 	int idx;
1806 
1807 	for_each_domain_iommu(idx, dmar_domain) {
1808 		struct intel_iommu *iommu = g_iommus[idx];
1809 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1810 
1811 		if (domain_use_first_level(dmar_domain))
1812 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1813 		else
1814 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1815 						 DMA_TLB_DSI_FLUSH);
1816 
1817 		if (!cap_caching_mode(iommu->cap))
1818 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1819 					      0, MAX_AGAW_PFN_WIDTH);
1820 	}
1821 }
1822 
1823 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1824 {
1825 	u32 pmen;
1826 	unsigned long flags;
1827 
1828 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1829 		return;
1830 
1831 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1832 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1833 	pmen &= ~DMA_PMEN_EPM;
1834 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1835 
1836 	/* wait for the protected region status bit to clear */
1837 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1838 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1839 
1840 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1841 }
1842 
1843 static void iommu_enable_translation(struct intel_iommu *iommu)
1844 {
1845 	u32 sts;
1846 	unsigned long flags;
1847 
1848 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1849 	iommu->gcmd |= DMA_GCMD_TE;
1850 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1851 
1852 	/* Make sure the hardware completes it */
1853 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1854 		      readl, (sts & DMA_GSTS_TES), sts);
1855 
1856 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1857 }
1858 
1859 static void iommu_disable_translation(struct intel_iommu *iommu)
1860 {
1861 	u32 sts;
1862 	unsigned long flag;
1863 
1864 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1865 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1866 		return;
1867 
1868 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1869 	iommu->gcmd &= ~DMA_GCMD_TE;
1870 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1871 
1872 	/* Make sure the hardware completes it */
1873 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1874 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1875 
1876 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1877 }
1878 
1879 static int iommu_init_domains(struct intel_iommu *iommu)
1880 {
1881 	u32 ndomains, nlongs;
1882 	size_t size;
1883 
1884 	ndomains = cap_ndoms(iommu->cap);
1885 	pr_debug("%s: Number of Domains supported <%d>\n",
1886 		 iommu->name, ndomains);
1887 	nlongs = BITS_TO_LONGS(ndomains);
1888 
1889 	spin_lock_init(&iommu->lock);
1890 
1891 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1892 	if (!iommu->domain_ids)
1893 		return -ENOMEM;
1894 
1895 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1896 	iommu->domains = kzalloc(size, GFP_KERNEL);
1897 
1898 	if (iommu->domains) {
1899 		size = 256 * sizeof(struct dmar_domain *);
1900 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1901 	}
1902 
1903 	if (!iommu->domains || !iommu->domains[0]) {
1904 		pr_err("%s: Allocating domain array failed\n",
1905 		       iommu->name);
1906 		kfree(iommu->domain_ids);
1907 		kfree(iommu->domains);
1908 		iommu->domain_ids = NULL;
1909 		iommu->domains    = NULL;
1910 		return -ENOMEM;
1911 	}
1912 
1913 	/*
1914 	 * If Caching mode is set, then invalid translations are tagged
1915 	 * with domain-id 0, hence we need to pre-allocate it. We also
1916 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1917 	 * make sure it is not used for a real domain.
1918 	 */
1919 	set_bit(0, iommu->domain_ids);
1920 
1921 	/*
1922 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1923 	 * entry for first-level or pass-through translation modes should
1924 	 * be programmed with a domain id different from those used for
1925 	 * second-level or nested translation. We reserve a domain id for
1926 	 * this purpose.
1927 	 */
1928 	if (sm_supported(iommu))
1929 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1930 
1931 	return 0;
1932 }
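
/*
 * Sizing sketch for the arrays above, taking cap_ndoms() = 65536 as an
 * example: the domain-id bitmap needs BITS_TO_LONGS(65536) = 1024 longs
 * on a 64-bit kernel, and the two-level pointer array gets
 * ALIGN(65536, 256) >> 8 = 256 top-level slots, each covering 256
 * domain pointers.  Only slot 0 is populated here; the others are
 * expected to be filled on demand as domain ids are handed out.
 */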
1933 
1934 static void disable_dmar_iommu(struct intel_iommu *iommu)
1935 {
1936 	struct device_domain_info *info, *tmp;
1937 	unsigned long flags;
1938 
1939 	if (!iommu->domains || !iommu->domain_ids)
1940 		return;
1941 
1942 	spin_lock_irqsave(&device_domain_lock, flags);
1943 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1944 		if (info->iommu != iommu)
1945 			continue;
1946 
1947 		if (!info->dev || !info->domain)
1948 			continue;
1949 
1950 		__dmar_remove_one_dev_info(info);
1951 	}
1952 	spin_unlock_irqrestore(&device_domain_lock, flags);
1953 
1954 	if (iommu->gcmd & DMA_GCMD_TE)
1955 		iommu_disable_translation(iommu);
1956 }
1957 
1958 static void free_dmar_iommu(struct intel_iommu *iommu)
1959 {
1960 	if ((iommu->domains) && (iommu->domain_ids)) {
1961 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1962 		int i;
1963 
1964 		for (i = 0; i < elems; i++)
1965 			kfree(iommu->domains[i]);
1966 		kfree(iommu->domains);
1967 		kfree(iommu->domain_ids);
1968 		iommu->domains = NULL;
1969 		iommu->domain_ids = NULL;
1970 	}
1971 
1972 	g_iommus[iommu->seq_id] = NULL;
1973 
1974 	/* free context mapping */
1975 	free_context_table(iommu);
1976 
1977 #ifdef CONFIG_INTEL_IOMMU_SVM
1978 	if (pasid_supported(iommu)) {
1979 		if (ecap_prs(iommu->ecap))
1980 			intel_svm_finish_prq(iommu);
1981 	}
1982 	if (vccap_pasid(iommu->vccap))
1983 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1984 
1985 #endif
1986 }
1987 
1988 /*
1989  * Check and return whether first level is used by default for
1990  * DMA translation.
1991  */
1992 static bool first_level_by_default(unsigned int type)
1993 {
1994 	/* Only SL is available in legacy mode */
1995 	if (!scalable_mode_support())
1996 		return false;
1997 
1998 	/* Only level (either FL or SL) is available, just use it */
1999 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
2000 		return intel_cap_flts_sanity();
2001 
2002 	/* Both levels are available, decide it based on domain type */
2003 	return type != IOMMU_DOMAIN_UNMANAGED;
2004 }
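
/*
 * A quick truth table for the choice above (illustrative only):
 *
 *   flts_sanity  slts_sanity  chosen level
 *   -----------  -----------  ---------------------------------------
 *        1            0       first level (only FL is available)
 *        0            1       second level (only SL is available)
 *        1            1       FL unless the domain type is
 *                             IOMMU_DOMAIN_UNMANAGED, which gets SL
 */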
2005 
2006 static struct dmar_domain *alloc_domain(unsigned int type)
2007 {
2008 	struct dmar_domain *domain;
2009 
2010 	domain = alloc_domain_mem();
2011 	if (!domain)
2012 		return NULL;
2013 
2014 	memset(domain, 0, sizeof(*domain));
2015 	domain->nid = NUMA_NO_NODE;
2016 	if (first_level_by_default(type))
2017 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
2018 	domain->has_iotlb_device = false;
2019 	INIT_LIST_HEAD(&domain->devices);
2020 	INIT_LIST_HEAD(&domain->subdevices);
2021 
2022 	return domain;
2023 }
2024 
2025 /* Must be called with iommu->lock */
2026 static int domain_attach_iommu(struct dmar_domain *domain,
2027 			       struct intel_iommu *iommu)
2028 {
2029 	unsigned long ndomains;
2030 	int num;
2031 
2032 	assert_spin_locked(&device_domain_lock);
2033 	assert_spin_locked(&iommu->lock);
2034 
2035 	domain->iommu_refcnt[iommu->seq_id] += 1;
2036 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
2037 		ndomains = cap_ndoms(iommu->cap);
2038 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
2039 
2040 		if (num >= ndomains) {
2041 			pr_err("%s: No free domain ids\n", iommu->name);
2042 			domain->iommu_refcnt[iommu->seq_id] -= 1;
2043 			return -ENOSPC;
2044 		}
2045 
2046 		set_bit(num, iommu->domain_ids);
2047 		set_iommu_domain(iommu, num, domain);
2048 
2049 		domain->iommu_did[iommu->seq_id] = num;
2050 		domain->nid			 = iommu->node;
2051 
2052 		domain_update_iommu_cap(domain);
2053 	}
2054 
2055 	return 0;
2056 }
2057 
2058 static void domain_detach_iommu(struct dmar_domain *domain,
2059 				struct intel_iommu *iommu)
2060 {
2061 	int num;
2062 
2063 	assert_spin_locked(&device_domain_lock);
2064 	assert_spin_locked(&iommu->lock);
2065 
2066 	domain->iommu_refcnt[iommu->seq_id] -= 1;
2067 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2068 		num = domain->iommu_did[iommu->seq_id];
2069 		clear_bit(num, iommu->domain_ids);
2070 		set_iommu_domain(iommu, num, NULL);
2071 
2072 		domain_update_iommu_cap(domain);
2073 		domain->iommu_did[iommu->seq_id] = 0;
2074 	}
2075 }
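
/*
 * Usage note (informal): the attach/detach pair above reference-counts
 * the domain per IOMMU.  For example, two devices behind the same IOMMU
 * attached to one domain raise iommu_refcnt[] to 2 and share the domain
 * id allocated on the first attach; that id is only returned to the
 * bitmap when the second device detaches and the count drops to zero.
 */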
2076 
2077 static inline int guestwidth_to_adjustwidth(int gaw)
2078 {
2079 	int agaw;
2080 	int r = (gaw - 12) % 9;
2081 
2082 	if (r == 0)
2083 		agaw = gaw;
2084 	else
2085 		agaw = gaw + 9 - r;
2086 	if (agaw > 64)
2087 		agaw = 64;
2088 	return agaw;
2089 }
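
/*
 * Worked examples for the rounding above (9-bit stride over a 12-bit
 * page offset): gaw = 48 gives r = (48 - 12) % 9 = 0, so agaw = 48;
 * gaw = 40 gives r = 1, so agaw = 40 + 9 - 1 = 48; gaw = 66 would give
 * 66 but is capped at 64.
 */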
2090 
2091 static void domain_exit(struct dmar_domain *domain)
2092 {
2094 	/* Remove associated devices and clear attached or cached domains */
2095 	domain_remove_dev_info(domain);
2096 
2097 	if (domain->pgd) {
2098 		struct page *freelist;
2099 
2100 		freelist = domain_unmap(domain, 0,
2101 					DOMAIN_MAX_PFN(domain->gaw), NULL);
2102 		dma_free_pagelist(freelist);
2103 	}
2104 
2105 	free_domain_mem(domain);
2106 }
2107 
2108 /*
2109  * Get the PASID directory size for a scalable mode context entry.
2110  * A value of X in the PDTS field of a scalable mode context entry
2111  * indicates a PASID directory with 2^(X + 7) entries.
2112  */
2113 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2114 {
2115 	int pds, max_pde;
2116 
2117 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2118 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2119 	if (pds < 7)
2120 		return 0;
2121 
2122 	return pds - 7;
2123 }
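
/*
 * Worked example, assuming PASID_PDE_SHIFT is 6 (i.e. 64 PASID-table
 * entries per directory entry): for max_pasid = 0x10000, max_pde is
 * 0x10000 >> 6 = 0x400, whose only set bit is bit 10, so pds is
 * 10 - 7 = 3 and the PDTS field describes a directory of
 * 2^(3 + 7) = 1024 entries, exactly the 1024 PDEs required.
 */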
2124 
2125 /*
2126  * Set the RID_PASID field of a scalable mode context entry. The
2127  * IOMMU hardware will use the PASID value set in this field when
2128  * translating DMA requests that carry no PASID.
2129  */
2130 static inline void
2131 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2132 {
2133 	context->hi |= pasid & ((1 << 20) - 1);
2134 }
2135 
2136 /*
2137  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2138  * entry.
2139  */
2140 static inline void context_set_sm_dte(struct context_entry *context)
2141 {
2142 	context->lo |= (1 << 2);
2143 }
2144 
2145 /*
2146  * Set the PRE(Page Request Enable) field of a scalable mode context
2147  * entry.
2148  */
2149 static inline void context_set_sm_pre(struct context_entry *context)
2150 {
2151 	context->lo |= (1 << 4);
2152 }
2153 
2154 /* Convert value to context PASID directory size field coding. */
2155 #define context_pdts(pds)	(((pds) & 0x7) << 9)
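
/*
 * Informal layout of the scalable-mode context entry bits touched
 * below: bit 2 is DTE (Device-TLB enable), bit 4 is PRE (page request
 * enable), bits 11:9 hold PDTS and bits 63:12 hold the page-aligned
 * PASID directory pointer; RID_PASID sits in the low 20 bits of the
 * high qword.
 */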
2156 
2157 static int domain_context_mapping_one(struct dmar_domain *domain,
2158 				      struct intel_iommu *iommu,
2159 				      struct pasid_table *table,
2160 				      u8 bus, u8 devfn)
2161 {
2162 	u16 did = domain->iommu_did[iommu->seq_id];
2163 	int translation = CONTEXT_TT_MULTI_LEVEL;
2164 	struct device_domain_info *info = NULL;
2165 	struct context_entry *context;
2166 	unsigned long flags;
2167 	int ret;
2168 
2169 	WARN_ON(did == 0);
2170 
2171 	if (hw_pass_through && domain_type_is_si(domain))
2172 		translation = CONTEXT_TT_PASS_THROUGH;
2173 
2174 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2175 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2176 
2177 	BUG_ON(!domain->pgd);
2178 
2179 	spin_lock_irqsave(&device_domain_lock, flags);
2180 	spin_lock(&iommu->lock);
2181 
2182 	ret = -ENOMEM;
2183 	context = iommu_context_addr(iommu, bus, devfn, 1);
2184 	if (!context)
2185 		goto out_unlock;
2186 
2187 	ret = 0;
2188 	if (context_present(context))
2189 		goto out_unlock;
2190 
2191 	/*
2192 	 * For kdump cases, old valid entries may be cached due to the
2193 	 * in-flight DMA and copied pgtable, but there is no unmapping
2194 	 * behaviour for them, thus we need an explicit cache flush for
2195 	 * the newly-mapped device. For kdump, at this point, the device
2196 	 * is supposed to have finished its reset at driver probe time, so no
2197 	 * in-flight DMA will exist and we don't need to worry about it
2198 	 * hereafter.
2199 	 */
2200 	if (context_copied(context)) {
2201 		u16 did_old = context_domain_id(context);
2202 
2203 		if (did_old < cap_ndoms(iommu->cap)) {
2204 			iommu->flush.flush_context(iommu, did_old,
2205 						   (((u16)bus) << 8) | devfn,
2206 						   DMA_CCMD_MASK_NOBIT,
2207 						   DMA_CCMD_DEVICE_INVL);
2208 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2209 						 DMA_TLB_DSI_FLUSH);
2210 		}
2211 	}
2212 
2213 	context_clear_entry(context);
2214 
2215 	if (sm_supported(iommu)) {
2216 		unsigned long pds;
2217 
2218 		WARN_ON(!table);
2219 
2220 		/* Setup the PASID DIR pointer: */
2221 		pds = context_get_sm_pds(table);
2222 		context->lo = (u64)virt_to_phys(table->table) |
2223 				context_pdts(pds);
2224 
2225 		/* Setup the RID_PASID field: */
2226 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2227 
2228 		/*
2229 		 * Set up the Device-TLB Enable bit and Page Request
2230 		 * Enable bit:
2231 		 */
2232 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2233 		if (info && info->ats_supported)
2234 			context_set_sm_dte(context);
2235 		if (info && info->pri_supported)
2236 			context_set_sm_pre(context);
2237 	} else {
2238 		struct dma_pte *pgd = domain->pgd;
2239 		int agaw;
2240 
2241 		context_set_domain_id(context, did);
2242 
2243 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2244 			/*
2245 			 * Skip top levels of page tables for iommu which has
2246 			 * Skip top levels of page tables for an IOMMU whose
2247 			 * agaw is smaller than the domain's. Unnecessary for PT mode.
2248 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2249 				ret = -ENOMEM;
2250 				pgd = phys_to_virt(dma_pte_addr(pgd));
2251 				if (!dma_pte_present(pgd))
2252 					goto out_unlock;
2253 			}
2254 
2255 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2256 			if (info && info->ats_supported)
2257 				translation = CONTEXT_TT_DEV_IOTLB;
2258 			else
2259 				translation = CONTEXT_TT_MULTI_LEVEL;
2260 
2261 			context_set_address_root(context, virt_to_phys(pgd));
2262 			context_set_address_width(context, agaw);
2263 		} else {
2264 			/*
2265 			 * In pass through mode, AW must be programmed to
2266 			 * indicate the largest AGAW value supported by
2267 			 * hardware. And ASR is ignored by hardware.
2268 			 */
2269 			context_set_address_width(context, iommu->msagaw);
2270 		}
2271 
2272 		context_set_translation_type(context, translation);
2273 	}
2274 
2275 	context_set_fault_enable(context);
2276 	context_set_present(context);
2277 	if (!ecap_coherent(iommu->ecap))
2278 		clflush_cache_range(context, sizeof(*context));
2279 
2280 	/*
2281 	 * It's a non-present to present mapping. If hardware doesn't cache
2282 	 * non-present entries, we only need to flush the write-buffer. If it
2283 	 * _does_ cache non-present entries, then it does so in the special
2284 	 * domain #0, which we have to flush:
2285 	 */
2286 	if (cap_caching_mode(iommu->cap)) {
2287 		iommu->flush.flush_context(iommu, 0,
2288 					   (((u16)bus) << 8) | devfn,
2289 					   DMA_CCMD_MASK_NOBIT,
2290 					   DMA_CCMD_DEVICE_INVL);
2291 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2292 	} else {
2293 		iommu_flush_write_buffer(iommu);
2294 	}
2295 	iommu_enable_dev_iotlb(info);
2296 
2297 	ret = 0;
2298 
2299 out_unlock:
2300 	spin_unlock(&iommu->lock);
2301 	spin_unlock_irqrestore(&device_domain_lock, flags);
2302 
2303 	return ret;
2304 }
2305 
2306 struct domain_context_mapping_data {
2307 	struct dmar_domain *domain;
2308 	struct intel_iommu *iommu;
2309 	struct pasid_table *table;
2310 };
2311 
2312 static int domain_context_mapping_cb(struct pci_dev *pdev,
2313 				     u16 alias, void *opaque)
2314 {
2315 	struct domain_context_mapping_data *data = opaque;
2316 
2317 	return domain_context_mapping_one(data->domain, data->iommu,
2318 					  data->table, PCI_BUS_NUM(alias),
2319 					  alias & 0xff);
2320 }
2321 
2322 static int
2323 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2324 {
2325 	struct domain_context_mapping_data data;
2326 	struct pasid_table *table;
2327 	struct intel_iommu *iommu;
2328 	u8 bus, devfn;
2329 
2330 	iommu = device_to_iommu(dev, &bus, &devfn);
2331 	if (!iommu)
2332 		return -ENODEV;
2333 
2334 	table = intel_pasid_get_table(dev);
2335 
2336 	if (!dev_is_pci(dev))
2337 		return domain_context_mapping_one(domain, iommu, table,
2338 						  bus, devfn);
2339 
2340 	data.domain = domain;
2341 	data.iommu = iommu;
2342 	data.table = table;
2343 
2344 	return pci_for_each_dma_alias(to_pci_dev(dev),
2345 				      &domain_context_mapping_cb, &data);
2346 }
2347 
2348 static int domain_context_mapped_cb(struct pci_dev *pdev,
2349 				    u16 alias, void *opaque)
2350 {
2351 	struct intel_iommu *iommu = opaque;
2352 
2353 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2354 }
2355 
2356 static int domain_context_mapped(struct device *dev)
2357 {
2358 	struct intel_iommu *iommu;
2359 	u8 bus, devfn;
2360 
2361 	iommu = device_to_iommu(dev, &bus, &devfn);
2362 	if (!iommu)
2363 		return -ENODEV;
2364 
2365 	if (!dev_is_pci(dev))
2366 		return device_context_mapped(iommu, bus, devfn);
2367 
2368 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2369 				       domain_context_mapped_cb, iommu);
2370 }
2371 
2372 /* Returns a number of VTD pages, but aligned to MM page size */
2373 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2374 					    size_t size)
2375 {
2376 	host_addr &= ~PAGE_MASK;
2377 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2378 }
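
/*
 * Example, assuming PAGE_SHIFT and VTD_PAGE_SHIFT are both 12:
 * host_addr = 0x1234 and size = 0x2000 leave an in-page offset of
 * 0x234, PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so the result is 3 VT-d
 * pages.  With a larger MM page size (e.g. 64KiB) the count is still
 * in 4KiB VT-d pages but rounded up to a whole MM page.
 */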
2379 
2380 /* Return largest possible superpage level for a given mapping */
2381 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2382 					  unsigned long iov_pfn,
2383 					  unsigned long phy_pfn,
2384 					  unsigned long pages)
2385 {
2386 	int support, level = 1;
2387 	unsigned long pfnmerge;
2388 
2389 	support = domain->iommu_superpage;
2390 
2391 	/* To use a large page, the virtual *and* physical addresses
2392 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2393 	   of them will mean we have to use smaller pages. So just
2394 	   merge them and check both at once. */
2395 	pfnmerge = iov_pfn | phy_pfn;
2396 
2397 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2398 		pages >>= VTD_STRIDE_SHIFT;
2399 		if (!pages)
2400 			break;
2401 		pfnmerge >>= VTD_STRIDE_SHIFT;
2402 		level++;
2403 		support--;
2404 	}
2405 	return level;
2406 }
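
/*
 * Example of the walk above: with domain->iommu_superpage = 2 (2MiB
 * and 1GiB supported), IOVA and physical PFNs whose low 9 bits are
 * clear, and at least 512 pages to map, the loop promotes level to 2
 * and a 2MiB superpage can be used; a misaligned PFN or fewer than
 * 512 pages leaves it at level 1 (4KiB).
 */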
2407 
2408 /*
2409  * Ensure that old small page tables are removed to make room for superpage(s).
2410  * We're going to add new large pages, so make sure we don't remove their parent
2411  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2412  */
2413 static void switch_to_super_page(struct dmar_domain *domain,
2414 				 unsigned long start_pfn,
2415 				 unsigned long end_pfn, int level)
2416 {
2417 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2418 	struct dma_pte *pte = NULL;
2419 	int i;
2420 
2421 	while (start_pfn <= end_pfn) {
2422 		if (!pte)
2423 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2424 
2425 		if (dma_pte_present(pte)) {
2426 			dma_pte_free_pagetable(domain, start_pfn,
2427 					       start_pfn + lvl_pages - 1,
2428 					       level + 1);
2429 
2430 			for_each_domain_iommu(i, domain)
2431 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2432 						      start_pfn, lvl_pages,
2433 						      0, 0);
2434 		}
2435 
2436 		pte++;
2437 		start_pfn += lvl_pages;
2438 		if (first_pte_in_page(pte))
2439 			pte = NULL;
2440 	}
2441 }
2442 
2443 static int
2444 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2445 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2446 {
2447 	struct dma_pte *first_pte = NULL, *pte = NULL;
2448 	unsigned int largepage_lvl = 0;
2449 	unsigned long lvl_pages = 0;
2450 	phys_addr_t pteval;
2451 	u64 attr;
2452 
2453 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2454 
2455 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2456 		return -EINVAL;
2457 
2458 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2459 	attr |= DMA_FL_PTE_PRESENT;
2460 	if (domain_use_first_level(domain)) {
2461 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2462 		if (prot & DMA_PTE_WRITE)
2463 			attr |= DMA_FL_PTE_DIRTY;
2464 	}
2465 
2466 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2467 
2468 	while (nr_pages > 0) {
2469 		uint64_t tmp;
2470 
2471 		if (!pte) {
2472 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2473 					phys_pfn, nr_pages);
2474 
2475 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2476 			if (!pte)
2477 				return -ENOMEM;
2478 			first_pte = pte;
2479 
2480 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2481 
2482 			/* It is a large page */
2483 			if (largepage_lvl > 1) {
2484 				unsigned long end_pfn;
2485 				unsigned long pages_to_remove;
2486 
2487 				pteval |= DMA_PTE_LARGE_PAGE;
2488 				pages_to_remove = min_t(unsigned long, nr_pages,
2489 							nr_pte_to_next_page(pte) * lvl_pages);
2490 				end_pfn = iov_pfn + pages_to_remove - 1;
2491 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2492 			} else {
2493 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2494 			}
2495 
2496 		}
2497 		/* We don't need a lock here; nobody else
2498 		 * touches this IOVA range.
2499 		 */
2500 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2501 		if (tmp) {
2502 			static int dumps = 5;
2503 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2504 				iov_pfn, tmp, (unsigned long long)pteval);
2505 			if (dumps) {
2506 				dumps--;
2507 				debug_dma_dump_mappings(NULL);
2508 			}
2509 			WARN_ON(1);
2510 		}
2511 
2512 		nr_pages -= lvl_pages;
2513 		iov_pfn += lvl_pages;
2514 		phys_pfn += lvl_pages;
2515 		pteval += lvl_pages * VTD_PAGE_SIZE;
2516 
2517 		/* If the next PTE would be the first in a new page, then we
2518 		 * need to flush the cache on the entries we've just written.
2519 		 * And then we'll need to recalculate 'pte', so clear it and
2520 		 * let it get set again in the if (!pte) block above.
2521 		 *
2522 		 * If we're done (!nr_pages) we need to flush the cache too.
2523 		 *
2524 		 * Also if we've been setting superpages, we may need to
2525 		 * recalculate 'pte' and switch back to smaller pages for the
2526 		 * end of the mapping, if the trailing size is not enough to
2527 		 * use another superpage (i.e. nr_pages < lvl_pages).
2528 		 */
2529 		pte++;
2530 		if (!nr_pages || first_pte_in_page(pte) ||
2531 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2532 			domain_flush_cache(domain, first_pte,
2533 					   (void *)pte - (void *)first_pte);
2534 			pte = NULL;
2535 		}
2536 	}
2537 
2538 	return 0;
2539 }
2540 
2541 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2542 {
2543 	struct intel_iommu *iommu = info->iommu;
2544 	struct context_entry *context;
2545 	unsigned long flags;
2546 	u16 did_old;
2547 
2548 	if (!iommu)
2549 		return;
2550 
2551 	spin_lock_irqsave(&iommu->lock, flags);
2552 	context = iommu_context_addr(iommu, bus, devfn, 0);
2553 	if (!context) {
2554 		spin_unlock_irqrestore(&iommu->lock, flags);
2555 		return;
2556 	}
2557 
2558 	if (sm_supported(iommu)) {
2559 		if (hw_pass_through && domain_type_is_si(info->domain))
2560 			did_old = FLPT_DEFAULT_DID;
2561 		else
2562 			did_old = info->domain->iommu_did[iommu->seq_id];
2563 	} else {
2564 		did_old = context_domain_id(context);
2565 	}
2566 
2567 	context_clear_entry(context);
2568 	__iommu_flush_cache(iommu, context, sizeof(*context));
2569 	spin_unlock_irqrestore(&iommu->lock, flags);
2570 	iommu->flush.flush_context(iommu,
2571 				   did_old,
2572 				   (((u16)bus) << 8) | devfn,
2573 				   DMA_CCMD_MASK_NOBIT,
2574 				   DMA_CCMD_DEVICE_INVL);
2575 
2576 	if (sm_supported(iommu))
2577 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2578 
2579 	iommu->flush.flush_iotlb(iommu,
2580 				 did_old,
2581 				 0,
2582 				 0,
2583 				 DMA_TLB_DSI_FLUSH);
2584 
2585 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2586 }
2587 
2588 static inline void unlink_domain_info(struct device_domain_info *info)
2589 {
2590 	assert_spin_locked(&device_domain_lock);
2591 	list_del(&info->link);
2592 	list_del(&info->global);
2593 	if (info->dev)
2594 		dev_iommu_priv_set(info->dev, NULL);
2595 }
2596 
2597 static void domain_remove_dev_info(struct dmar_domain *domain)
2598 {
2599 	struct device_domain_info *info, *tmp;
2600 	unsigned long flags;
2601 
2602 	spin_lock_irqsave(&device_domain_lock, flags);
2603 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2604 		__dmar_remove_one_dev_info(info);
2605 	spin_unlock_irqrestore(&device_domain_lock, flags);
2606 }
2607 
2608 struct dmar_domain *find_domain(struct device *dev)
2609 {
2610 	struct device_domain_info *info;
2611 
2612 	if (unlikely(!dev || !dev->iommu))
2613 		return NULL;
2614 
2615 	if (unlikely(attach_deferred(dev)))
2616 		return NULL;
2617 
2618 	/* No lock here; we assume the domain is not freed in the normal case */
2619 	info = get_domain_info(dev);
2620 	if (likely(info))
2621 		return info->domain;
2622 
2623 	return NULL;
2624 }
2625 
2626 static inline struct device_domain_info *
2627 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2628 {
2629 	struct device_domain_info *info;
2630 
2631 	list_for_each_entry(info, &device_domain_list, global)
2632 		if (info->segment == segment && info->bus == bus &&
2633 		    info->devfn == devfn)
2634 			return info;
2635 
2636 	return NULL;
2637 }
2638 
2639 static int domain_setup_first_level(struct intel_iommu *iommu,
2640 				    struct dmar_domain *domain,
2641 				    struct device *dev,
2642 				    u32 pasid)
2643 {
2644 	struct dma_pte *pgd = domain->pgd;
2645 	int agaw, level;
2646 	int flags = 0;
2647 
2648 	/*
2649 	 * Skip top levels of page tables for an IOMMU whose agaw is
2650 	 * smaller than the domain's. Unnecessary for PT mode.
2651 	 */
2652 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2653 		pgd = phys_to_virt(dma_pte_addr(pgd));
2654 		if (!dma_pte_present(pgd))
2655 			return -ENOMEM;
2656 	}
2657 
2658 	level = agaw_to_level(agaw);
2659 	if (level != 4 && level != 5)
2660 		return -EINVAL;
2661 
2662 	if (pasid != PASID_RID2PASID)
2663 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2664 	if (level == 5)
2665 		flags |= PASID_FLAG_FL5LP;
2666 
2667 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2668 		flags |= PASID_FLAG_PAGE_SNOOP;
2669 
2670 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2671 					     domain->iommu_did[iommu->seq_id],
2672 					     flags);
2673 }
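
/*
 * For illustration: a domain with agaw 2 (48-bit address width) maps
 * to level 4 here and uses the 4-level first-level format, while
 * agaw 3 (57-bit) maps to level 5 and additionally sets
 * PASID_FLAG_FL5LP; any other level is rejected with -EINVAL.
 */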
2674 
2675 static bool dev_is_real_dma_subdevice(struct device *dev)
2676 {
2677 	return dev && dev_is_pci(dev) &&
2678 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2679 }
2680 
2681 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2682 						    int bus, int devfn,
2683 						    struct device *dev,
2684 						    struct dmar_domain *domain)
2685 {
2686 	struct dmar_domain *found = NULL;
2687 	struct device_domain_info *info;
2688 	unsigned long flags;
2689 	int ret;
2690 
2691 	info = alloc_devinfo_mem();
2692 	if (!info)
2693 		return NULL;
2694 
2695 	if (!dev_is_real_dma_subdevice(dev)) {
2696 		info->bus = bus;
2697 		info->devfn = devfn;
2698 		info->segment = iommu->segment;
2699 	} else {
2700 		struct pci_dev *pdev = to_pci_dev(dev);
2701 
2702 		info->bus = pdev->bus->number;
2703 		info->devfn = pdev->devfn;
2704 		info->segment = pci_domain_nr(pdev->bus);
2705 	}
2706 
2707 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2708 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2709 	info->ats_qdep = 0;
2710 	info->dev = dev;
2711 	info->domain = domain;
2712 	info->iommu = iommu;
2713 	info->pasid_table = NULL;
2714 	info->auxd_enabled = 0;
2715 	INIT_LIST_HEAD(&info->subdevices);
2716 
2717 	if (dev && dev_is_pci(dev)) {
2718 		struct pci_dev *pdev = to_pci_dev(info->dev);
2719 
2720 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2721 		    pci_ats_supported(pdev) &&
2722 		    dmar_find_matched_atsr_unit(pdev))
2723 			info->ats_supported = 1;
2724 
2725 		if (sm_supported(iommu)) {
2726 			if (pasid_supported(iommu)) {
2727 				int features = pci_pasid_features(pdev);
2728 				if (features >= 0)
2729 					info->pasid_supported = features | 1;
2730 			}
2731 
2732 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2733 			    pci_pri_supported(pdev))
2734 				info->pri_supported = 1;
2735 		}
2736 	}
2737 
2738 	spin_lock_irqsave(&device_domain_lock, flags);
2739 	if (dev)
2740 		found = find_domain(dev);
2741 
2742 	if (!found) {
2743 		struct device_domain_info *info2;
2744 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2745 						       info->devfn);
2746 		if (info2) {
2747 			found      = info2->domain;
2748 			info2->dev = dev;
2749 		}
2750 	}
2751 
2752 	if (found) {
2753 		spin_unlock_irqrestore(&device_domain_lock, flags);
2754 		free_devinfo_mem(info);
2755 		/* Caller must free the original domain */
2756 		return found;
2757 	}
2758 
2759 	spin_lock(&iommu->lock);
2760 	ret = domain_attach_iommu(domain, iommu);
2761 	spin_unlock(&iommu->lock);
2762 
2763 	if (ret) {
2764 		spin_unlock_irqrestore(&device_domain_lock, flags);
2765 		free_devinfo_mem(info);
2766 		return NULL;
2767 	}
2768 
2769 	list_add(&info->link, &domain->devices);
2770 	list_add(&info->global, &device_domain_list);
2771 	if (dev)
2772 		dev_iommu_priv_set(dev, info);
2773 	spin_unlock_irqrestore(&device_domain_lock, flags);
2774 
2775 	/* PASID table is mandatory for a PCI device in scalable mode. */
2776 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2777 		ret = intel_pasid_alloc_table(dev);
2778 		if (ret) {
2779 			dev_err(dev, "PASID table allocation failed\n");
2780 			dmar_remove_one_dev_info(dev);
2781 			return NULL;
2782 		}
2783 
2784 		/* Setup the PASID entry for requests without PASID: */
2785 		spin_lock_irqsave(&iommu->lock, flags);
2786 		if (hw_pass_through && domain_type_is_si(domain))
2787 			ret = intel_pasid_setup_pass_through(iommu, domain,
2788 					dev, PASID_RID2PASID);
2789 		else if (domain_use_first_level(domain))
2790 			ret = domain_setup_first_level(iommu, domain, dev,
2791 					PASID_RID2PASID);
2792 		else
2793 			ret = intel_pasid_setup_second_level(iommu, domain,
2794 					dev, PASID_RID2PASID);
2795 		spin_unlock_irqrestore(&iommu->lock, flags);
2796 		if (ret) {
2797 			dev_err(dev, "Setup RID2PASID failed\n");
2798 			dmar_remove_one_dev_info(dev);
2799 			return NULL;
2800 		}
2801 	}
2802 
2803 	if (dev && domain_context_mapping(domain, dev)) {
2804 		dev_err(dev, "Domain context map failed\n");
2805 		dmar_remove_one_dev_info(dev);
2806 		return NULL;
2807 	}
2808 
2809 	return domain;
2810 }
2811 
2812 static int iommu_domain_identity_map(struct dmar_domain *domain,
2813 				     unsigned long first_vpfn,
2814 				     unsigned long last_vpfn)
2815 {
2816 	/*
2817 	 * The RMRR range might overlap a physical memory range,
2818 	 * so clear it first.
2819 	 */
2820 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2821 
2822 	return __domain_mapping(domain, first_vpfn,
2823 				first_vpfn, last_vpfn - first_vpfn + 1,
2824 				DMA_PTE_READ|DMA_PTE_WRITE);
2825 }
2826 
2827 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2828 
2829 static int __init si_domain_init(int hw)
2830 {
2831 	struct dmar_rmrr_unit *rmrr;
2832 	struct device *dev;
2833 	int i, nid, ret;
2834 
2835 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2836 	if (!si_domain)
2837 		return -EFAULT;
2838 
2839 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2840 		domain_exit(si_domain);
2841 		return -EFAULT;
2842 	}
2843 
2844 	if (hw)
2845 		return 0;
2846 
2847 	for_each_online_node(nid) {
2848 		unsigned long start_pfn, end_pfn;
2849 		int i;
2850 
2851 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2852 			ret = iommu_domain_identity_map(si_domain,
2853 					mm_to_dma_pfn(start_pfn),
2854 					mm_to_dma_pfn(end_pfn));
2855 			if (ret)
2856 				return ret;
2857 		}
2858 	}
2859 
2860 	/*
2861 	 * Identity map the RMRRs so that devices with RMRRs can also use
2862 	 * the si_domain.
2863 	 */
2864 	for_each_rmrr_units(rmrr) {
2865 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2866 					  i, dev) {
2867 			unsigned long long start = rmrr->base_address;
2868 			unsigned long long end = rmrr->end_address;
2869 
2870 			if (WARN_ON(end < start ||
2871 				    end >> agaw_to_width(si_domain->agaw)))
2872 				continue;
2873 
2874 			ret = iommu_domain_identity_map(si_domain,
2875 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2876 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2877 			if (ret)
2878 				return ret;
2879 		}
2880 	}
2881 
2882 	return 0;
2883 }
2884 
2885 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2886 {
2887 	struct dmar_domain *ndomain;
2888 	struct intel_iommu *iommu;
2889 	u8 bus, devfn;
2890 
2891 	iommu = device_to_iommu(dev, &bus, &devfn);
2892 	if (!iommu)
2893 		return -ENODEV;
2894 
2895 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2896 	if (ndomain != domain)
2897 		return -EBUSY;
2898 
2899 	return 0;
2900 }
2901 
2902 static bool device_has_rmrr(struct device *dev)
2903 {
2904 	struct dmar_rmrr_unit *rmrr;
2905 	struct device *tmp;
2906 	int i;
2907 
2908 	rcu_read_lock();
2909 	for_each_rmrr_units(rmrr) {
2910 		/*
2911 		 * Return TRUE if this RMRR contains the device that
2912 		 * is passed in.
2913 		 */
2914 		for_each_active_dev_scope(rmrr->devices,
2915 					  rmrr->devices_cnt, i, tmp)
2916 			if (tmp == dev ||
2917 			    is_downstream_to_pci_bridge(dev, tmp)) {
2918 				rcu_read_unlock();
2919 				return true;
2920 			}
2921 	}
2922 	rcu_read_unlock();
2923 	return false;
2924 }
2925 
2926 /**
2927  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2928  * is relaxable (ie. is allowed to be not enforced under some conditions)
2929  * @dev: device handle
2930  *
2931  * We assume that PCI USB devices with RMRRs have them largely
2932  * for historical reasons and that the RMRR space is not actively used post
2933  * boot.  This exclusion may change if vendors begin to abuse it.
2934  *
2935  * The same exception is made for graphics devices, with the requirement that
2936  * any use of the RMRR regions will be torn down before assigning the device
2937  * to a guest.
2938  *
2939  * Return: true if the RMRR is relaxable, false otherwise
2940  */
2941 static bool device_rmrr_is_relaxable(struct device *dev)
2942 {
2943 	struct pci_dev *pdev;
2944 
2945 	if (!dev_is_pci(dev))
2946 		return false;
2947 
2948 	pdev = to_pci_dev(dev);
2949 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2950 		return true;
2951 	else
2952 		return false;
2953 }
2954 
2955 /*
2956  * There are a couple cases where we need to restrict the functionality of
2957  * devices associated with RMRRs.  The first is when evaluating a device for
2958  * identity mapping because problems exist when devices are moved in and out
2959  * of domains and their respective RMRR information is lost.  This means that
2960  * a device with associated RMRRs will never be in a "passthrough" domain.
2961  * The second is use of the device through the IOMMU API.  This interface
2962  * expects to have full control of the IOVA space for the device.  We cannot
2963  * satisfy both the requirement that RMRR access is maintained and have an
2964  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2965  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2966  * We therefore prevent devices associated with an RMRR from participating in
2967  * the IOMMU API, which eliminates them from device assignment.
2968  *
2969  * In both cases, devices which have relaxable RMRRs are not concerned by this
2970  * restriction. See device_rmrr_is_relaxable comment.
2971  */
2972 static bool device_is_rmrr_locked(struct device *dev)
2973 {
2974 	if (!device_has_rmrr(dev))
2975 		return false;
2976 
2977 	if (device_rmrr_is_relaxable(dev))
2978 		return false;
2979 
2980 	return true;
2981 }
2982 
2983 /*
2984  * Return the required default domain type for a specific device.
2985  *
2986  * @dev: the device in query
2988  *
2989  * Returns:
2990  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2991  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2992  *  - 0: both identity and dynamic domains work for this device
2993  */
2994 static int device_def_domain_type(struct device *dev)
2995 {
2996 	if (dev_is_pci(dev)) {
2997 		struct pci_dev *pdev = to_pci_dev(dev);
2998 
2999 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3000 			return IOMMU_DOMAIN_IDENTITY;
3001 
3002 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3003 			return IOMMU_DOMAIN_IDENTITY;
3004 	}
3005 
3006 	return 0;
3007 }
3008 
3009 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3010 {
3011 	/*
3012 	 * Start from a sane IOMMU hardware state.
3013 	 * If queued invalidation was already initialized by us
3014 	 * (for example, while enabling interrupt remapping), then
3015 	 * things are already rolling from a sane state.
3016 	 */
3017 	if (!iommu->qi) {
3018 		/*
3019 		 * Clear any previous faults.
3020 		 */
3021 		dmar_fault(-1, iommu);
3022 		/*
3023 		 * Disable queued invalidation if supported and already enabled
3024 		 * before OS handover.
3025 		 */
3026 		dmar_disable_qi(iommu);
3027 	}
3028 
3029 	if (dmar_enable_qi(iommu)) {
3030 		/*
3031 		 * Queued Invalidate not enabled, use Register Based Invalidate
3032 		 */
3033 		iommu->flush.flush_context = __iommu_flush_context;
3034 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3035 		pr_info("%s: Using Register based invalidation\n",
3036 			iommu->name);
3037 	} else {
3038 		iommu->flush.flush_context = qi_flush_context;
3039 		iommu->flush.flush_iotlb = qi_flush_iotlb;
3040 		pr_info("%s: Using Queued invalidation\n", iommu->name);
3041 	}
3042 }
3043 
3044 static int copy_context_table(struct intel_iommu *iommu,
3045 			      struct root_entry *old_re,
3046 			      struct context_entry **tbl,
3047 			      int bus, bool ext)
3048 {
3049 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3050 	struct context_entry *new_ce = NULL, ce;
3051 	struct context_entry *old_ce = NULL;
3052 	struct root_entry re;
3053 	phys_addr_t old_ce_phys;
3054 
3055 	tbl_idx = ext ? bus * 2 : bus;
3056 	memcpy(&re, old_re, sizeof(re));
3057 
3058 	for (devfn = 0; devfn < 256; devfn++) {
3059 		/* First calculate the correct index */
3060 		idx = (ext ? devfn * 2 : devfn) % 256;
3061 
3062 		if (idx == 0) {
3063 			/* First save what we may have and clean up */
3064 			if (new_ce) {
3065 				tbl[tbl_idx] = new_ce;
3066 				__iommu_flush_cache(iommu, new_ce,
3067 						    VTD_PAGE_SIZE);
3068 				pos = 1;
3069 			}
3070 
3071 			if (old_ce)
3072 				memunmap(old_ce);
3073 
3074 			ret = 0;
3075 			if (devfn < 0x80)
3076 				old_ce_phys = root_entry_lctp(&re);
3077 			else
3078 				old_ce_phys = root_entry_uctp(&re);
3079 
3080 			if (!old_ce_phys) {
3081 				if (ext && devfn == 0) {
3082 					/* No LCTP, try UCTP */
3083 					devfn = 0x7f;
3084 					continue;
3085 				} else {
3086 					goto out;
3087 				}
3088 			}
3089 
3090 			ret = -ENOMEM;
3091 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3092 					MEMREMAP_WB);
3093 			if (!old_ce)
3094 				goto out;
3095 
3096 			new_ce = alloc_pgtable_page(iommu->node);
3097 			if (!new_ce)
3098 				goto out_unmap;
3099 
3100 			ret = 0;
3101 		}
3102 
3103 		/* Now copy the context entry */
3104 		memcpy(&ce, old_ce + idx, sizeof(ce));
3105 
3106 		if (!__context_present(&ce))
3107 			continue;
3108 
3109 		did = context_domain_id(&ce);
3110 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3111 			set_bit(did, iommu->domain_ids);
3112 
3113 		/*
3114 		 * We need a marker for copied context entries. This
3115 		 * marker needs to work for the old format as well as
3116 		 * for extended context entries.
3117 		 *
3118 		 * Bit 67 of the context entry is used. In the old
3119 		 * format this bit is available to software, in the
3120 		 * extended format it is the PGE bit, but PGE is ignored
3121 		 * by HW if PASIDs are disabled (and thus still
3122 		 * available).
3123 		 *
3124 		 * So disable PASIDs first and then mark the entry
3125 		 * copied. This means that we don't copy PASID
3126 		 * translations from the old kernel, but this is fine as
3127 		 * faults there are not fatal.
3128 		 */
3129 		context_clear_pasid_enable(&ce);
3130 		context_set_copied(&ce);
3131 
3132 		new_ce[idx] = ce;
3133 	}
3134 
3135 	tbl[tbl_idx + pos] = new_ce;
3136 
3137 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3138 
3139 out_unmap:
3140 	memunmap(old_ce);
3141 
3142 out:
3143 	return ret;
3144 }
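
/*
 * Rough sketch of the layout handled above: in extended (ECS) mode a
 * context entry is twice the legacy size, so only 128 fit in one 4KiB
 * table and idx = (devfn * 2) % 256 wraps at devfn 0x80.  Devfns
 * 0x00-0x7f are reached through root_entry_lctp() and land in
 * tbl[bus * 2]; devfns 0x80-0xff come from root_entry_uctp() and land
 * in tbl[bus * 2 + 1].  In legacy mode a single 256-entry table per
 * bus is copied into tbl[bus].
 */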
3145 
3146 static int copy_translation_tables(struct intel_iommu *iommu)
3147 {
3148 	struct context_entry **ctxt_tbls;
3149 	struct root_entry *old_rt;
3150 	phys_addr_t old_rt_phys;
3151 	int ctxt_table_entries;
3152 	unsigned long flags;
3153 	u64 rtaddr_reg;
3154 	int bus, ret;
3155 	bool new_ext, ext;
3156 
3157 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3158 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3159 	new_ext    = !!ecap_ecs(iommu->ecap);
3160 
3161 	/*
3162 	 * The RTT bit can only be changed when translation is disabled,
3163 	 * but disabling translation means opening a window for data
3164 	 * corruption. So bail out and don't copy anything if we would
3165 	 * have to change the bit.
3166 	 */
3167 	if (new_ext != ext)
3168 		return -EINVAL;
3169 
3170 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3171 	if (!old_rt_phys)
3172 		return -EINVAL;
3173 
3174 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3175 	if (!old_rt)
3176 		return -ENOMEM;
3177 
3178 	/* This is too big for the stack - allocate it from slab */
3179 	ctxt_table_entries = ext ? 512 : 256;
3180 	ret = -ENOMEM;
3181 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3182 	if (!ctxt_tbls)
3183 		goto out_unmap;
3184 
3185 	for (bus = 0; bus < 256; bus++) {
3186 		ret = copy_context_table(iommu, &old_rt[bus],
3187 					 ctxt_tbls, bus, ext);
3188 		if (ret) {
3189 			pr_err("%s: Failed to copy context table for bus %d\n",
3190 				iommu->name, bus);
3191 			continue;
3192 		}
3193 	}
3194 
3195 	spin_lock_irqsave(&iommu->lock, flags);
3196 
3197 	/* Context tables are copied, now write them to the root_entry table */
3198 	for (bus = 0; bus < 256; bus++) {
3199 		int idx = ext ? bus * 2 : bus;
3200 		u64 val;
3201 
3202 		if (ctxt_tbls[idx]) {
3203 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3204 			iommu->root_entry[bus].lo = val;
3205 		}
3206 
3207 		if (!ext || !ctxt_tbls[idx + 1])
3208 			continue;
3209 
3210 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3211 		iommu->root_entry[bus].hi = val;
3212 	}
3213 
3214 	spin_unlock_irqrestore(&iommu->lock, flags);
3215 
3216 	kfree(ctxt_tbls);
3217 
3218 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3219 
3220 	ret = 0;
3221 
3222 out_unmap:
3223 	memunmap(old_rt);
3224 
3225 	return ret;
3226 }
3227 
3228 #ifdef CONFIG_INTEL_IOMMU_SVM
3229 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3230 {
3231 	struct intel_iommu *iommu = data;
3232 	ioasid_t ioasid;
3233 
3234 	if (!iommu)
3235 		return INVALID_IOASID;
3236 	/*
3237 	 * The VT-d virtual command interface always uses the full 20-bit
3238 	 * PASID range. The host can partition the guest PASID range based
3239 	 * on policies, but this is out of the guest's control.
3240 	 */
3241 	if (min < PASID_MIN || max > intel_pasid_max_id)
3242 		return INVALID_IOASID;
3243 
3244 	if (vcmd_alloc_pasid(iommu, &ioasid))
3245 		return INVALID_IOASID;
3246 
3247 	return ioasid;
3248 }
3249 
3250 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3251 {
3252 	struct intel_iommu *iommu = data;
3253 
3254 	if (!iommu)
3255 		return;
3256 	/*
3257 	 * Sanity checking of the ioasid owner is done at the upper layer,
3258 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3259 	 */
3260 	if (ioasid_find(NULL, ioasid, NULL)) {
3261 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3262 		return;
3263 	}
3264 	vcmd_free_pasid(iommu, ioasid);
3265 }
3266 
3267 static void register_pasid_allocator(struct intel_iommu *iommu)
3268 {
3269 	/*
3270 	 * If we are running in the host, there is no need for a custom
3271 	 * allocator, as PASIDs are allocated system-wide by the host.
3272 	 */
3273 	if (!cap_caching_mode(iommu->cap))
3274 		return;
3275 
3276 	if (!sm_supported(iommu)) {
3277 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3278 		return;
3279 	}
3280 
3281 	/*
3282 	 * Register a custom PASID allocator if we are running in a guest,
3283 	 * where guest PASIDs must be obtained via the virtual command interface.
3284 	 * There can be multiple vIOMMUs in each guest but only one allocator
3285 	 * is active. All vIOMMU allocators will eventually be calling the same
3286 	 * host allocator.
3287 	 */
3288 	if (!vccap_pasid(iommu->vccap))
3289 		return;
3290 
3291 	pr_info("Register custom PASID allocator\n");
3292 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3293 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3294 	iommu->pasid_allocator.pdata = (void *)iommu;
3295 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3296 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3297 		/*
3298 		 * Disable scalable mode on this IOMMU if there
3299 		 * is no custom allocator. Mixing SM-capable and
3300 		 * non-SM vIOMMUs is not supported.
3301 		 */
3302 		intel_iommu_sm = 0;
3303 	}
3304 }
3305 #endif
3306 
3307 static int __init init_dmars(void)
3308 {
3309 	struct dmar_drhd_unit *drhd;
3310 	struct intel_iommu *iommu;
3311 	int ret;
3312 
3313 	/*
3314 	 * for each drhd
3315 	 *    allocate root
3316 	 *    initialize and program root entry to not present
3317 	 * endfor
3318 	 */
3319 	for_each_drhd_unit(drhd) {
3320 		/*
3321 		 * Lock not needed, as this is only incremented in the
3322 		 * single-threaded kernel __init code path; all other
3323 		 * accesses are read-only.
3324 		 */
3325 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3326 			g_num_of_iommus++;
3327 			continue;
3328 		}
3329 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3330 	}
3331 
3332 	/* Preallocate enough resources for IOMMU hot-addition */
3333 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3334 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3335 
3336 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3337 			GFP_KERNEL);
3338 	if (!g_iommus) {
3339 		ret = -ENOMEM;
3340 		goto error;
3341 	}
3342 
3343 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3344 	if (ret)
3345 		goto free_iommu;
3346 
3347 	for_each_iommu(iommu, drhd) {
3348 		if (drhd->ignored) {
3349 			iommu_disable_translation(iommu);
3350 			continue;
3351 		}
3352 
3353 		/*
3354 		 * Find the max PASID size of all IOMMUs in the system.
3355 		 * We need to ensure the system PASID table is no bigger
3356 		 * than the smallest supported value.
3357 		 */
3358 		if (pasid_supported(iommu)) {
3359 			u32 temp = 2 << ecap_pss(iommu->ecap);
3360 
3361 			intel_pasid_max_id = min_t(u32, temp,
3362 						   intel_pasid_max_id);
3363 		}
3364 
3365 		g_iommus[iommu->seq_id] = iommu;
3366 
3367 		intel_iommu_init_qi(iommu);
3368 
3369 		ret = iommu_init_domains(iommu);
3370 		if (ret)
3371 			goto free_iommu;
3372 
3373 		init_translation_status(iommu);
3374 
3375 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3376 			iommu_disable_translation(iommu);
3377 			clear_translation_pre_enabled(iommu);
3378 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3379 				iommu->name);
3380 		}
3381 
3382 		/*
3383 		 * TBD:
3384 		 * we could share the same root & context tables
3385 		 * among all IOMMUs. This needs to be split out later.
3386 		 */
3387 		ret = iommu_alloc_root_entry(iommu);
3388 		if (ret)
3389 			goto free_iommu;
3390 
3391 		if (translation_pre_enabled(iommu)) {
3392 			pr_info("Translation already enabled - trying to copy translation structures\n");
3393 
3394 			ret = copy_translation_tables(iommu);
3395 			if (ret) {
3396 				/*
3397 				 * We found the IOMMU with translation
3398 				 * enabled - but failed to copy over the
3399 				 * old root-entry table. Try to proceed
3400 				 * by disabling translation now and
3401 				 * allocating a clean root-entry table.
3402 				 * This might cause DMAR faults, but
3403 				 * probably the dump will still succeed.
3404 				 */
3405 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3406 				       iommu->name);
3407 				iommu_disable_translation(iommu);
3408 				clear_translation_pre_enabled(iommu);
3409 			} else {
3410 				pr_info("Copied translation tables from previous kernel for %s\n",
3411 					iommu->name);
3412 			}
3413 		}
3414 
3415 		if (!ecap_pass_through(iommu->ecap))
3416 			hw_pass_through = 0;
3417 		intel_svm_check(iommu);
3418 	}
3419 
3420 	/*
3421 	 * Now that qi is enabled on all iommus, set the root entry and flush
3422 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3423 	 * flush_context function will loop forever and the boot hangs.
3424 	 */
3425 	for_each_active_iommu(iommu, drhd) {
3426 		iommu_flush_write_buffer(iommu);
3427 #ifdef CONFIG_INTEL_IOMMU_SVM
3428 		register_pasid_allocator(iommu);
3429 #endif
3430 		iommu_set_root_entry(iommu);
3431 	}
3432 
3433 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3434 	dmar_map_gfx = 0;
3435 #endif
3436 
3437 	if (!dmar_map_gfx)
3438 		iommu_identity_mapping |= IDENTMAP_GFX;
3439 
3440 	check_tylersburg_isoch();
3441 
3442 	ret = si_domain_init(hw_pass_through);
3443 	if (ret)
3444 		goto free_iommu;
3445 
3446 	/*
3447 	 * for each drhd
3448 	 *   enable fault log
3449 	 *   global invalidate context cache
3450 	 *   global invalidate iotlb
3451 	 *   enable translation
3452 	 */
3453 	for_each_iommu(iommu, drhd) {
3454 		if (drhd->ignored) {
3455 			/*
3456 			 * we always have to disable PMRs or DMA may fail on
3457 			 * this device
3458 			 */
3459 			if (force_on)
3460 				iommu_disable_protect_mem_regions(iommu);
3461 			continue;
3462 		}
3463 
3464 		iommu_flush_write_buffer(iommu);
3465 
3466 #ifdef CONFIG_INTEL_IOMMU_SVM
3467 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3468 			/*
3469 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3470 			 * could cause a lock race condition.
3471 			 */
3472 			up_write(&dmar_global_lock);
3473 			ret = intel_svm_enable_prq(iommu);
3474 			down_write(&dmar_global_lock);
3475 			if (ret)
3476 				goto free_iommu;
3477 		}
3478 #endif
3479 		ret = dmar_set_interrupt(iommu);
3480 		if (ret)
3481 			goto free_iommu;
3482 	}
3483 
3484 	return 0;
3485 
3486 free_iommu:
3487 	for_each_active_iommu(iommu, drhd) {
3488 		disable_dmar_iommu(iommu);
3489 		free_dmar_iommu(iommu);
3490 	}
3491 
3492 	kfree(g_iommus);
3493 
3494 error:
3495 	return ret;
3496 }
3497 
3498 static inline int iommu_domain_cache_init(void)
3499 {
3500 	int ret = 0;
3501 
3502 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3503 					 sizeof(struct dmar_domain),
3504 					 0,
3505 					 SLAB_HWCACHE_ALIGN,
3507 					 NULL);
3508 	if (!iommu_domain_cache) {
3509 		pr_err("Couldn't create iommu_domain cache\n");
3510 		ret = -ENOMEM;
3511 	}
3512 
3513 	return ret;
3514 }
3515 
3516 static inline int iommu_devinfo_cache_init(void)
3517 {
3518 	int ret = 0;
3519 
3520 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3521 					 sizeof(struct device_domain_info),
3522 					 0,
3523 					 SLAB_HWCACHE_ALIGN,
3524 					 NULL);
3525 	if (!iommu_devinfo_cache) {
3526 		pr_err("Couldn't create devinfo cache\n");
3527 		ret = -ENOMEM;
3528 	}
3529 
3530 	return ret;
3531 }
3532 
3533 static int __init iommu_init_mempool(void)
3534 {
3535 	int ret;
3536 	ret = iova_cache_get();
3537 	if (ret)
3538 		return ret;
3539 
3540 	ret = iommu_domain_cache_init();
3541 	if (ret)
3542 		goto domain_error;
3543 
3544 	ret = iommu_devinfo_cache_init();
3545 	if (!ret)
3546 		return ret;
3547 
3548 	kmem_cache_destroy(iommu_domain_cache);
3549 domain_error:
3550 	iova_cache_put();
3551 
3552 	return -ENOMEM;
3553 }
3554 
3555 static void __init iommu_exit_mempool(void)
3556 {
3557 	kmem_cache_destroy(iommu_devinfo_cache);
3558 	kmem_cache_destroy(iommu_domain_cache);
3559 	iova_cache_put();
3560 }
3561 
3562 static void __init init_no_remapping_devices(void)
3563 {
3564 	struct dmar_drhd_unit *drhd;
3565 	struct device *dev;
3566 	int i;
3567 
3568 	for_each_drhd_unit(drhd) {
3569 		if (!drhd->include_all) {
3570 			for_each_active_dev_scope(drhd->devices,
3571 						  drhd->devices_cnt, i, dev)
3572 				break;
3573 			/* ignore DMAR unit if no devices exist */
3574 			if (i == drhd->devices_cnt)
3575 				drhd->ignored = 1;
3576 		}
3577 	}
3578 
3579 	for_each_active_drhd_unit(drhd) {
3580 		if (drhd->include_all)
3581 			continue;
3582 
3583 		for_each_active_dev_scope(drhd->devices,
3584 					  drhd->devices_cnt, i, dev)
3585 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3586 				break;
3587 		if (i < drhd->devices_cnt)
3588 			continue;
3589 
3590 		/* This IOMMU has *only* gfx devices. Either bypass it or
3591 		   set the gfx_dedicated flag, as appropriate */
3592 		drhd->gfx_dedicated = 1;
3593 		if (!dmar_map_gfx)
3594 			drhd->ignored = 1;
3595 	}
3596 }
3597 
3598 #ifdef CONFIG_SUSPEND
3599 static int init_iommu_hw(void)
3600 {
3601 	struct dmar_drhd_unit *drhd;
3602 	struct intel_iommu *iommu = NULL;
3603 
3604 	for_each_active_iommu(iommu, drhd)
3605 		if (iommu->qi)
3606 			dmar_reenable_qi(iommu);
3607 
3608 	for_each_iommu(iommu, drhd) {
3609 		if (drhd->ignored) {
3610 			/*
3611 			 * we always have to disable PMRs or DMA may fail on
3612 			 * this device
3613 			 */
3614 			if (force_on)
3615 				iommu_disable_protect_mem_regions(iommu);
3616 			continue;
3617 		}
3618 
3619 		iommu_flush_write_buffer(iommu);
3620 		iommu_set_root_entry(iommu);
3621 		iommu_enable_translation(iommu);
3622 		iommu_disable_protect_mem_regions(iommu);
3623 	}
3624 
3625 	return 0;
3626 }
3627 
3628 static void iommu_flush_all(void)
3629 {
3630 	struct dmar_drhd_unit *drhd;
3631 	struct intel_iommu *iommu;
3632 
3633 	for_each_active_iommu(iommu, drhd) {
3634 		iommu->flush.flush_context(iommu, 0, 0, 0,
3635 					   DMA_CCMD_GLOBAL_INVL);
3636 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3637 					 DMA_TLB_GLOBAL_FLUSH);
3638 	}
3639 }
3640 
3641 static int iommu_suspend(void)
3642 {
3643 	struct dmar_drhd_unit *drhd;
3644 	struct intel_iommu *iommu = NULL;
3645 	unsigned long flag;
3646 
3647 	for_each_active_iommu(iommu, drhd) {
3648 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3649 					     GFP_KERNEL);
3650 		if (!iommu->iommu_state)
3651 			goto nomem;
3652 	}
3653 
3654 	iommu_flush_all();
3655 
3656 	for_each_active_iommu(iommu, drhd) {
3657 		iommu_disable_translation(iommu);
3658 
3659 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3660 
3661 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3662 			readl(iommu->reg + DMAR_FECTL_REG);
3663 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3664 			readl(iommu->reg + DMAR_FEDATA_REG);
3665 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3666 			readl(iommu->reg + DMAR_FEADDR_REG);
3667 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3668 			readl(iommu->reg + DMAR_FEUADDR_REG);
3669 
3670 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3671 	}
3672 	return 0;
3673 
3674 nomem:
3675 	for_each_active_iommu(iommu, drhd)
3676 		kfree(iommu->iommu_state);
3677 
3678 	return -ENOMEM;
3679 }
3680 
3681 static void iommu_resume(void)
3682 {
3683 	struct dmar_drhd_unit *drhd;
3684 	struct intel_iommu *iommu = NULL;
3685 	unsigned long flag;
3686 
3687 	if (init_iommu_hw()) {
3688 		if (force_on)
3689 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3690 		else
3691 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3692 		return;
3693 	}
3694 
3695 	for_each_active_iommu(iommu, drhd) {
3696 
3697 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3698 
3699 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3700 			iommu->reg + DMAR_FECTL_REG);
3701 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3702 			iommu->reg + DMAR_FEDATA_REG);
3703 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3704 			iommu->reg + DMAR_FEADDR_REG);
3705 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3706 			iommu->reg + DMAR_FEUADDR_REG);
3707 
3708 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3709 	}
3710 
3711 	for_each_active_iommu(iommu, drhd)
3712 		kfree(iommu->iommu_state);
3713 }
3714 
3715 static struct syscore_ops iommu_syscore_ops = {
3716 	.resume		= iommu_resume,
3717 	.suspend	= iommu_suspend,
3718 };
3719 
3720 static void __init init_iommu_pm_ops(void)
3721 {
3722 	register_syscore_ops(&iommu_syscore_ops);
3723 }
3724 
3725 #else
3726 static inline void init_iommu_pm_ops(void) {}
3727 #endif	/* CONFIG_SUSPEND */
3728 
3729 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3730 {
3731 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3732 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3733 	    rmrr->end_address <= rmrr->base_address ||
3734 	    arch_rmrr_sanity_check(rmrr))
3735 		return -EINVAL;
3736 
3737 	return 0;
3738 }
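
/*
 * Example of what passes the check above (assuming the arch check also
 * passes): base_address = 0xdf000000, end_address = 0xdf0fffff is
 * accepted (both page aligned, end above base), while an RMRR ending
 * at 0xdf0fe000 is rejected because end_address + 1 is not page
 * aligned.
 */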
3739 
3740 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3741 {
3742 	struct acpi_dmar_reserved_memory *rmrr;
3743 	struct dmar_rmrr_unit *rmrru;
3744 
3745 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3746 	if (rmrr_sanity_check(rmrr)) {
3747 		pr_warn(FW_BUG
3748 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3749 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3750 			   rmrr->base_address, rmrr->end_address,
3751 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3752 			   dmi_get_system_info(DMI_BIOS_VERSION),
3753 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3754 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3755 	}
3756 
3757 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3758 	if (!rmrru)
3759 		goto out;
3760 
3761 	rmrru->hdr = header;
3762 
3763 	rmrru->base_address = rmrr->base_address;
3764 	rmrru->end_address = rmrr->end_address;
3765 
3766 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3767 				((void *)rmrr) + rmrr->header.length,
3768 				&rmrru->devices_cnt);
3769 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3770 		goto free_rmrru;
3771 
3772 	list_add(&rmrru->list, &dmar_rmrr_units);
3773 
3774 	return 0;
3775 free_rmrru:
3776 	kfree(rmrru);
3777 out:
3778 	return -ENOMEM;
3779 }
3780 
3781 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3782 {
3783 	struct dmar_atsr_unit *atsru;
3784 	struct acpi_dmar_atsr *tmp;
3785 
3786 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3787 				dmar_rcu_check()) {
3788 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3789 		if (atsr->segment != tmp->segment)
3790 			continue;
3791 		if (atsr->header.length != tmp->header.length)
3792 			continue;
3793 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3794 			return atsru;
3795 	}
3796 
3797 	return NULL;
3798 }
3799 
3800 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3801 {
3802 	struct acpi_dmar_atsr *atsr;
3803 	struct dmar_atsr_unit *atsru;
3804 
3805 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3806 		return 0;
3807 
3808 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3809 	atsru = dmar_find_atsr(atsr);
3810 	if (atsru)
3811 		return 0;
3812 
3813 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3814 	if (!atsru)
3815 		return -ENOMEM;
3816 
3817 	/*
3818 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3819 	 * copy the memory content because the memory buffer will be freed
3820 	 * on return.
3821 	 */
3822 	atsru->hdr = (void *)(atsru + 1);
3823 	memcpy(atsru->hdr, hdr, hdr->length);
3824 	atsru->include_all = atsr->flags & 0x1;
3825 	if (!atsru->include_all) {
3826 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3827 				(void *)atsr + atsr->header.length,
3828 				&atsru->devices_cnt);
3829 		if (atsru->devices_cnt && atsru->devices == NULL) {
3830 			kfree(atsru);
3831 			return -ENOMEM;
3832 		}
3833 	}
3834 
3835 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3836 
3837 	return 0;
3838 }
3839 
3840 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3841 {
3842 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3843 	kfree(atsru);
3844 }
3845 
3846 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3847 {
3848 	struct acpi_dmar_atsr *atsr;
3849 	struct dmar_atsr_unit *atsru;
3850 
3851 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3852 	atsru = dmar_find_atsr(atsr);
3853 	if (atsru) {
3854 		list_del_rcu(&atsru->list);
3855 		synchronize_rcu();
3856 		intel_iommu_free_atsr(atsru);
3857 	}
3858 
3859 	return 0;
3860 }
3861 
3862 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3863 {
3864 	int i;
3865 	struct device *dev;
3866 	struct acpi_dmar_atsr *atsr;
3867 	struct dmar_atsr_unit *atsru;
3868 
3869 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3870 	atsru = dmar_find_atsr(atsr);
3871 	if (!atsru)
3872 		return 0;
3873 
3874 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3875 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3876 					  i, dev)
3877 			return -EBUSY;
3878 	}
3879 
3880 	return 0;
3881 }
3882 
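/*
 * Look up an already-parsed SATC unit that matches the given ACPI SATC
 * structure (same segment, same length, identical contents).
 */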
3883 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3884 {
3885 	struct dmar_satc_unit *satcu;
3886 	struct acpi_dmar_satc *tmp;
3887 
3888 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3889 				dmar_rcu_check()) {
3890 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3891 		if (satc->segment != tmp->segment)
3892 			continue;
3893 		if (satc->header.length != tmp->header.length)
3894 			continue;
3895 		if (memcmp(satc, tmp, satc->header.length) == 0)
3896 			return satcu;
3897 	}
3898 
3899 	return NULL;
3900 }
3901 
3902 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3903 {
3904 	struct acpi_dmar_satc *satc;
3905 	struct dmar_satc_unit *satcu;
3906 
3907 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3908 		return 0;
3909 
3910 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3911 	satcu = dmar_find_satc(satc);
3912 	if (satcu)
3913 		return 0;
3914 
3915 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3916 	if (!satcu)
3917 		return -ENOMEM;
3918 
3919 	satcu->hdr = (void *)(satcu + 1);
3920 	memcpy(satcu->hdr, hdr, hdr->length);
3921 	satcu->atc_required = satc->flags & 0x1;
3922 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3923 					      (void *)satc + satc->header.length,
3924 					      &satcu->devices_cnt);
3925 	if (satcu->devices_cnt && !satcu->devices) {
3926 		kfree(satcu);
3927 		return -ENOMEM;
3928 	}
3929 	list_add_rcu(&satcu->list, &dmar_satc_units);
3930 
3931 	return 0;
3932 }
3933 
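/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate domain
 * IDs and a root entry, set up queued invalidation and the fault interrupt,
 * and finally enable translation.
 */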
3934 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3935 {
3936 	int sp, ret;
3937 	struct intel_iommu *iommu = dmaru->iommu;
3938 
3939 	if (g_iommus[iommu->seq_id])
3940 		return 0;
3941 
3942 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3943 	if (ret)
3944 		goto out;
3945 
3946 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3947 		pr_warn("%s: Doesn't support hardware pass through.\n",
3948 			iommu->name);
3949 		return -ENXIO;
3950 	}
3951 	if (!ecap_sc_support(iommu->ecap) &&
3952 	    domain_update_iommu_snooping(iommu)) {
3953 		pr_warn("%s: Doesn't support snooping.\n",
3954 			iommu->name);
3955 		return -ENXIO;
3956 	}
3957 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3958 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3959 		pr_warn("%s: Doesn't support large page.\n",
3960 			iommu->name);
3961 		return -ENXIO;
3962 	}
3963 
3964 	/*
3965 	 * Disable translation if already enabled prior to OS handover.
3966 	 */
3967 	if (iommu->gcmd & DMA_GCMD_TE)
3968 		iommu_disable_translation(iommu);
3969 
3970 	g_iommus[iommu->seq_id] = iommu;
3971 	ret = iommu_init_domains(iommu);
3972 	if (ret == 0)
3973 		ret = iommu_alloc_root_entry(iommu);
3974 	if (ret)
3975 		goto out;
3976 
3977 	intel_svm_check(iommu);
3978 
3979 	if (dmaru->ignored) {
3980 		/*
3981 		 * We always have to disable PMRs or DMA may fail on this device.
3982 		 */
3983 		if (force_on)
3984 			iommu_disable_protect_mem_regions(iommu);
3985 		return 0;
3986 	}
3987 
3988 	intel_iommu_init_qi(iommu);
3989 	iommu_flush_write_buffer(iommu);
3990 
3991 #ifdef CONFIG_INTEL_IOMMU_SVM
3992 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3993 		ret = intel_svm_enable_prq(iommu);
3994 		if (ret)
3995 			goto disable_iommu;
3996 	}
3997 #endif
3998 	ret = dmar_set_interrupt(iommu);
3999 	if (ret)
4000 		goto disable_iommu;
4001 
4002 	iommu_set_root_entry(iommu);
4003 	iommu_enable_translation(iommu);
4004 
4005 	iommu_disable_protect_mem_regions(iommu);
4006 	return 0;
4007 
4008 disable_iommu:
4009 	disable_dmar_iommu(iommu);
4010 out:
4011 	free_dmar_iommu(iommu);
4012 	return ret;
4013 }
4014 
4015 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4016 {
4017 	int ret = 0;
4018 	struct intel_iommu *iommu = dmaru->iommu;
4019 
4020 	if (!intel_iommu_enabled)
4021 		return 0;
4022 	if (iommu == NULL)
4023 		return -EINVAL;
4024 
4025 	if (insert) {
4026 		ret = intel_iommu_add(dmaru);
4027 	} else {
4028 		disable_dmar_iommu(iommu);
4029 		free_dmar_iommu(iommu);
4030 	}
4031 
4032 	return ret;
4033 }
4034 
4035 static void intel_iommu_free_dmars(void)
4036 {
4037 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4038 	struct dmar_atsr_unit *atsru, *atsr_n;
4039 	struct dmar_satc_unit *satcu, *satc_n;
4040 
4041 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4042 		list_del(&rmrru->list);
4043 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4044 		kfree(rmrru);
4045 	}
4046 
4047 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4048 		list_del(&atsru->list);
4049 		intel_iommu_free_atsr(atsru);
4050 	}
4051 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
4052 		list_del(&satcu->list);
4053 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
4054 		kfree(satcu);
4055 	}
4056 }
4057 
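/*
 * Walk up from @dev to its PCIe root port and check whether that port is
 * listed in an ATSR device scope (or covered by an include-all ATSR), i.e.
 * whether ATS may be used for this device.
 */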
4058 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4059 {
4060 	int i, ret = 1;
4061 	struct pci_bus *bus;
4062 	struct pci_dev *bridge = NULL;
4063 	struct device *tmp;
4064 	struct acpi_dmar_atsr *atsr;
4065 	struct dmar_atsr_unit *atsru;
4066 
4067 	dev = pci_physfn(dev);
4068 	for (bus = dev->bus; bus; bus = bus->parent) {
4069 		bridge = bus->self;
4070 		/* If it's an integrated device, allow ATS */
4071 		if (!bridge)
4072 			return 1;
4073 		/* Connected via non-PCIe: no ATS */
4074 		if (!pci_is_pcie(bridge) ||
4075 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4076 			return 0;
4077 		/* If we found the root port, look it up in the ATSR */
4078 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4079 			break;
4080 	}
4081 
4082 	rcu_read_lock();
4083 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4084 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4085 		if (atsr->segment != pci_domain_nr(dev->bus))
4086 			continue;
4087 
4088 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4089 			if (tmp == &bridge->dev)
4090 				goto out;
4091 
4092 		if (atsru->include_all)
4093 			goto out;
4094 	}
4095 	ret = 0;
4096 out:
4097 	rcu_read_unlock();
4098 
4099 	return ret;
4100 }
4101 
4102 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4103 {
4104 	int ret;
4105 	struct dmar_rmrr_unit *rmrru;
4106 	struct dmar_atsr_unit *atsru;
4107 	struct dmar_satc_unit *satcu;
4108 	struct acpi_dmar_atsr *atsr;
4109 	struct acpi_dmar_reserved_memory *rmrr;
4110 	struct acpi_dmar_satc *satc;
4111 
4112 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4113 		return 0;
4114 
4115 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4116 		rmrr = container_of(rmrru->hdr,
4117 				    struct acpi_dmar_reserved_memory, header);
4118 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4119 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4120 				((void *)rmrr) + rmrr->header.length,
4121 				rmrr->segment, rmrru->devices,
4122 				rmrru->devices_cnt);
4123 			if (ret < 0)
4124 				return ret;
4125 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4126 			dmar_remove_dev_scope(info, rmrr->segment,
4127 				rmrru->devices, rmrru->devices_cnt);
4128 		}
4129 	}
4130 
4131 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4132 		if (atsru->include_all)
4133 			continue;
4134 
4135 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4136 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4137 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4138 					(void *)atsr + atsr->header.length,
4139 					atsr->segment, atsru->devices,
4140 					atsru->devices_cnt);
4141 			if (ret > 0)
4142 				break;
4143 			else if (ret < 0)
4144 				return ret;
4145 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4146 			if (dmar_remove_dev_scope(info, atsr->segment,
4147 					atsru->devices, atsru->devices_cnt))
4148 				break;
4149 		}
4150 	}
4151 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4152 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4153 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4154 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4155 					(void *)satc + satc->header.length,
4156 					satc->segment, satcu->devices,
4157 					satcu->devices_cnt);
4158 			if (ret > 0)
4159 				break;
4160 			else if (ret < 0)
4161 				return ret;
4162 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4163 			if (dmar_remove_dev_scope(info, satc->segment,
4164 					satcu->devices, satcu->devices_cnt))
4165 				break;
4166 		}
4167 	}
4168 
4169 	return 0;
4170 }
4171 
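/*
 * Memory hotplug notifier: keep the si_domain identity map in sync with
 * memory being brought online or taken offline.
 */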
4172 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4173 				       unsigned long val, void *v)
4174 {
4175 	struct memory_notify *mhp = v;
4176 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4177 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4178 			mhp->nr_pages - 1);
4179 
4180 	switch (val) {
4181 	case MEM_GOING_ONLINE:
4182 		if (iommu_domain_identity_map(si_domain,
4183 					      start_vpfn, last_vpfn)) {
4184 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4185 				start_vpfn, last_vpfn);
4186 			return NOTIFY_BAD;
4187 		}
4188 		break;
4189 
4190 	case MEM_OFFLINE:
4191 	case MEM_CANCEL_ONLINE:
4192 		{
4193 			struct dmar_drhd_unit *drhd;
4194 			struct intel_iommu *iommu;
4195 			struct page *freelist;
4196 
4197 			freelist = domain_unmap(si_domain,
4198 						start_vpfn, last_vpfn,
4199 						NULL);
4200 
4201 			rcu_read_lock();
4202 			for_each_active_iommu(iommu, drhd)
4203 				iommu_flush_iotlb_psi(iommu, si_domain,
4204 					start_vpfn, mhp->nr_pages,
4205 					!freelist, 0);
4206 			rcu_read_unlock();
4207 			dma_free_pagelist(freelist);
4208 		}
4209 		break;
4210 	}
4211 
4212 	return NOTIFY_OK;
4213 }
4214 
4215 static struct notifier_block intel_iommu_memory_nb = {
4216 	.notifier_call = intel_iommu_memory_notifier,
4217 	.priority = 0
4218 };
4219 
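/* Disable DMA translation on every DMAR unit in the system. */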
4220 static void intel_disable_iommus(void)
4221 {
4222 	struct intel_iommu *iommu = NULL;
4223 	struct dmar_drhd_unit *drhd;
4224 
4225 	for_each_iommu(iommu, drhd)
4226 		iommu_disable_translation(iommu);
4227 }
4228 
4229 void intel_iommu_shutdown(void)
4230 {
4231 	struct dmar_drhd_unit *drhd;
4232 	struct intel_iommu *iommu = NULL;
4233 
4234 	if (no_iommu || dmar_disabled)
4235 		return;
4236 
4237 	down_write(&dmar_global_lock);
4238 
4239 	/* Disable PMRs explicitly here. */
4240 	for_each_iommu(iommu, drhd)
4241 		iommu_disable_protect_mem_regions(iommu);
4242 
4243 	/* Make sure the IOMMUs are switched off */
4244 	intel_disable_iommus();
4245 
4246 	up_write(&dmar_global_lock);
4247 }
4248 
4249 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4250 {
4251 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4252 
4253 	return container_of(iommu_dev, struct intel_iommu, iommu);
4254 }
4255 
4256 static ssize_t version_show(struct device *dev,
4257 			    struct device_attribute *attr, char *buf)
4258 {
4259 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4260 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4261 	return sprintf(buf, "%d:%d\n",
4262 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4263 }
4264 static DEVICE_ATTR_RO(version);
4265 
4266 static ssize_t address_show(struct device *dev,
4267 			    struct device_attribute *attr, char *buf)
4268 {
4269 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4270 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4271 }
4272 static DEVICE_ATTR_RO(address);
4273 
4274 static ssize_t cap_show(struct device *dev,
4275 			struct device_attribute *attr, char *buf)
4276 {
4277 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4278 	return sprintf(buf, "%llx\n", iommu->cap);
4279 }
4280 static DEVICE_ATTR_RO(cap);
4281 
4282 static ssize_t ecap_show(struct device *dev,
4283 			 struct device_attribute *attr, char *buf)
4284 {
4285 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4286 	return sprintf(buf, "%llx\n", iommu->ecap);
4287 }
4288 static DEVICE_ATTR_RO(ecap);
4289 
4290 static ssize_t domains_supported_show(struct device *dev,
4291 				      struct device_attribute *attr, char *buf)
4292 {
4293 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4294 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4295 }
4296 static DEVICE_ATTR_RO(domains_supported);
4297 
4298 static ssize_t domains_used_show(struct device *dev,
4299 				 struct device_attribute *attr, char *buf)
4300 {
4301 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4302 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4303 						  cap_ndoms(iommu->cap)));
4304 }
4305 static DEVICE_ATTR_RO(domains_used);
4306 
4307 static struct attribute *intel_iommu_attrs[] = {
4308 	&dev_attr_version.attr,
4309 	&dev_attr_address.attr,
4310 	&dev_attr_cap.attr,
4311 	&dev_attr_ecap.attr,
4312 	&dev_attr_domains_supported.attr,
4313 	&dev_attr_domains_used.attr,
4314 	NULL,
4315 };
4316 
4317 static struct attribute_group intel_iommu_group = {
4318 	.name = "intel-iommu",
4319 	.attrs = intel_iommu_attrs,
4320 };
4321 
4322 const struct attribute_group *intel_iommu_groups[] = {
4323 	&intel_iommu_group,
4324 	NULL,
4325 };
4326 
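/* Return true if any PCI device in the system is marked as external-facing. */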
4327 static inline bool has_external_pci(void)
4328 {
4329 	struct pci_dev *pdev = NULL;
4330 
4331 	for_each_pci_dev(pdev)
4332 		if (pdev->external_facing)
4333 			return true;
4334 
4335 	return false;
4336 }
4337 
4338 static int __init platform_optin_force_iommu(void)
4339 {
4340 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4341 		return 0;
4342 
4343 	if (no_iommu || dmar_disabled)
4344 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4345 
4346 	/*
4347 	 * If Intel-IOMMU is disabled by default, we will apply identity
4348 	 * map for all devices except those marked as being untrusted.
4349 	 */
4350 	if (dmar_disabled)
4351 		iommu_set_default_passthrough(false);
4352 
4353 	dmar_disabled = 0;
4354 	no_iommu = 0;
4355 
4356 	return 1;
4357 }
4358 
4359 static int __init probe_acpi_namespace_devices(void)
4360 {
4361 	struct dmar_drhd_unit *drhd;
4362 	/* To avoid a -Wunused-but-set-variable warning. */
4363 	struct intel_iommu *iommu __maybe_unused;
4364 	struct device *dev;
4365 	int i, ret = 0;
4366 
4367 	for_each_active_iommu(iommu, drhd) {
4368 		for_each_active_dev_scope(drhd->devices,
4369 					  drhd->devices_cnt, i, dev) {
4370 			struct acpi_device_physical_node *pn;
4371 			struct iommu_group *group;
4372 			struct acpi_device *adev;
4373 
4374 			if (dev->bus != &acpi_bus_type)
4375 				continue;
4376 
4377 			adev = to_acpi_device(dev);
4378 			mutex_lock(&adev->physical_node_lock);
4379 			list_for_each_entry(pn,
4380 					    &adev->physical_node_list, node) {
4381 				group = iommu_group_get(pn->dev);
4382 				if (group) {
4383 					iommu_group_put(group);
4384 					continue;
4385 				}
4386 
4387 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4388 				ret = iommu_probe_device(pn->dev);
4389 				if (ret)
4390 					break;
4391 			}
4392 			mutex_unlock(&adev->physical_node_lock);
4393 
4394 			if (ret)
4395 				return ret;
4396 		}
4397 	}
4398 
4399 	return 0;
4400 }
4401 
4402 int __init intel_iommu_init(void)
4403 {
4404 	int ret = -ENODEV;
4405 	struct dmar_drhd_unit *drhd;
4406 	struct intel_iommu *iommu;
4407 
4408 	/*
4409 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4410 	 * opt in, so enforce that.
4411 	 */
4412 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4413 		    platform_optin_force_iommu();
4414 
4415 	if (iommu_init_mempool()) {
4416 		if (force_on)
4417 			panic("tboot: Failed to initialize iommu memory\n");
4418 		return -ENOMEM;
4419 	}
4420 
4421 	down_write(&dmar_global_lock);
4422 	if (dmar_table_init()) {
4423 		if (force_on)
4424 			panic("tboot: Failed to initialize DMAR table\n");
4425 		goto out_free_dmar;
4426 	}
4427 
4428 	if (dmar_dev_scope_init() < 0) {
4429 		if (force_on)
4430 			panic("tboot: Failed to initialize DMAR device scope\n");
4431 		goto out_free_dmar;
4432 	}
4433 
4434 	up_write(&dmar_global_lock);
4435 
4436 	/*
4437 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4438 	 * complain later when we register it under the lock.
4439 	 */
4440 	dmar_register_bus_notifier();
4441 
4442 	down_write(&dmar_global_lock);
4443 
4444 	if (!no_iommu)
4445 		intel_iommu_debugfs_init();
4446 
4447 	if (no_iommu || dmar_disabled) {
4448 		/*
4449 		 * We exit the function here to ensure IOMMU's remapping and
4450 		 * mempool aren't setup, which means that the IOMMU's PMRs
4451 		 * won't be disabled via the call to init_dmars(). So disable
4452 		 * it explicitly here. The PMRs were setup by tboot prior to
4453 		 * calling SENTER, but the kernel is expected to reset/tear
4454 		 * down the PMRs.
4455 		 */
4456 		if (intel_iommu_tboot_noforce) {
4457 			for_each_iommu(iommu, drhd)
4458 				iommu_disable_protect_mem_regions(iommu);
4459 		}
4460 
4461 		/*
4462 		 * Make sure the IOMMUs are switched off, even when we
4463 		 * boot into a kexec kernel and the previous kernel left
4464 		 * them enabled
4465 		 */
4466 		intel_disable_iommus();
4467 		goto out_free_dmar;
4468 	}
4469 
4470 	if (list_empty(&dmar_rmrr_units))
4471 		pr_info("No RMRR found\n");
4472 
4473 	if (list_empty(&dmar_atsr_units))
4474 		pr_info("No ATSR found\n");
4475 
4476 	if (list_empty(&dmar_satc_units))
4477 		pr_info("No SATC found\n");
4478 
4479 	if (dmar_map_gfx)
4480 		intel_iommu_gfx_mapped = 1;
4481 
4482 	init_no_remapping_devices();
4483 
4484 	ret = init_dmars();
4485 	if (ret) {
4486 		if (force_on)
4487 			panic("tboot: Failed to initialize DMARs\n");
4488 		pr_err("Initialization failed\n");
4489 		goto out_free_dmar;
4490 	}
4491 	up_write(&dmar_global_lock);
4492 
4493 	init_iommu_pm_ops();
4494 
4495 	down_read(&dmar_global_lock);
4496 	for_each_active_iommu(iommu, drhd) {
4497 		/*
4498 		 * The flush queue implementation does not perform
4499 		 * page-selective invalidations that are required for efficient
4500 		 * TLB flushes in virtual environments.  The benefit of batching
4501 		 * is likely to be much lower than the overhead of synchronizing
4502 		 * the virtual and physical IOMMU page-tables.
4503 		 */
4504 		if (cap_caching_mode(iommu->cap)) {
4505 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4506 			iommu_set_dma_strict();
4507 		}
4508 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4509 				       intel_iommu_groups,
4510 				       "%s", iommu->name);
4511 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4512 	}
4513 	up_read(&dmar_global_lock);
4514 
4515 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4516 	if (si_domain && !hw_pass_through)
4517 		register_memory_notifier(&intel_iommu_memory_nb);
4518 
4519 	down_read(&dmar_global_lock);
4520 	if (probe_acpi_namespace_devices())
4521 		pr_warn("ACPI name space devices didn't probe correctly\n");
4522 
4523 	/* Finally, we enable the DMA remapping hardware. */
4524 	for_each_iommu(iommu, drhd) {
4525 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4526 			iommu_enable_translation(iommu);
4527 
4528 		iommu_disable_protect_mem_regions(iommu);
4529 	}
4530 	up_read(&dmar_global_lock);
4531 
4532 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4533 
4534 	intel_iommu_enabled = 1;
4535 
4536 	return 0;
4537 
4538 out_free_dmar:
4539 	intel_iommu_free_dmars();
4540 	up_write(&dmar_global_lock);
4541 	iommu_exit_mempool();
4542 	return ret;
4543 }
4544 
4545 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4546 {
4547 	struct device_domain_info *info = opaque;
4548 
4549 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4550 	return 0;
4551 }
4552 
4553 /*
4554  * NB - intel-iommu lacks any sort of reference counting for the users of
4555  * dependent devices.  If multiple endpoints have intersecting dependent
4556  * devices, unbinding the driver from any one of them will possibly leave
4557  * the others unable to operate.
4558  */
4559 static void domain_context_clear(struct device_domain_info *info)
4560 {
4561 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4562 		return;
4563 
4564 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4565 			       &domain_context_clear_one_cb, info);
4566 }
4567 
4568 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4569 {
4570 	struct dmar_domain *domain;
4571 	struct intel_iommu *iommu;
4572 	unsigned long flags;
4573 
4574 	assert_spin_locked(&device_domain_lock);
4575 
4576 	if (WARN_ON(!info))
4577 		return;
4578 
4579 	iommu = info->iommu;
4580 	domain = info->domain;
4581 
4582 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4583 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4584 			intel_pasid_tear_down_entry(iommu, info->dev,
4585 					PASID_RID2PASID, false);
4586 
4587 		iommu_disable_dev_iotlb(info);
4588 		domain_context_clear(info);
4589 		intel_pasid_free_table(info->dev);
4590 	}
4591 
4592 	unlink_domain_info(info);
4593 
4594 	spin_lock_irqsave(&iommu->lock, flags);
4595 	domain_detach_iommu(domain, iommu);
4596 	spin_unlock_irqrestore(&iommu->lock, flags);
4597 
4598 	free_devinfo_mem(info);
4599 }
4600 
4601 static void dmar_remove_one_dev_info(struct device *dev)
4602 {
4603 	struct device_domain_info *info;
4604 	unsigned long flags;
4605 
4606 	spin_lock_irqsave(&device_domain_lock, flags);
4607 	info = get_domain_info(dev);
4608 	if (info)
4609 		__dmar_remove_one_dev_info(info);
4610 	spin_unlock_irqrestore(&device_domain_lock, flags);
4611 }
4612 
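/*
 * Initialize a domain allocated through the IOMMU API: derive the AGAW from
 * @guest_width and allocate the top-level page table.
 */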
4613 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4614 {
4615 	int adjust_width;
4616 
4617 	/* calculate AGAW */
4618 	domain->gaw = guest_width;
4619 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4620 	domain->agaw = width_to_agaw(adjust_width);
4621 
4622 	domain->iommu_coherency = false;
4623 	domain->iommu_snooping = false;
4624 	domain->iommu_superpage = 0;
4625 	domain->max_addr = 0;
4626 
4627 	/* always allocate the top pgd */
4628 	domain->pgd = alloc_pgtable_page(domain->nid);
4629 	if (!domain->pgd)
4630 		return -ENOMEM;
4631 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4632 	return 0;
4633 }
4634 
4635 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4636 {
4637 	struct dmar_domain *dmar_domain;
4638 	struct iommu_domain *domain;
4639 
4640 	switch (type) {
4641 	case IOMMU_DOMAIN_DMA:
4642 	case IOMMU_DOMAIN_DMA_FQ:
4643 	case IOMMU_DOMAIN_UNMANAGED:
4644 		dmar_domain = alloc_domain(type);
4645 		if (!dmar_domain) {
4646 			pr_err("Can't allocate dmar_domain\n");
4647 			return NULL;
4648 		}
4649 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4650 			pr_err("Domain initialization failed\n");
4651 			domain_exit(dmar_domain);
4652 			return NULL;
4653 		}
4654 
4655 		domain = &dmar_domain->domain;
4656 		domain->geometry.aperture_start = 0;
4657 		domain->geometry.aperture_end   =
4658 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4659 		domain->geometry.force_aperture = true;
4660 
4661 		return domain;
4662 	case IOMMU_DOMAIN_IDENTITY:
4663 		return &si_domain->domain;
4664 	default:
4665 		return NULL;
4666 	}
4667 
4668 	return NULL;
4669 }
4670 
4671 static void intel_iommu_domain_free(struct iommu_domain *domain)
4672 {
4673 	if (domain != &si_domain->domain)
4674 		domain_exit(to_dmar_domain(domain));
4675 }
4676 
4677 /*
4678  * Check whether a @domain could be attached to the @dev through the
4679  * aux-domain attach/detach APIs.
4680  */
4681 static inline bool
4682 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4683 {
4684 	struct device_domain_info *info = get_domain_info(dev);
4685 
4686 	return info && info->auxd_enabled &&
4687 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4688 }
4689 
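/* Find the subdevice info for @dev in @domain's subdevice list, if any. */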
4690 static inline struct subdev_domain_info *
4691 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4692 {
4693 	struct subdev_domain_info *sinfo;
4694 
4695 	if (!list_empty(&domain->subdevices)) {
4696 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4697 			if (sinfo->pdev == dev)
4698 				return sinfo;
4699 		}
4700 	}
4701 
4702 	return NULL;
4703 }
4704 
4705 static int auxiliary_link_device(struct dmar_domain *domain,
4706 				 struct device *dev)
4707 {
4708 	struct device_domain_info *info = get_domain_info(dev);
4709 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4710 
4711 	assert_spin_locked(&device_domain_lock);
4712 	if (WARN_ON(!info))
4713 		return -EINVAL;
4714 
4715 	if (!sinfo) {
4716 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4717 		if (!sinfo)
4718 			return -ENOMEM;
4719 		sinfo->domain = domain;
4720 		sinfo->pdev = dev;
4721 		list_add(&sinfo->link_phys, &info->subdevices);
4722 		list_add(&sinfo->link_domain, &domain->subdevices);
4723 	}
4724 
4725 	return ++sinfo->users;
4726 }
4727 
4728 static int auxiliary_unlink_device(struct dmar_domain *domain,
4729 				   struct device *dev)
4730 {
4731 	struct device_domain_info *info = get_domain_info(dev);
4732 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4733 	int ret;
4734 
4735 	assert_spin_locked(&device_domain_lock);
4736 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4737 		return -EINVAL;
4738 
4739 	ret = --sinfo->users;
4740 	if (!ret) {
4741 		list_del(&sinfo->link_phys);
4742 		list_del(&sinfo->link_domain);
4743 		kfree(sinfo);
4744 	}
4745 
4746 	return ret;
4747 }
4748 
4749 static int aux_domain_add_dev(struct dmar_domain *domain,
4750 			      struct device *dev)
4751 {
4752 	int ret;
4753 	unsigned long flags;
4754 	struct intel_iommu *iommu;
4755 
4756 	iommu = device_to_iommu(dev, NULL, NULL);
4757 	if (!iommu)
4758 		return -ENODEV;
4759 
4760 	if (domain->default_pasid <= 0) {
4761 		u32 pasid;
4762 
4763 		/* No private data needed for the default pasid */
4764 		pasid = ioasid_alloc(NULL, PASID_MIN,
4765 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4766 				     NULL);
4767 		if (pasid == INVALID_IOASID) {
4768 			pr_err("Can't allocate default pasid\n");
4769 			return -ENODEV;
4770 		}
4771 		domain->default_pasid = pasid;
4772 	}
4773 
4774 	spin_lock_irqsave(&device_domain_lock, flags);
4775 	ret = auxiliary_link_device(domain, dev);
4776 	if (ret <= 0)
4777 		goto link_failed;
4778 
4779 	/*
4780 	 * Subdevices from the same physical device can be attached to the
4781 	 * same domain. For such cases, only the first subdevice attachment
4782 	 * needs to go through the full steps in this function. So if ret >
4783 	 * 1, just goto out.
4784 	 */
4785 	if (ret > 1)
4786 		goto out;
4787 
4788 	/*
4789 	 * iommu->lock must be held to attach domain to iommu and setup the
4790 	 * pasid entry for second level translation.
4791 	 */
4792 	spin_lock(&iommu->lock);
4793 	ret = domain_attach_iommu(domain, iommu);
4794 	if (ret)
4795 		goto attach_failed;
4796 
4797 	/* Set up the PASID entry for mediated devices: */
4798 	if (domain_use_first_level(domain))
4799 		ret = domain_setup_first_level(iommu, domain, dev,
4800 					       domain->default_pasid);
4801 	else
4802 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4803 						     domain->default_pasid);
4804 	if (ret)
4805 		goto table_failed;
4806 
4807 	spin_unlock(&iommu->lock);
4808 out:
4809 	spin_unlock_irqrestore(&device_domain_lock, flags);
4810 
4811 	return 0;
4812 
4813 table_failed:
4814 	domain_detach_iommu(domain, iommu);
4815 attach_failed:
4816 	spin_unlock(&iommu->lock);
4817 	auxiliary_unlink_device(domain, dev);
4818 link_failed:
4819 	spin_unlock_irqrestore(&device_domain_lock, flags);
4820 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4821 		ioasid_put(domain->default_pasid);
4822 
4823 	return ret;
4824 }
4825 
4826 static void aux_domain_remove_dev(struct dmar_domain *domain,
4827 				  struct device *dev)
4828 {
4829 	struct device_domain_info *info;
4830 	struct intel_iommu *iommu;
4831 	unsigned long flags;
4832 
4833 	if (!is_aux_domain(dev, &domain->domain))
4834 		return;
4835 
4836 	spin_lock_irqsave(&device_domain_lock, flags);
4837 	info = get_domain_info(dev);
4838 	iommu = info->iommu;
4839 
4840 	if (!auxiliary_unlink_device(domain, dev)) {
4841 		spin_lock(&iommu->lock);
4842 		intel_pasid_tear_down_entry(iommu, dev,
4843 					    domain->default_pasid, false);
4844 		domain_detach_iommu(domain, iommu);
4845 		spin_unlock(&iommu->lock);
4846 	}
4847 
4848 	spin_unlock_irqrestore(&device_domain_lock, flags);
4849 
4850 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4851 		ioasid_put(domain->default_pasid);
4852 }
4853 
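/*
 * Check that @dev's IOMMU can back @domain (nesting support, sufficient
 * address width) and trim extra page-table levels so that the domain fits
 * the IOMMU's AGAW.
 */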
4854 static int prepare_domain_attach_device(struct iommu_domain *domain,
4855 					struct device *dev)
4856 {
4857 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4858 	struct intel_iommu *iommu;
4859 	int addr_width;
4860 
4861 	iommu = device_to_iommu(dev, NULL, NULL);
4862 	if (!iommu)
4863 		return -ENODEV;
4864 
4865 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4866 	    !ecap_nest(iommu->ecap)) {
4867 		dev_err(dev, "%s: iommu does not support nested translation\n",
4868 			iommu->name);
4869 		return -EINVAL;
4870 	}
4871 
4872 	/* check if this iommu agaw is sufficient for max mapped address */
4873 	addr_width = agaw_to_width(iommu->agaw);
4874 	if (addr_width > cap_mgaw(iommu->cap))
4875 		addr_width = cap_mgaw(iommu->cap);
4876 
4877 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4878 		dev_err(dev, "%s: iommu width (%d) is not "
4879 		        "sufficient for the mapped address (%llx)\n",
4880 		        __func__, addr_width, dmar_domain->max_addr);
4881 		return -EFAULT;
4882 	}
4883 	dmar_domain->gaw = addr_width;
4884 
4885 	/*
4886 	 * Knock out extra levels of page tables if necessary
4887 	 */
4888 	while (iommu->agaw < dmar_domain->agaw) {
4889 		struct dma_pte *pte;
4890 
4891 		pte = dmar_domain->pgd;
4892 		if (dma_pte_present(pte)) {
4893 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4894 			free_pgtable_page(pte);
4895 		}
4896 		dmar_domain->agaw--;
4897 	}
4898 
4899 	return 0;
4900 }
4901 
4902 static int intel_iommu_attach_device(struct iommu_domain *domain,
4903 				     struct device *dev)
4904 {
4905 	int ret;
4906 
4907 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4908 	    device_is_rmrr_locked(dev)) {
4909 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4910 		return -EPERM;
4911 	}
4912 
4913 	if (is_aux_domain(dev, domain))
4914 		return -EPERM;
4915 
4916 	/* normally dev is not mapped */
4917 	if (unlikely(domain_context_mapped(dev))) {
4918 		struct dmar_domain *old_domain;
4919 
4920 		old_domain = find_domain(dev);
4921 		if (old_domain)
4922 			dmar_remove_one_dev_info(dev);
4923 	}
4924 
4925 	ret = prepare_domain_attach_device(domain, dev);
4926 	if (ret)
4927 		return ret;
4928 
4929 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4930 }
4931 
4932 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4933 					 struct device *dev)
4934 {
4935 	int ret;
4936 
4937 	if (!is_aux_domain(dev, domain))
4938 		return -EPERM;
4939 
4940 	ret = prepare_domain_attach_device(domain, dev);
4941 	if (ret)
4942 		return ret;
4943 
4944 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4945 }
4946 
4947 static void intel_iommu_detach_device(struct iommu_domain *domain,
4948 				      struct device *dev)
4949 {
4950 	dmar_remove_one_dev_info(dev);
4951 }
4952 
4953 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4954 					  struct device *dev)
4955 {
4956 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4957 }
4958 
4959 #ifdef CONFIG_INTEL_IOMMU_SVM
4960 /*
4961  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4962  * VT-d granularity. Invalidation is typically included in the unmap operation
4963  * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4964  * owns the first-level page tables. Invalidations of translation caches in the
4965  * guest are trapped and passed down to the host.
4966  *
4967  * vIOMMU in the guest will only expose first level page tables, therefore
4968  * we do not support IOTLB granularity for requests without a PASID (second level).
4969  *
4970  * For example, to find the VT-d granularity encoding for IOTLB
4971  * type and page selective granularity within PASID:
4972  * X: indexed by iommu cache type
4973  * Y: indexed by enum iommu_inv_granularity
4974  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4975  */
4976 
4977 static const int
4978 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4979 	/*
4980 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4981 	 * page selective (address granularity)
4982 	 */
4983 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4984 	/* PASID based dev TLBs */
4985 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4986 	/* PASID cache */
4987 	{-EINVAL, -EINVAL, -EINVAL}
4988 };
4989 
4990 static inline int to_vtd_granularity(int type, int granu)
4991 {
4992 	return inv_type_granu_table[type][granu];
4993 }
4994 
4995 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4996 {
4997 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4998 
4999 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4K, 9 for 2MB, etc.
5000 	 * The IOMMU cache invalidate API passes granu_size in bytes and the
5001 	 * number of granules of that size in contiguous memory.
5002 	 */
5003 	return order_base_2(nr_pages);
5004 }
5005 
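/*
 * Handle a cache invalidation request passed down from a guest vIOMMU:
 * translate the generic granularity into the VT-d encoding and issue the
 * corresponding IOTLB and device-IOTLB flushes.
 */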
5006 static int
5007 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5008 			   struct iommu_cache_invalidate_info *inv_info)
5009 {
5010 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5011 	struct device_domain_info *info;
5012 	struct intel_iommu *iommu;
5013 	unsigned long flags;
5014 	int cache_type;
5015 	u8 bus, devfn;
5016 	u16 did, sid;
5017 	int ret = 0;
5018 	u64 size = 0;
5019 
5020 	if (!inv_info || !dmar_domain)
5021 		return -EINVAL;
5022 
5023 	if (!dev || !dev_is_pci(dev))
5024 		return -ENODEV;
5025 
5026 	iommu = device_to_iommu(dev, &bus, &devfn);
5027 	if (!iommu)
5028 		return -ENODEV;
5029 
5030 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5031 		return -EINVAL;
5032 
5033 	spin_lock_irqsave(&device_domain_lock, flags);
5034 	spin_lock(&iommu->lock);
5035 	info = get_domain_info(dev);
5036 	if (!info) {
5037 		ret = -EINVAL;
5038 		goto out_unlock;
5039 	}
5040 	did = dmar_domain->iommu_did[iommu->seq_id];
5041 	sid = PCI_DEVID(bus, devfn);
5042 
5043 	/* Size is only valid in address selective invalidation */
5044 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5045 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5046 				   inv_info->granu.addr_info.nb_granules);
5047 
5048 	for_each_set_bit(cache_type,
5049 			 (unsigned long *)&inv_info->cache,
5050 			 IOMMU_CACHE_INV_TYPE_NR) {
5051 		int granu = 0;
5052 		u64 pasid = 0;
5053 		u64 addr = 0;
5054 
5055 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5056 		if (granu == -EINVAL) {
5057 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5058 					   cache_type, inv_info->granularity);
5059 			break;
5060 		}
5061 
5062 		/*
5063 		 * PASID is stored in different locations based on the
5064 		 * granularity.
5065 		 */
5066 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5067 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5068 			pasid = inv_info->granu.pasid_info.pasid;
5069 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5070 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5071 			pasid = inv_info->granu.addr_info.pasid;
5072 
5073 		switch (BIT(cache_type)) {
5074 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5075 			/* HW will ignore LSB bits based on address mask */
5076 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5077 			    size &&
5078 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5079 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5080 						   inv_info->granu.addr_info.addr, size);
5081 			}
5082 
5083 			/*
5084 			 * If granu is PASID-selective, address is ignored.
5085 			 * We use npages = -1 to indicate that.
5086 			 */
5087 			qi_flush_piotlb(iommu, did, pasid,
5088 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5089 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5090 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5091 
5092 			if (!info->ats_enabled)
5093 				break;
5094 			/*
5095 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5096 			 * in the guest may assume IOTLB flush is inclusive,
5097 			 * which is more efficient.
5098 			 */
5099 			fallthrough;
5100 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5101 			/*
5102 			 * PASID based device TLB invalidation does not support
5103 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5104 			 * IOMMU_INV_GRANU_ADDR.
5105 			 * The equivalent is to set the size to cover the entire
5106 			 * 64-bit address range. The user provides only PASID info
5107 			 * and no address info, so set addr to 0.
5108 			 */
5109 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5110 				size = 64 - VTD_PAGE_SHIFT;
5111 				addr = 0;
5112 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5113 				addr = inv_info->granu.addr_info.addr;
5114 			}
5115 
5116 			if (info->ats_enabled)
5117 				qi_flush_dev_iotlb_pasid(iommu, sid,
5118 						info->pfsid, pasid,
5119 						info->ats_qdep, addr,
5120 						size);
5121 			else
5122 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5123 			break;
5124 		default:
5125 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5126 					    cache_type);
5127 			ret = -EINVAL;
5128 		}
5129 	}
5130 out_unlock:
5131 	spin_unlock(&iommu->lock);
5132 	spin_unlock_irqrestore(&device_domain_lock, flags);
5133 
5134 	return ret;
5135 }
5136 #endif
5137 
5138 static int intel_iommu_map(struct iommu_domain *domain,
5139 			   unsigned long iova, phys_addr_t hpa,
5140 			   size_t size, int iommu_prot, gfp_t gfp)
5141 {
5142 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5143 	u64 max_addr;
5144 	int prot = 0;
5145 
5146 	if (iommu_prot & IOMMU_READ)
5147 		prot |= DMA_PTE_READ;
5148 	if (iommu_prot & IOMMU_WRITE)
5149 		prot |= DMA_PTE_WRITE;
5150 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5151 		prot |= DMA_PTE_SNP;
5152 
5153 	max_addr = iova + size;
5154 	if (dmar_domain->max_addr < max_addr) {
5155 		u64 end;
5156 
5157 		/* check if minimum agaw is sufficient for mapped address */
5158 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5159 		if (end < max_addr) {
5160 			pr_err("%s: iommu width (%d) is not "
5161 			       "sufficient for the mapped address (%llx)\n",
5162 			       __func__, dmar_domain->gaw, max_addr);
5163 			return -EFAULT;
5164 		}
5165 		dmar_domain->max_addr = max_addr;
5166 	}
5167 	/* Round up size to next multiple of PAGE_SIZE, if it and
5168 	   the low bits of hpa would take us onto the next page */
5169 	size = aligned_nrpages(hpa, size);
5170 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5171 				hpa >> VTD_PAGE_SHIFT, size, prot);
5172 }
5173 
5174 static int intel_iommu_map_pages(struct iommu_domain *domain,
5175 				 unsigned long iova, phys_addr_t paddr,
5176 				 size_t pgsize, size_t pgcount,
5177 				 int prot, gfp_t gfp, size_t *mapped)
5178 {
5179 	unsigned long pgshift = __ffs(pgsize);
5180 	size_t size = pgcount << pgshift;
5181 	int ret;
5182 
5183 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5184 		return -EINVAL;
5185 
5186 	if (!IS_ALIGNED(iova | paddr, pgsize))
5187 		return -EINVAL;
5188 
5189 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5190 	if (!ret && mapped)
5191 		*mapped = size;
5192 
5193 	return ret;
5194 }
5195 
5196 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5197 				unsigned long iova, size_t size,
5198 				struct iommu_iotlb_gather *gather)
5199 {
5200 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5201 	unsigned long start_pfn, last_pfn;
5202 	int level = 0;
5203 
5204 	/* Cope with horrid API which requires us to unmap more than the
5205 	   size argument if it happens to be a large-page mapping. */
5206 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5207 
5208 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5209 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5210 
5211 	start_pfn = iova >> VTD_PAGE_SHIFT;
5212 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5213 
5214 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5215 					last_pfn, gather->freelist);
5216 
5217 	if (dmar_domain->max_addr == iova + size)
5218 		dmar_domain->max_addr = iova;
5219 
5220 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5221 
5222 	return size;
5223 }
5224 
5225 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5226 				      unsigned long iova,
5227 				      size_t pgsize, size_t pgcount,
5228 				      struct iommu_iotlb_gather *gather)
5229 {
5230 	unsigned long pgshift = __ffs(pgsize);
5231 	size_t size = pgcount << pgshift;
5232 
5233 	return intel_iommu_unmap(domain, iova, size, gather);
5234 }
5235 
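/*
 * Flush the IOTLB for the range accumulated in @gather on every IOMMU the
 * domain is attached to, then free the gathered page-table pages.
 */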
5236 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5237 				 struct iommu_iotlb_gather *gather)
5238 {
5239 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5240 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5241 	size_t size = gather->end - gather->start;
5242 	unsigned long start_pfn;
5243 	unsigned long nrpages;
5244 	int iommu_id;
5245 
5246 	nrpages = aligned_nrpages(gather->start, size);
5247 	start_pfn = mm_to_dma_pfn(iova_pfn);
5248 
5249 	for_each_domain_iommu(iommu_id, dmar_domain)
5250 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5251 				      start_pfn, nrpages, !gather->freelist, 0);
5252 
5253 	dma_free_pagelist(gather->freelist);
5254 }
5255 
5256 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5257 					    dma_addr_t iova)
5258 {
5259 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5260 	struct dma_pte *pte;
5261 	int level = 0;
5262 	u64 phys = 0;
5263 
5264 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5265 	if (pte && dma_pte_present(pte))
5266 		phys = dma_pte_addr(pte) +
5267 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5268 						VTD_PAGE_SHIFT) - 1));
5269 
5270 	return phys;
5271 }
5272 
5273 static bool intel_iommu_capable(enum iommu_cap cap)
5274 {
5275 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5276 		return domain_update_iommu_snooping(NULL);
5277 	if (cap == IOMMU_CAP_INTR_REMAP)
5278 		return irq_remapping_enabled == 1;
5279 
5280 	return false;
5281 }
5282 
5283 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5284 {
5285 	struct intel_iommu *iommu;
5286 
5287 	iommu = device_to_iommu(dev, NULL, NULL);
5288 	if (!iommu)
5289 		return ERR_PTR(-ENODEV);
5290 
5291 	if (translation_pre_enabled(iommu))
5292 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5293 
5294 	return &iommu->iommu;
5295 }
5296 
5297 static void intel_iommu_release_device(struct device *dev)
5298 {
5299 	struct intel_iommu *iommu;
5300 
5301 	iommu = device_to_iommu(dev, NULL, NULL);
5302 	if (!iommu)
5303 		return;
5304 
5305 	dmar_remove_one_dev_info(dev);
5306 
5307 	set_dma_ops(dev, NULL);
5308 }
5309 
5310 static void intel_iommu_probe_finalize(struct device *dev)
5311 {
5312 	set_dma_ops(dev, NULL);
5313 	iommu_setup_dma_ops(dev, 0, U64_MAX);
5314 }
5315 
5316 static void intel_iommu_get_resv_regions(struct device *device,
5317 					 struct list_head *head)
5318 {
5319 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5320 	struct iommu_resv_region *reg;
5321 	struct dmar_rmrr_unit *rmrr;
5322 	struct device *i_dev;
5323 	int i;
5324 
5325 	down_read(&dmar_global_lock);
5326 	for_each_rmrr_units(rmrr) {
5327 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5328 					  i, i_dev) {
5329 			struct iommu_resv_region *resv;
5330 			enum iommu_resv_type type;
5331 			size_t length;
5332 
5333 			if (i_dev != device &&
5334 			    !is_downstream_to_pci_bridge(device, i_dev))
5335 				continue;
5336 
5337 			length = rmrr->end_address - rmrr->base_address + 1;
5338 
5339 			type = device_rmrr_is_relaxable(device) ?
5340 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5341 
5342 			resv = iommu_alloc_resv_region(rmrr->base_address,
5343 						       length, prot, type);
5344 			if (!resv)
5345 				break;
5346 
5347 			list_add_tail(&resv->list, head);
5348 		}
5349 	}
5350 	up_read(&dmar_global_lock);
5351 
5352 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5353 	if (dev_is_pci(device)) {
5354 		struct pci_dev *pdev = to_pci_dev(device);
5355 
5356 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5357 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5358 						   IOMMU_RESV_DIRECT_RELAXABLE);
5359 			if (reg)
5360 				list_add_tail(&reg->list, head);
5361 		}
5362 	}
5363 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5364 
5365 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5366 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5367 				      0, IOMMU_RESV_MSI);
5368 	if (!reg)
5369 		return;
5370 	list_add_tail(&reg->list, head);
5371 }
5372 
5373 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5374 {
5375 	struct device_domain_info *info;
5376 	struct context_entry *context;
5377 	struct dmar_domain *domain;
5378 	unsigned long flags;
5379 	u64 ctx_lo;
5380 	int ret;
5381 
5382 	domain = find_domain(dev);
5383 	if (!domain)
5384 		return -EINVAL;
5385 
5386 	spin_lock_irqsave(&device_domain_lock, flags);
5387 	spin_lock(&iommu->lock);
5388 
5389 	ret = -EINVAL;
5390 	info = get_domain_info(dev);
5391 	if (!info || !info->pasid_supported)
5392 		goto out;
5393 
5394 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5395 	if (WARN_ON(!context))
5396 		goto out;
5397 
5398 	ctx_lo = context[0].lo;
5399 
5400 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5401 		ctx_lo |= CONTEXT_PASIDE;
5402 		context[0].lo = ctx_lo;
5403 		wmb();
5404 		iommu->flush.flush_context(iommu,
5405 					   domain->iommu_did[iommu->seq_id],
5406 					   PCI_DEVID(info->bus, info->devfn),
5407 					   DMA_CCMD_MASK_NOBIT,
5408 					   DMA_CCMD_DEVICE_INVL);
5409 	}
5410 
5411 	/* Enable PASID support in the device, if it wasn't already */
5412 	if (!info->pasid_enabled)
5413 		iommu_enable_dev_iotlb(info);
5414 
5415 	ret = 0;
5416 
5417  out:
5418 	spin_unlock(&iommu->lock);
5419 	spin_unlock_irqrestore(&device_domain_lock, flags);
5420 
5421 	return ret;
5422 }
5423 
5424 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5425 {
5426 	if (dev_is_pci(dev))
5427 		return pci_device_group(dev);
5428 	return generic_device_group(dev);
5429 }
5430 
5431 static int intel_iommu_enable_auxd(struct device *dev)
5432 {
5433 	struct device_domain_info *info;
5434 	struct intel_iommu *iommu;
5435 	unsigned long flags;
5436 	int ret;
5437 
5438 	iommu = device_to_iommu(dev, NULL, NULL);
5439 	if (!iommu || dmar_disabled)
5440 		return -EINVAL;
5441 
5442 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5443 		return -EINVAL;
5444 
5445 	ret = intel_iommu_enable_pasid(iommu, dev);
5446 	if (ret)
5447 		return -ENODEV;
5448 
5449 	spin_lock_irqsave(&device_domain_lock, flags);
5450 	info = get_domain_info(dev);
5451 	info->auxd_enabled = 1;
5452 	spin_unlock_irqrestore(&device_domain_lock, flags);
5453 
5454 	return 0;
5455 }
5456 
5457 static int intel_iommu_disable_auxd(struct device *dev)
5458 {
5459 	struct device_domain_info *info;
5460 	unsigned long flags;
5461 
5462 	spin_lock_irqsave(&device_domain_lock, flags);
5463 	info = get_domain_info(dev);
5464 	if (!WARN_ON(!info))
5465 		info->auxd_enabled = 0;
5466 	spin_unlock_irqrestore(&device_domain_lock, flags);
5467 
5468 	return 0;
5469 }
5470 
5471 static int intel_iommu_enable_sva(struct device *dev)
5472 {
5473 	struct device_domain_info *info = get_domain_info(dev);
5474 	struct intel_iommu *iommu;
5475 	int ret;
5476 
5477 	if (!info || dmar_disabled)
5478 		return -EINVAL;
5479 
5480 	iommu = info->iommu;
5481 	if (!iommu)
5482 		return -EINVAL;
5483 
5484 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5485 		return -ENODEV;
5486 
5487 	if (intel_iommu_enable_pasid(iommu, dev))
5488 		return -ENODEV;
5489 
5490 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5491 		return -EINVAL;
5492 
5493 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5494 	if (!ret)
5495 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5496 
5497 	return ret;
5498 }
5499 
5500 static int intel_iommu_disable_sva(struct device *dev)
5501 {
5502 	struct device_domain_info *info = get_domain_info(dev);
5503 	struct intel_iommu *iommu = info->iommu;
5504 	int ret;
5505 
5506 	ret = iommu_unregister_device_fault_handler(dev);
5507 	if (!ret)
5508 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5509 
5510 	return ret;
5511 }
5512 
5513 static int intel_iommu_enable_iopf(struct device *dev)
5514 {
5515 	struct device_domain_info *info = get_domain_info(dev);
5516 
5517 	if (info && info->pri_supported)
5518 		return 0;
5519 
5520 	return -ENODEV;
5521 }
5522 
5523 static int
5524 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5525 {
5526 	switch (feat) {
5527 	case IOMMU_DEV_FEAT_AUX:
5528 		return intel_iommu_enable_auxd(dev);
5529 
5530 	case IOMMU_DEV_FEAT_IOPF:
5531 		return intel_iommu_enable_iopf(dev);
5532 
5533 	case IOMMU_DEV_FEAT_SVA:
5534 		return intel_iommu_enable_sva(dev);
5535 
5536 	default:
5537 		return -ENODEV;
5538 	}
5539 }
5540 
5541 static int
5542 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5543 {
5544 	switch (feat) {
5545 	case IOMMU_DEV_FEAT_AUX:
5546 		return intel_iommu_disable_auxd(dev);
5547 
5548 	case IOMMU_DEV_FEAT_IOPF:
5549 		return 0;
5550 
5551 	case IOMMU_DEV_FEAT_SVA:
5552 		return intel_iommu_disable_sva(dev);
5553 
5554 	default:
5555 		return -ENODEV;
5556 	}
5557 }
5558 
5559 static bool
5560 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5561 {
5562 	struct device_domain_info *info = get_domain_info(dev);
5563 
5564 	if (feat == IOMMU_DEV_FEAT_AUX)
5565 		return scalable_mode_support() && info && info->auxd_enabled;
5566 
5567 	return false;
5568 }
5569 
5570 static int
5571 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5572 {
5573 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5574 
5575 	return dmar_domain->default_pasid > 0 ?
5576 			dmar_domain->default_pasid : -EINVAL;
5577 }
5578 
5579 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5580 					   struct device *dev)
5581 {
5582 	return attach_deferred(dev);
5583 }
5584 
5585 static int
5586 intel_iommu_enable_nesting(struct iommu_domain *domain)
5587 {
5588 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5589 	unsigned long flags;
5590 	int ret = -ENODEV;
5591 
5592 	spin_lock_irqsave(&device_domain_lock, flags);
5593 	if (list_empty(&dmar_domain->devices)) {
5594 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5595 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5596 		ret = 0;
5597 	}
5598 	spin_unlock_irqrestore(&device_domain_lock, flags);
5599 
5600 	return ret;
5601 }
5602 
5603 /*
5604  * Check that the device does not live on an external-facing PCI port that is
5605  * marked as untrusted. Such devices must not be allowed to apply quirks and
5606  * thereby bypass the IOMMU restrictions.
5607  */
5608 static bool risky_device(struct pci_dev *pdev)
5609 {
5610 	if (pdev->untrusted) {
5611 		pci_info(pdev,
5612 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5613 			 pdev->vendor, pdev->device);
5614 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5615 		return true;
5616 	}
5617 	return false;
5618 }
5619 
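/*
 * After a successful map, notify every IOMMU the domain is attached to
 * about the newly mapped page range.
 */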
5620 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5621 				       unsigned long iova, size_t size)
5622 {
5623 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5624 	unsigned long pages = aligned_nrpages(iova, size);
5625 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5626 	struct intel_iommu *iommu;
5627 	int iommu_id;
5628 
5629 	for_each_domain_iommu(iommu_id, dmar_domain) {
5630 		iommu = g_iommus[iommu_id];
5631 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5632 	}
5633 }
5634 
5635 const struct iommu_ops intel_iommu_ops = {
5636 	.capable		= intel_iommu_capable,
5637 	.domain_alloc		= intel_iommu_domain_alloc,
5638 	.domain_free		= intel_iommu_domain_free,
5639 	.enable_nesting		= intel_iommu_enable_nesting,
5640 	.attach_dev		= intel_iommu_attach_device,
5641 	.detach_dev		= intel_iommu_detach_device,
5642 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5643 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5644 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5645 	.map_pages		= intel_iommu_map_pages,
5646 	.unmap_pages		= intel_iommu_unmap_pages,
5647 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5648 	.flush_iotlb_all        = intel_flush_iotlb_all,
5649 	.iotlb_sync		= intel_iommu_tlb_sync,
5650 	.iova_to_phys		= intel_iommu_iova_to_phys,
5651 	.probe_device		= intel_iommu_probe_device,
5652 	.probe_finalize		= intel_iommu_probe_finalize,
5653 	.release_device		= intel_iommu_release_device,
5654 	.get_resv_regions	= intel_iommu_get_resv_regions,
5655 	.put_resv_regions	= generic_iommu_put_resv_regions,
5656 	.device_group		= intel_iommu_device_group,
5657 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5658 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5659 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5660 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5661 	.def_domain_type	= device_def_domain_type,
5662 	.pgsize_bitmap		= SZ_4K,
5663 #ifdef CONFIG_INTEL_IOMMU_SVM
5664 	.cache_invalidate	= intel_iommu_sva_invalidate,
5665 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5666 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5667 	.sva_bind		= intel_svm_bind,
5668 	.sva_unbind		= intel_svm_unbind,
5669 	.sva_get_pasid		= intel_svm_get_pasid,
5670 	.page_response		= intel_svm_page_response,
5671 #endif
5672 };
5673 
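/*
 * Quirk for integrated graphics devices whose DMA-remapping support is
 * broken: clear dmar_map_gfx so remapping is not used for the GPU.
 */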
5674 static void quirk_iommu_igfx(struct pci_dev *dev)
5675 {
5676 	if (risky_device(dev))
5677 		return;
5678 
5679 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5680 	dmar_map_gfx = 0;
5681 }
5682 
5683 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5684 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5685 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5691 
5692 /* Broadwell integrated gfx malfunctions when DMA remapping is enabled. */
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5717 
5718 static void quirk_iommu_rwbf(struct pci_dev *dev)
5719 {
5720 	if (risky_device(dev))
5721 		return;
5722 
5723 	/*
5724 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5725 	 * but needs it. Same seems to hold for the desktop versions.
5726 	 */
5727 	pci_info(dev, "Forcing write-buffer flush capability\n");
5728 	rwbf_quirk = 1;
5729 }
5730 
5731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5736 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5737 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5738 
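/*
 * GGC is the GMCH graphics control register at config offset 0x52 on the
 * chipsets quirked below; per the values here, bits 11:8 describe how much
 * graphics stolen/GTT memory the BIOS reserved and whether space for a
 * VT-d shadow GTT was included (the *_VT encodings).
 */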
5739 #define GGC 0x52
5740 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5741 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5742 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5743 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5744 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5745 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5746 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5747 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5748 
5749 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5750 {
5751 	unsigned short ggc;
5752 
5753 	if (risky_device(dev))
5754 		return;
5755 
5756 	if (pci_read_config_word(dev, GGC, &ggc))
5757 		return;
5758 
5759 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5760 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5761 		dmar_map_gfx = 0;
5762 	} else if (dmar_map_gfx) {
5763 		/* we have to ensure the gfx device is idle before we flush */
5764 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5765 		iommu_set_dma_strict();
5766 	}
5767 }
5768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5772 
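/*
 * The integrated graphics devices matched below (by the upper byte of the
 * PCI device ID) need translation to stay enabled even where the driver
 * would normally disable it; iommu_skip_te_disable records that.
 */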
5773 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5774 {
5775 	unsigned short ver;
5776 
5777 	if (!IS_GFX_DEVICE(dev))
5778 		return;
5779 
5780 	ver = (dev->device >> 8) & 0xff;
5781 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5782 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5783 	    ver != 0x9a)
5784 		return;
5785 
5786 	if (risky_device(dev))
5787 		return;
5788 
5789 	pci_info(dev, "Skipping IOMMU disabling for graphics\n");
5790 	iommu_skip_te_disable = 1;
5791 }
5792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5793 
5794 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5795    ISOCH DMAR unit for the Azalia sound device, but not give it any
5796    TLB entries, which causes it to deadlock. Check for that.  We do
5797    this in a function called from init_dmars(), instead of in a PCI
5798    quirk, because we don't want to print the obnoxious "BIOS broken"
5799    message if VT-d is actually disabled.
5800 */
5801 static void __init check_tylersburg_isoch(void)
5802 {
5803 	struct pci_dev *pdev;
5804 	uint32_t vtisochctrl;
5805 
5806 	/* If there's no Azalia in the system anyway, forget it. */
5807 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5808 	if (!pdev)
5809 		return;
5810 
5811 	if (risky_device(pdev)) {
5812 		pci_dev_put(pdev);
5813 		return;
5814 	}
5815 
5816 	pci_dev_put(pdev);
5817 
5818 	/* System Management Registers. Might be hidden, in which case
5819 	   we can't do the sanity check. But that's OK, because the
5820 	   known-broken BIOSes _don't_ actually hide it, so far. */
5821 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5822 	if (!pdev)
5823 		return;
5824 
5825 	if (risky_device(pdev)) {
5826 		pci_dev_put(pdev);
5827 		return;
5828 	}
5829 
5830 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5831 		pci_dev_put(pdev);
5832 		return;
5833 	}
5834 
5835 	pci_dev_put(pdev);
5836 
5837 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5838 	if (vtisochctrl & 1)
5839 		return;
5840 
5841 	/* Drop all bits other than the number of TLB entries */
5842 	vtisochctrl &= 0x1c;
5843 
5844 	/* If we have the recommended number of TLB entries (16), fine. */
5845 	if (vtisochctrl == 0x10)
5846 		return;
5847 
5848 	/* Zero TLB entries means the Azalia device would deadlock; work around it. */
5849 	if (!vtisochctrl) {
5850 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5851 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5852 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5853 		     dmi_get_system_info(DMI_BIOS_VERSION),
5854 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5855 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5856 		return;
5857 	}
5858 
5859 	pr_warn("The recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5860 	       vtisochctrl);
5861 }
5862