xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 1f012283)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/intel-svm.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 
49 #include "../irq_remapping.h"
50 #include "../iommu-sva-lib.h"
51 #include "pasid.h"
52 #include "cap_audit.h"
53 
54 #define ROOT_SIZE		VTD_PAGE_SIZE
55 #define CONTEXT_SIZE		VTD_PAGE_SIZE
56 
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 
62 #define IOAPIC_RANGE_START	(0xfee00000)
63 #define IOAPIC_RANGE_END	(0xfeefffff)
64 #define IOVA_START_ADDR		(0x1000)
65 
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
77 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN		(1)
82 
83 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
84 
85 /* page table handling */
86 #define LEVEL_STRIDE		(9)
87 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
88 
89 static inline int agaw_to_level(int agaw)
90 {
91 	return agaw + 2;
92 }
93 
94 static inline int agaw_to_width(int agaw)
95 {
96 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
97 }
98 
99 static inline int width_to_agaw(int width)
100 {
101 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
102 }
103 
104 static inline unsigned int level_to_offset_bits(int level)
105 {
106 	return (level - 1) * LEVEL_STRIDE;
107 }
108 
109 static inline int pfn_level_offset(u64 pfn, int level)
110 {
111 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
112 }
113 
114 static inline u64 level_mask(int level)
115 {
116 	return -1ULL << level_to_offset_bits(level);
117 }
118 
119 static inline u64 level_size(int level)
120 {
121 	return 1ULL << level_to_offset_bits(level);
122 }
123 
124 static inline u64 align_to_level(u64 pfn, int level)
125 {
126 	return (pfn + level_size(level) - 1) & level_mask(level);
127 }
128 
129 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
130 {
131 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
132 }
133 
134 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
135    are never going to work. */
136 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
137 {
138 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
139 }
140 
141 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
142 {
143 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
144 }
145 static inline unsigned long page_to_dma_pfn(struct page *pg)
146 {
147 	return mm_to_dma_pfn(page_to_pfn(pg));
148 }
149 static inline unsigned long virt_to_dma_pfn(void *p)
150 {
151 	return page_to_dma_pfn(virt_to_page(p));
152 }
153 
154 /* global iommu list, set NULL for ignored DMAR units */
155 static struct intel_iommu **g_iommus;
156 
157 static void __init check_tylersburg_isoch(void);
158 static int rwbf_quirk;
159 static inline struct device_domain_info *
160 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
161 
162 /*
163  * set to 1 to panic kernel if can't successfully enable VT-d
164  * (used when kernel is launched w/ TXT)
165  */
166 static int force_on = 0;
167 static int intel_iommu_tboot_noforce;
168 static int no_platform_optin;
169 
170 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
171 
172 /*
173  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
174  * if marked present.
175  */
176 static phys_addr_t root_entry_lctp(struct root_entry *re)
177 {
178 	if (!(re->lo & 1))
179 		return 0;
180 
181 	return re->lo & VTD_PAGE_MASK;
182 }
183 
184 /*
185  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
186  * if marked present.
187  */
188 static phys_addr_t root_entry_uctp(struct root_entry *re)
189 {
190 	if (!(re->hi & 1))
191 		return 0;
192 
193 	return re->hi & VTD_PAGE_MASK;
194 }
195 
196 static inline void context_clear_pasid_enable(struct context_entry *context)
197 {
198 	context->lo &= ~(1ULL << 11);
199 }
200 
201 static inline bool context_pasid_enabled(struct context_entry *context)
202 {
203 	return !!(context->lo & (1ULL << 11));
204 }
205 
206 static inline void context_set_copied(struct context_entry *context)
207 {
208 	context->hi |= (1ull << 3);
209 }
210 
211 static inline bool context_copied(struct context_entry *context)
212 {
213 	return !!(context->hi & (1ULL << 3));
214 }
215 
216 static inline bool __context_present(struct context_entry *context)
217 {
218 	return (context->lo & 1);
219 }
220 
221 bool context_present(struct context_entry *context)
222 {
223 	return context_pasid_enabled(context) ?
224 	     __context_present(context) :
225 	     __context_present(context) && !context_copied(context);
226 }
227 
228 static inline void context_set_present(struct context_entry *context)
229 {
230 	context->lo |= 1;
231 }
232 
233 static inline void context_set_fault_enable(struct context_entry *context)
234 {
235 	context->lo &= (((u64)-1) << 2) | 1;
236 }
237 
238 static inline void context_set_translation_type(struct context_entry *context,
239 						unsigned long value)
240 {
241 	context->lo &= (((u64)-1) << 4) | 3;
242 	context->lo |= (value & 3) << 2;
243 }
244 
245 static inline void context_set_address_root(struct context_entry *context,
246 					    unsigned long value)
247 {
248 	context->lo &= ~VTD_PAGE_MASK;
249 	context->lo |= value & VTD_PAGE_MASK;
250 }
251 
252 static inline void context_set_address_width(struct context_entry *context,
253 					     unsigned long value)
254 {
255 	context->hi |= value & 7;
256 }
257 
258 static inline void context_set_domain_id(struct context_entry *context,
259 					 unsigned long value)
260 {
261 	context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263 
264 static inline int context_domain_id(struct context_entry *c)
265 {
266 	return((c->hi >> 8) & 0xffff);
267 }
268 
269 static inline void context_clear_entry(struct context_entry *context)
270 {
271 	context->lo = 0;
272 	context->hi = 0;
273 }
274 
275 /*
276  * This domain is a statically identity mapping domain.
277  *	1. This domain creats a static 1:1 mapping to all usable memory.
278  * 	2. It maps to each iommu if successful.
279  *	3. Each iommu mapps to this domain if successful.
280  */
281 static struct dmar_domain *si_domain;
282 static int hw_pass_through = 1;
283 
284 #define for_each_domain_iommu(idx, domain)			\
285 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
286 		if (domain->iommu_refcnt[idx])
287 
288 struct dmar_rmrr_unit {
289 	struct list_head list;		/* list of rmrr units	*/
290 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
291 	u64	base_address;		/* reserved base address*/
292 	u64	end_address;		/* reserved end address */
293 	struct dmar_dev_scope *devices;	/* target devices */
294 	int	devices_cnt;		/* target device count */
295 };
296 
297 struct dmar_atsr_unit {
298 	struct list_head list;		/* list of ATSR units */
299 	struct acpi_dmar_header *hdr;	/* ACPI header */
300 	struct dmar_dev_scope *devices;	/* target devices */
301 	int devices_cnt;		/* target device count */
302 	u8 include_all:1;		/* include all ports */
303 };
304 
305 struct dmar_satc_unit {
306 	struct list_head list;		/* list of SATC units */
307 	struct acpi_dmar_header *hdr;	/* ACPI header */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	struct intel_iommu *iommu;	/* the corresponding iommu */
310 	int devices_cnt;		/* target device count */
311 	u8 atc_required:1;		/* ATS is required */
312 };
313 
314 static LIST_HEAD(dmar_atsr_units);
315 static LIST_HEAD(dmar_rmrr_units);
316 static LIST_HEAD(dmar_satc_units);
317 
318 #define for_each_rmrr_units(rmrr) \
319 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
320 
321 /* bitmap for indexing intel_iommus */
322 static int g_num_of_iommus;
323 
324 static void domain_exit(struct dmar_domain *domain);
325 static void domain_remove_dev_info(struct dmar_domain *domain);
326 static void dmar_remove_one_dev_info(struct device *dev);
327 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
328 static int intel_iommu_attach_device(struct iommu_domain *domain,
329 				     struct device *dev);
330 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
331 					    dma_addr_t iova);
332 
333 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
334 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
335 
336 int intel_iommu_enabled = 0;
337 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
338 
339 static int dmar_map_gfx = 1;
340 static int intel_iommu_superpage = 1;
341 static int iommu_identity_mapping;
342 static int iommu_skip_te_disable;
343 
344 #define IDENTMAP_GFX		2
345 #define IDENTMAP_AZALIA		4
346 
347 int intel_iommu_gfx_mapped;
348 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
349 
350 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
351 struct device_domain_info *get_domain_info(struct device *dev)
352 {
353 	struct device_domain_info *info;
354 
355 	if (!dev)
356 		return NULL;
357 
358 	info = dev_iommu_priv_get(dev);
359 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
360 		return NULL;
361 
362 	return info;
363 }
364 
365 DEFINE_SPINLOCK(device_domain_lock);
366 static LIST_HEAD(device_domain_list);
367 
368 /*
369  * Iterate over elements in device_domain_list and call the specified
370  * callback @fn against each element.
371  */
372 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
373 				     void *data), void *data)
374 {
375 	int ret = 0;
376 	unsigned long flags;
377 	struct device_domain_info *info;
378 
379 	spin_lock_irqsave(&device_domain_lock, flags);
380 	list_for_each_entry(info, &device_domain_list, global) {
381 		ret = fn(info, data);
382 		if (ret) {
383 			spin_unlock_irqrestore(&device_domain_lock, flags);
384 			return ret;
385 		}
386 	}
387 	spin_unlock_irqrestore(&device_domain_lock, flags);
388 
389 	return 0;
390 }
391 
392 const struct iommu_ops intel_iommu_ops;
393 
394 static bool translation_pre_enabled(struct intel_iommu *iommu)
395 {
396 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
397 }
398 
399 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
400 {
401 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
402 }
403 
404 static void init_translation_status(struct intel_iommu *iommu)
405 {
406 	u32 gsts;
407 
408 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
409 	if (gsts & DMA_GSTS_TES)
410 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
411 }
412 
413 static int __init intel_iommu_setup(char *str)
414 {
415 	if (!str)
416 		return -EINVAL;
417 
418 	while (*str) {
419 		if (!strncmp(str, "on", 2)) {
420 			dmar_disabled = 0;
421 			pr_info("IOMMU enabled\n");
422 		} else if (!strncmp(str, "off", 3)) {
423 			dmar_disabled = 1;
424 			no_platform_optin = 1;
425 			pr_info("IOMMU disabled\n");
426 		} else if (!strncmp(str, "igfx_off", 8)) {
427 			dmar_map_gfx = 0;
428 			pr_info("Disable GFX device mapping\n");
429 		} else if (!strncmp(str, "forcedac", 8)) {
430 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
431 			iommu_dma_forcedac = true;
432 		} else if (!strncmp(str, "strict", 6)) {
433 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
434 			iommu_set_dma_strict();
435 		} else if (!strncmp(str, "sp_off", 6)) {
436 			pr_info("Disable supported super page\n");
437 			intel_iommu_superpage = 0;
438 		} else if (!strncmp(str, "sm_on", 5)) {
439 			pr_info("Enable scalable mode if hardware supports\n");
440 			intel_iommu_sm = 1;
441 		} else if (!strncmp(str, "sm_off", 6)) {
442 			pr_info("Scalable mode is disallowed\n");
443 			intel_iommu_sm = 0;
444 		} else if (!strncmp(str, "tboot_noforce", 13)) {
445 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
446 			intel_iommu_tboot_noforce = 1;
447 		} else {
448 			pr_notice("Unknown option - '%s'\n", str);
449 		}
450 
451 		str += strcspn(str, ",");
452 		while (*str == ',')
453 			str++;
454 	}
455 
456 	return 1;
457 }
458 __setup("intel_iommu=", intel_iommu_setup);
459 
460 static struct kmem_cache *iommu_domain_cache;
461 static struct kmem_cache *iommu_devinfo_cache;
462 
463 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
464 {
465 	struct dmar_domain **domains;
466 	int idx = did >> 8;
467 
468 	domains = iommu->domains[idx];
469 	if (!domains)
470 		return NULL;
471 
472 	return domains[did & 0xff];
473 }
474 
475 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
476 			     struct dmar_domain *domain)
477 {
478 	struct dmar_domain **domains;
479 	int idx = did >> 8;
480 
481 	if (!iommu->domains[idx]) {
482 		size_t size = 256 * sizeof(struct dmar_domain *);
483 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
484 	}
485 
486 	domains = iommu->domains[idx];
487 	if (WARN_ON(!domains))
488 		return;
489 	else
490 		domains[did & 0xff] = domain;
491 }
492 
493 void *alloc_pgtable_page(int node)
494 {
495 	struct page *page;
496 	void *vaddr = NULL;
497 
498 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
499 	if (page)
500 		vaddr = page_address(page);
501 	return vaddr;
502 }
503 
504 void free_pgtable_page(void *vaddr)
505 {
506 	free_page((unsigned long)vaddr);
507 }
508 
509 static inline void *alloc_domain_mem(void)
510 {
511 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
512 }
513 
514 static void free_domain_mem(void *vaddr)
515 {
516 	kmem_cache_free(iommu_domain_cache, vaddr);
517 }
518 
519 static inline void * alloc_devinfo_mem(void)
520 {
521 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
522 }
523 
524 static inline void free_devinfo_mem(void *vaddr)
525 {
526 	kmem_cache_free(iommu_devinfo_cache, vaddr);
527 }
528 
529 static inline int domain_type_is_si(struct dmar_domain *domain)
530 {
531 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
532 }
533 
534 static inline bool domain_use_first_level(struct dmar_domain *domain)
535 {
536 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
537 }
538 
539 static inline int domain_pfn_supported(struct dmar_domain *domain,
540 				       unsigned long pfn)
541 {
542 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
543 
544 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
545 }
546 
547 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
548 {
549 	unsigned long sagaw;
550 	int agaw;
551 
552 	sagaw = cap_sagaw(iommu->cap);
553 	for (agaw = width_to_agaw(max_gaw);
554 	     agaw >= 0; agaw--) {
555 		if (test_bit(agaw, &sagaw))
556 			break;
557 	}
558 
559 	return agaw;
560 }
561 
562 /*
563  * Calculate max SAGAW for each iommu.
564  */
565 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
566 {
567 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
568 }
569 
570 /*
571  * calculate agaw for each iommu.
572  * "SAGAW" may be different across iommus, use a default agaw, and
573  * get a supported less agaw for iommus that don't support the default agaw.
574  */
575 int iommu_calculate_agaw(struct intel_iommu *iommu)
576 {
577 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
578 }
579 
580 /* This functionin only returns single iommu in a domain */
581 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
582 {
583 	int iommu_id;
584 
585 	/* si_domain and vm domain should not get here. */
586 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
587 		return NULL;
588 
589 	for_each_domain_iommu(iommu_id, domain)
590 		break;
591 
592 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
593 		return NULL;
594 
595 	return g_iommus[iommu_id];
596 }
597 
598 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
599 {
600 	return sm_supported(iommu) ?
601 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
602 }
603 
604 static void domain_update_iommu_coherency(struct dmar_domain *domain)
605 {
606 	struct dmar_drhd_unit *drhd;
607 	struct intel_iommu *iommu;
608 	bool found = false;
609 	int i;
610 
611 	domain->iommu_coherency = true;
612 
613 	for_each_domain_iommu(i, domain) {
614 		found = true;
615 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
616 			domain->iommu_coherency = false;
617 			break;
618 		}
619 	}
620 	if (found)
621 		return;
622 
623 	/* No hardware attached; use lowest common denominator */
624 	rcu_read_lock();
625 	for_each_active_iommu(iommu, drhd) {
626 		if (!iommu_paging_structure_coherency(iommu)) {
627 			domain->iommu_coherency = false;
628 			break;
629 		}
630 	}
631 	rcu_read_unlock();
632 }
633 
634 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
635 {
636 	struct dmar_drhd_unit *drhd;
637 	struct intel_iommu *iommu;
638 	bool ret = true;
639 
640 	rcu_read_lock();
641 	for_each_active_iommu(iommu, drhd) {
642 		if (iommu != skip) {
643 			/*
644 			 * If the hardware is operating in the scalable mode,
645 			 * the snooping control is always supported since we
646 			 * always set PASID-table-entry.PGSNP bit if the domain
647 			 * is managed outside (UNMANAGED).
648 			 */
649 			if (!sm_supported(iommu) &&
650 			    !ecap_sc_support(iommu->ecap)) {
651 				ret = false;
652 				break;
653 			}
654 		}
655 	}
656 	rcu_read_unlock();
657 
658 	return ret;
659 }
660 
661 static int domain_update_iommu_superpage(struct dmar_domain *domain,
662 					 struct intel_iommu *skip)
663 {
664 	struct dmar_drhd_unit *drhd;
665 	struct intel_iommu *iommu;
666 	int mask = 0x3;
667 
668 	if (!intel_iommu_superpage)
669 		return 0;
670 
671 	/* set iommu_superpage to the smallest common denominator */
672 	rcu_read_lock();
673 	for_each_active_iommu(iommu, drhd) {
674 		if (iommu != skip) {
675 			if (domain && domain_use_first_level(domain)) {
676 				if (!cap_fl1gp_support(iommu->cap))
677 					mask = 0x1;
678 			} else {
679 				mask &= cap_super_page_val(iommu->cap);
680 			}
681 
682 			if (!mask)
683 				break;
684 		}
685 	}
686 	rcu_read_unlock();
687 
688 	return fls(mask);
689 }
690 
691 static int domain_update_device_node(struct dmar_domain *domain)
692 {
693 	struct device_domain_info *info;
694 	int nid = NUMA_NO_NODE;
695 
696 	assert_spin_locked(&device_domain_lock);
697 
698 	if (list_empty(&domain->devices))
699 		return NUMA_NO_NODE;
700 
701 	list_for_each_entry(info, &domain->devices, link) {
702 		if (!info->dev)
703 			continue;
704 
705 		/*
706 		 * There could possibly be multiple device numa nodes as devices
707 		 * within the same domain may sit behind different IOMMUs. There
708 		 * isn't perfect answer in such situation, so we select first
709 		 * come first served policy.
710 		 */
711 		nid = dev_to_node(info->dev);
712 		if (nid != NUMA_NO_NODE)
713 			break;
714 	}
715 
716 	return nid;
717 }
718 
719 static void domain_update_iotlb(struct dmar_domain *domain);
720 
721 /* Return the super pagesize bitmap if supported. */
722 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
723 {
724 	unsigned long bitmap = 0;
725 
726 	/*
727 	 * 1-level super page supports page size of 2MiB, 2-level super page
728 	 * supports page size of both 2MiB and 1GiB.
729 	 */
730 	if (domain->iommu_superpage == 1)
731 		bitmap |= SZ_2M;
732 	else if (domain->iommu_superpage == 2)
733 		bitmap |= SZ_2M | SZ_1G;
734 
735 	return bitmap;
736 }
737 
738 /* Some capabilities may be different across iommus */
739 static void domain_update_iommu_cap(struct dmar_domain *domain)
740 {
741 	domain_update_iommu_coherency(domain);
742 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
743 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
744 
745 	/*
746 	 * If RHSA is missing, we should default to the device numa domain
747 	 * as fall back.
748 	 */
749 	if (domain->nid == NUMA_NO_NODE)
750 		domain->nid = domain_update_device_node(domain);
751 
752 	/*
753 	 * First-level translation restricts the input-address to a
754 	 * canonical address (i.e., address bits 63:N have the same
755 	 * value as address bit [N-1], where N is 48-bits with 4-level
756 	 * paging and 57-bits with 5-level paging). Hence, skip bit
757 	 * [N-1].
758 	 */
759 	if (domain_use_first_level(domain))
760 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
761 	else
762 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
763 
764 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
765 	domain_update_iotlb(domain);
766 }
767 
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 					 u8 devfn, int alloc)
770 {
771 	struct root_entry *root = &iommu->root_entry[bus];
772 	struct context_entry *context;
773 	u64 *entry;
774 
775 	entry = &root->lo;
776 	if (sm_supported(iommu)) {
777 		if (devfn >= 0x80) {
778 			devfn -= 0x80;
779 			entry = &root->hi;
780 		}
781 		devfn *= 2;
782 	}
783 	if (*entry & 1)
784 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 	else {
786 		unsigned long phy_addr;
787 		if (!alloc)
788 			return NULL;
789 
790 		context = alloc_pgtable_page(iommu->node);
791 		if (!context)
792 			return NULL;
793 
794 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 		phy_addr = virt_to_phys((void *)context);
796 		*entry = phy_addr | 1;
797 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
798 	}
799 	return &context[devfn];
800 }
801 
802 static bool attach_deferred(struct device *dev)
803 {
804 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805 }
806 
807 /**
808  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809  *				 sub-hierarchy of a candidate PCI-PCI bridge
810  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811  * @bridge: the candidate PCI-PCI bridge
812  *
813  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814  */
815 static bool
816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817 {
818 	struct pci_dev *pdev, *pbridge;
819 
820 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 		return false;
822 
823 	pdev = to_pci_dev(dev);
824 	pbridge = to_pci_dev(bridge);
825 
826 	if (pbridge->subordinate &&
827 	    pbridge->subordinate->number <= pdev->bus->number &&
828 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 		return true;
830 
831 	return false;
832 }
833 
834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835 {
836 	struct dmar_drhd_unit *drhd;
837 	u32 vtbar;
838 	int rc;
839 
840 	/* We know that this device on this chipset has its own IOMMU.
841 	 * If we find it under a different IOMMU, then the BIOS is lying
842 	 * to us. Hope that the IOMMU for this device is actually
843 	 * disabled, and it needs no translation...
844 	 */
845 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 	if (rc) {
847 		/* "can't" happen */
848 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 		return false;
850 	}
851 	vtbar &= 0xffff0000;
852 
853 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
854 	drhd = dmar_find_matched_drhd_unit(pdev);
855 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 		return true;
859 	}
860 
861 	return false;
862 }
863 
864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865 {
866 	if (!iommu || iommu->drhd->ignored)
867 		return true;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pdev = to_pci_dev(dev);
871 
872 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 		    quirk_ioat_snb_local_iommu(pdev))
875 			return true;
876 	}
877 
878 	return false;
879 }
880 
881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882 {
883 	struct dmar_drhd_unit *drhd = NULL;
884 	struct pci_dev *pdev = NULL;
885 	struct intel_iommu *iommu;
886 	struct device *tmp;
887 	u16 segment = 0;
888 	int i;
889 
890 	if (!dev)
891 		return NULL;
892 
893 	if (dev_is_pci(dev)) {
894 		struct pci_dev *pf_pdev;
895 
896 		pdev = pci_real_dma_dev(to_pci_dev(dev));
897 
898 		/* VFs aren't listed in scope tables; we need to look up
899 		 * the PF instead to find the IOMMU. */
900 		pf_pdev = pci_physfn(pdev);
901 		dev = &pf_pdev->dev;
902 		segment = pci_domain_nr(pdev->bus);
903 	} else if (has_acpi_companion(dev))
904 		dev = &ACPI_COMPANION(dev)->dev;
905 
906 	rcu_read_lock();
907 	for_each_iommu(iommu, drhd) {
908 		if (pdev && segment != drhd->segment)
909 			continue;
910 
911 		for_each_active_dev_scope(drhd->devices,
912 					  drhd->devices_cnt, i, tmp) {
913 			if (tmp == dev) {
914 				/* For a VF use its original BDF# not that of the PF
915 				 * which we used for the IOMMU lookup. Strictly speaking
916 				 * we could do this for all PCI devices; we only need to
917 				 * get the BDF# from the scope table for ACPI matches. */
918 				if (pdev && pdev->is_virtfn)
919 					goto got_pdev;
920 
921 				if (bus && devfn) {
922 					*bus = drhd->devices[i].bus;
923 					*devfn = drhd->devices[i].devfn;
924 				}
925 				goto out;
926 			}
927 
928 			if (is_downstream_to_pci_bridge(dev, tmp))
929 				goto got_pdev;
930 		}
931 
932 		if (pdev && drhd->include_all) {
933 		got_pdev:
934 			if (bus && devfn) {
935 				*bus = pdev->bus->number;
936 				*devfn = pdev->devfn;
937 			}
938 			goto out;
939 		}
940 	}
941 	iommu = NULL;
942  out:
943 	if (iommu_is_dummy(iommu, dev))
944 		iommu = NULL;
945 
946 	rcu_read_unlock();
947 
948 	return iommu;
949 }
950 
951 static void domain_flush_cache(struct dmar_domain *domain,
952 			       void *addr, int size)
953 {
954 	if (!domain->iommu_coherency)
955 		clflush_cache_range(addr, size);
956 }
957 
958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959 {
960 	struct context_entry *context;
961 	int ret = 0;
962 	unsigned long flags;
963 
964 	spin_lock_irqsave(&iommu->lock, flags);
965 	context = iommu_context_addr(iommu, bus, devfn, 0);
966 	if (context)
967 		ret = context_present(context);
968 	spin_unlock_irqrestore(&iommu->lock, flags);
969 	return ret;
970 }
971 
972 static void free_context_table(struct intel_iommu *iommu)
973 {
974 	int i;
975 	unsigned long flags;
976 	struct context_entry *context;
977 
978 	spin_lock_irqsave(&iommu->lock, flags);
979 	if (!iommu->root_entry) {
980 		goto out;
981 	}
982 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 		context = iommu_context_addr(iommu, i, 0, 0);
984 		if (context)
985 			free_pgtable_page(context);
986 
987 		if (!sm_supported(iommu))
988 			continue;
989 
990 		context = iommu_context_addr(iommu, i, 0x80, 0);
991 		if (context)
992 			free_pgtable_page(context);
993 
994 	}
995 	free_pgtable_page(iommu->root_entry);
996 	iommu->root_entry = NULL;
997 out:
998 	spin_unlock_irqrestore(&iommu->lock, flags);
999 }
1000 
1001 #ifdef CONFIG_DMAR_DEBUG
1002 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
1003 {
1004 	struct device_domain_info *info;
1005 	struct dma_pte *parent, *pte;
1006 	struct dmar_domain *domain;
1007 	int offset, level;
1008 
1009 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
1010 	if (!info || !info->domain) {
1011 		pr_info("device [%02x:%02x.%d] not probed\n",
1012 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1013 		return;
1014 	}
1015 
1016 	domain = info->domain;
1017 	level = agaw_to_level(domain->agaw);
1018 	parent = domain->pgd;
1019 	if (!parent) {
1020 		pr_info("no page table setup\n");
1021 		return;
1022 	}
1023 
1024 	while (1) {
1025 		offset = pfn_level_offset(pfn, level);
1026 		pte = &parent[offset];
1027 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
1028 			pr_info("PTE not present at level %d\n", level);
1029 			break;
1030 		}
1031 
1032 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
1033 
1034 		if (level == 1)
1035 			break;
1036 
1037 		parent = phys_to_virt(dma_pte_addr(pte));
1038 		level--;
1039 	}
1040 }
1041 
1042 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
1043 			  unsigned long long addr, u32 pasid)
1044 {
1045 	struct pasid_dir_entry *dir, *pde;
1046 	struct pasid_entry *entries, *pte;
1047 	struct context_entry *ctx_entry;
1048 	struct root_entry *rt_entry;
1049 	u8 devfn = source_id & 0xff;
1050 	u8 bus = source_id >> 8;
1051 	int i, dir_index, index;
1052 
1053 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
1054 
1055 	/* root entry dump */
1056 	rt_entry = &iommu->root_entry[bus];
1057 	if (!rt_entry) {
1058 		pr_info("root table entry is not present\n");
1059 		return;
1060 	}
1061 
1062 	if (sm_supported(iommu))
1063 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
1064 			rt_entry->hi, rt_entry->lo);
1065 	else
1066 		pr_info("root entry: 0x%016llx", rt_entry->lo);
1067 
1068 	/* context entry dump */
1069 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
1070 	if (!ctx_entry) {
1071 		pr_info("context table entry is not present\n");
1072 		return;
1073 	}
1074 
1075 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
1076 		ctx_entry->hi, ctx_entry->lo);
1077 
1078 	/* legacy mode does not require PASID entries */
1079 	if (!sm_supported(iommu))
1080 		goto pgtable_walk;
1081 
1082 	/* get the pointer to pasid directory entry */
1083 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
1084 	if (!dir) {
1085 		pr_info("pasid directory entry is not present\n");
1086 		return;
1087 	}
1088 	/* For request-without-pasid, get the pasid from context entry */
1089 	if (intel_iommu_sm && pasid == INVALID_IOASID)
1090 		pasid = PASID_RID2PASID;
1091 
1092 	dir_index = pasid >> PASID_PDE_SHIFT;
1093 	pde = &dir[dir_index];
1094 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
1095 
1096 	/* get the pointer to the pasid table entry */
1097 	entries = get_pasid_table_from_pde(pde);
1098 	if (!entries) {
1099 		pr_info("pasid table entry is not present\n");
1100 		return;
1101 	}
1102 	index = pasid & PASID_PTE_MASK;
1103 	pte = &entries[index];
1104 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1105 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1106 
1107 pgtable_walk:
1108 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1109 }
1110 #endif
1111 
1112 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1113 				      unsigned long pfn, int *target_level)
1114 {
1115 	struct dma_pte *parent, *pte;
1116 	int level = agaw_to_level(domain->agaw);
1117 	int offset;
1118 
1119 	BUG_ON(!domain->pgd);
1120 
1121 	if (!domain_pfn_supported(domain, pfn))
1122 		/* Address beyond IOMMU's addressing capabilities. */
1123 		return NULL;
1124 
1125 	parent = domain->pgd;
1126 
1127 	while (1) {
1128 		void *tmp_page;
1129 
1130 		offset = pfn_level_offset(pfn, level);
1131 		pte = &parent[offset];
1132 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1133 			break;
1134 		if (level == *target_level)
1135 			break;
1136 
1137 		if (!dma_pte_present(pte)) {
1138 			uint64_t pteval;
1139 
1140 			tmp_page = alloc_pgtable_page(domain->nid);
1141 
1142 			if (!tmp_page)
1143 				return NULL;
1144 
1145 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1146 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1147 			if (domain_use_first_level(domain)) {
1148 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1149 				if (iommu_is_dma_domain(&domain->domain))
1150 					pteval |= DMA_FL_PTE_ACCESS;
1151 			}
1152 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1153 				/* Someone else set it while we were thinking; use theirs. */
1154 				free_pgtable_page(tmp_page);
1155 			else
1156 				domain_flush_cache(domain, pte, sizeof(*pte));
1157 		}
1158 		if (level == 1)
1159 			break;
1160 
1161 		parent = phys_to_virt(dma_pte_addr(pte));
1162 		level--;
1163 	}
1164 
1165 	if (!*target_level)
1166 		*target_level = level;
1167 
1168 	return pte;
1169 }
1170 
1171 /* return address's pte at specific level */
1172 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1173 					 unsigned long pfn,
1174 					 int level, int *large_page)
1175 {
1176 	struct dma_pte *parent, *pte;
1177 	int total = agaw_to_level(domain->agaw);
1178 	int offset;
1179 
1180 	parent = domain->pgd;
1181 	while (level <= total) {
1182 		offset = pfn_level_offset(pfn, total);
1183 		pte = &parent[offset];
1184 		if (level == total)
1185 			return pte;
1186 
1187 		if (!dma_pte_present(pte)) {
1188 			*large_page = total;
1189 			break;
1190 		}
1191 
1192 		if (dma_pte_superpage(pte)) {
1193 			*large_page = total;
1194 			return pte;
1195 		}
1196 
1197 		parent = phys_to_virt(dma_pte_addr(pte));
1198 		total--;
1199 	}
1200 	return NULL;
1201 }
1202 
1203 /* clear last level pte, a tlb flush should be followed */
1204 static void dma_pte_clear_range(struct dmar_domain *domain,
1205 				unsigned long start_pfn,
1206 				unsigned long last_pfn)
1207 {
1208 	unsigned int large_page;
1209 	struct dma_pte *first_pte, *pte;
1210 
1211 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1212 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1213 	BUG_ON(start_pfn > last_pfn);
1214 
1215 	/* we don't need lock here; nobody else touches the iova range */
1216 	do {
1217 		large_page = 1;
1218 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1219 		if (!pte) {
1220 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1221 			continue;
1222 		}
1223 		do {
1224 			dma_clear_pte(pte);
1225 			start_pfn += lvl_to_nr_pages(large_page);
1226 			pte++;
1227 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1228 
1229 		domain_flush_cache(domain, first_pte,
1230 				   (void *)pte - (void *)first_pte);
1231 
1232 	} while (start_pfn && start_pfn <= last_pfn);
1233 }
1234 
1235 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1236 			       int retain_level, struct dma_pte *pte,
1237 			       unsigned long pfn, unsigned long start_pfn,
1238 			       unsigned long last_pfn)
1239 {
1240 	pfn = max(start_pfn, pfn);
1241 	pte = &pte[pfn_level_offset(pfn, level)];
1242 
1243 	do {
1244 		unsigned long level_pfn;
1245 		struct dma_pte *level_pte;
1246 
1247 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1248 			goto next;
1249 
1250 		level_pfn = pfn & level_mask(level);
1251 		level_pte = phys_to_virt(dma_pte_addr(pte));
1252 
1253 		if (level > 2) {
1254 			dma_pte_free_level(domain, level - 1, retain_level,
1255 					   level_pte, level_pfn, start_pfn,
1256 					   last_pfn);
1257 		}
1258 
1259 		/*
1260 		 * Free the page table if we're below the level we want to
1261 		 * retain and the range covers the entire table.
1262 		 */
1263 		if (level < retain_level && !(start_pfn > level_pfn ||
1264 		      last_pfn < level_pfn + level_size(level) - 1)) {
1265 			dma_clear_pte(pte);
1266 			domain_flush_cache(domain, pte, sizeof(*pte));
1267 			free_pgtable_page(level_pte);
1268 		}
1269 next:
1270 		pfn += level_size(level);
1271 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1272 }
1273 
1274 /*
1275  * clear last level (leaf) ptes and free page table pages below the
1276  * level we wish to keep intact.
1277  */
1278 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1279 				   unsigned long start_pfn,
1280 				   unsigned long last_pfn,
1281 				   int retain_level)
1282 {
1283 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1284 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1285 	BUG_ON(start_pfn > last_pfn);
1286 
1287 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1288 
1289 	/* We don't need lock here; nobody else touches the iova range */
1290 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1291 			   domain->pgd, 0, start_pfn, last_pfn);
1292 
1293 	/* free pgd */
1294 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1295 		free_pgtable_page(domain->pgd);
1296 		domain->pgd = NULL;
1297 	}
1298 }
1299 
1300 /* When a page at a given level is being unlinked from its parent, we don't
1301    need to *modify* it at all. All we need to do is make a list of all the
1302    pages which can be freed just as soon as we've flushed the IOTLB and we
1303    know the hardware page-walk will no longer touch them.
1304    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1305    be freed. */
1306 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1307 					    int level, struct dma_pte *pte,
1308 					    struct page *freelist)
1309 {
1310 	struct page *pg;
1311 
1312 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1313 	pg->freelist = freelist;
1314 	freelist = pg;
1315 
1316 	if (level == 1)
1317 		return freelist;
1318 
1319 	pte = page_address(pg);
1320 	do {
1321 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1322 			freelist = dma_pte_list_pagetables(domain, level - 1,
1323 							   pte, freelist);
1324 		pte++;
1325 	} while (!first_pte_in_page(pte));
1326 
1327 	return freelist;
1328 }
1329 
1330 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1331 					struct dma_pte *pte, unsigned long pfn,
1332 					unsigned long start_pfn,
1333 					unsigned long last_pfn,
1334 					struct page *freelist)
1335 {
1336 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1337 
1338 	pfn = max(start_pfn, pfn);
1339 	pte = &pte[pfn_level_offset(pfn, level)];
1340 
1341 	do {
1342 		unsigned long level_pfn;
1343 
1344 		if (!dma_pte_present(pte))
1345 			goto next;
1346 
1347 		level_pfn = pfn & level_mask(level);
1348 
1349 		/* If range covers entire pagetable, free it */
1350 		if (start_pfn <= level_pfn &&
1351 		    last_pfn >= level_pfn + level_size(level) - 1) {
1352 			/* These suborbinate page tables are going away entirely. Don't
1353 			   bother to clear them; we're just going to *free* them. */
1354 			if (level > 1 && !dma_pte_superpage(pte))
1355 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1356 
1357 			dma_clear_pte(pte);
1358 			if (!first_pte)
1359 				first_pte = pte;
1360 			last_pte = pte;
1361 		} else if (level > 1) {
1362 			/* Recurse down into a level that isn't *entirely* obsolete */
1363 			freelist = dma_pte_clear_level(domain, level - 1,
1364 						       phys_to_virt(dma_pte_addr(pte)),
1365 						       level_pfn, start_pfn, last_pfn,
1366 						       freelist);
1367 		}
1368 next:
1369 		pfn += level_size(level);
1370 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1371 
1372 	if (first_pte)
1373 		domain_flush_cache(domain, first_pte,
1374 				   (void *)++last_pte - (void *)first_pte);
1375 
1376 	return freelist;
1377 }
1378 
1379 /* We can't just free the pages because the IOMMU may still be walking
1380    the page tables, and may have cached the intermediate levels. The
1381    pages can only be freed after the IOTLB flush has been done. */
1382 static struct page *domain_unmap(struct dmar_domain *domain,
1383 				 unsigned long start_pfn,
1384 				 unsigned long last_pfn,
1385 				 struct page *freelist)
1386 {
1387 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1388 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1389 	BUG_ON(start_pfn > last_pfn);
1390 
1391 	/* we don't need lock here; nobody else touches the iova range */
1392 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1393 				       domain->pgd, 0, start_pfn, last_pfn,
1394 				       freelist);
1395 
1396 	/* free pgd */
1397 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1398 		struct page *pgd_page = virt_to_page(domain->pgd);
1399 		pgd_page->freelist = freelist;
1400 		freelist = pgd_page;
1401 
1402 		domain->pgd = NULL;
1403 	}
1404 
1405 	return freelist;
1406 }
1407 
1408 static void dma_free_pagelist(struct page *freelist)
1409 {
1410 	struct page *pg;
1411 
1412 	while ((pg = freelist)) {
1413 		freelist = pg->freelist;
1414 		free_pgtable_page(page_address(pg));
1415 	}
1416 }
1417 
1418 /* iommu handling */
1419 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1420 {
1421 	struct root_entry *root;
1422 	unsigned long flags;
1423 
1424 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1425 	if (!root) {
1426 		pr_err("Allocating root entry for %s failed\n",
1427 			iommu->name);
1428 		return -ENOMEM;
1429 	}
1430 
1431 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1432 
1433 	spin_lock_irqsave(&iommu->lock, flags);
1434 	iommu->root_entry = root;
1435 	spin_unlock_irqrestore(&iommu->lock, flags);
1436 
1437 	return 0;
1438 }
1439 
1440 static void iommu_set_root_entry(struct intel_iommu *iommu)
1441 {
1442 	u64 addr;
1443 	u32 sts;
1444 	unsigned long flag;
1445 
1446 	addr = virt_to_phys(iommu->root_entry);
1447 	if (sm_supported(iommu))
1448 		addr |= DMA_RTADDR_SMT;
1449 
1450 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1451 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1452 
1453 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1454 
1455 	/* Make sure hardware complete it */
1456 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1457 		      readl, (sts & DMA_GSTS_RTPS), sts);
1458 
1459 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1460 
1461 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1462 	if (sm_supported(iommu))
1463 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1464 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1465 }
1466 
1467 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1468 {
1469 	u32 val;
1470 	unsigned long flag;
1471 
1472 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1473 		return;
1474 
1475 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1476 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1477 
1478 	/* Make sure hardware complete it */
1479 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1480 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1481 
1482 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1483 }
1484 
1485 /* return value determine if we need a write buffer flush */
1486 static void __iommu_flush_context(struct intel_iommu *iommu,
1487 				  u16 did, u16 source_id, u8 function_mask,
1488 				  u64 type)
1489 {
1490 	u64 val = 0;
1491 	unsigned long flag;
1492 
1493 	switch (type) {
1494 	case DMA_CCMD_GLOBAL_INVL:
1495 		val = DMA_CCMD_GLOBAL_INVL;
1496 		break;
1497 	case DMA_CCMD_DOMAIN_INVL:
1498 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1499 		break;
1500 	case DMA_CCMD_DEVICE_INVL:
1501 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1502 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1503 		break;
1504 	default:
1505 		BUG();
1506 	}
1507 	val |= DMA_CCMD_ICC;
1508 
1509 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1510 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1511 
1512 	/* Make sure hardware complete it */
1513 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1514 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1515 
1516 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1517 }
1518 
1519 /* return value determine if we need a write buffer flush */
1520 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1521 				u64 addr, unsigned int size_order, u64 type)
1522 {
1523 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1524 	u64 val = 0, val_iva = 0;
1525 	unsigned long flag;
1526 
1527 	switch (type) {
1528 	case DMA_TLB_GLOBAL_FLUSH:
1529 		/* global flush doesn't need set IVA_REG */
1530 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1531 		break;
1532 	case DMA_TLB_DSI_FLUSH:
1533 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1534 		break;
1535 	case DMA_TLB_PSI_FLUSH:
1536 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1537 		/* IH bit is passed in as part of address */
1538 		val_iva = size_order | addr;
1539 		break;
1540 	default:
1541 		BUG();
1542 	}
1543 	/* Note: set drain read/write */
1544 #if 0
1545 	/*
1546 	 * This is probably to be super secure.. Looks like we can
1547 	 * ignore it without any impact.
1548 	 */
1549 	if (cap_read_drain(iommu->cap))
1550 		val |= DMA_TLB_READ_DRAIN;
1551 #endif
1552 	if (cap_write_drain(iommu->cap))
1553 		val |= DMA_TLB_WRITE_DRAIN;
1554 
1555 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1556 	/* Note: Only uses first TLB reg currently */
1557 	if (val_iva)
1558 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1559 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1560 
1561 	/* Make sure hardware complete it */
1562 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1563 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1564 
1565 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1566 
1567 	/* check IOTLB invalidation granularity */
1568 	if (DMA_TLB_IAIG(val) == 0)
1569 		pr_err("Flush IOTLB failed\n");
1570 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1571 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1572 			(unsigned long long)DMA_TLB_IIRG(type),
1573 			(unsigned long long)DMA_TLB_IAIG(val));
1574 }
1575 
1576 static struct device_domain_info *
1577 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1578 			 u8 bus, u8 devfn)
1579 {
1580 	struct device_domain_info *info;
1581 
1582 	assert_spin_locked(&device_domain_lock);
1583 
1584 	if (!iommu->qi)
1585 		return NULL;
1586 
1587 	list_for_each_entry(info, &domain->devices, link)
1588 		if (info->iommu == iommu && info->bus == bus &&
1589 		    info->devfn == devfn) {
1590 			if (info->ats_supported && info->dev)
1591 				return info;
1592 			break;
1593 		}
1594 
1595 	return NULL;
1596 }
1597 
1598 static void domain_update_iotlb(struct dmar_domain *domain)
1599 {
1600 	struct device_domain_info *info;
1601 	bool has_iotlb_device = false;
1602 
1603 	assert_spin_locked(&device_domain_lock);
1604 
1605 	list_for_each_entry(info, &domain->devices, link)
1606 		if (info->ats_enabled) {
1607 			has_iotlb_device = true;
1608 			break;
1609 		}
1610 
1611 	if (!has_iotlb_device) {
1612 		struct subdev_domain_info *sinfo;
1613 
1614 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1615 			info = get_domain_info(sinfo->pdev);
1616 			if (info && info->ats_enabled) {
1617 				has_iotlb_device = true;
1618 				break;
1619 			}
1620 		}
1621 	}
1622 
1623 	domain->has_iotlb_device = has_iotlb_device;
1624 }
1625 
1626 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1627 {
1628 	struct pci_dev *pdev;
1629 
1630 	assert_spin_locked(&device_domain_lock);
1631 
1632 	if (!info || !dev_is_pci(info->dev))
1633 		return;
1634 
1635 	pdev = to_pci_dev(info->dev);
1636 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1637 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1638 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1639 	 * reserved, which should be set to 0.
1640 	 */
1641 	if (!ecap_dit(info->iommu->ecap))
1642 		info->pfsid = 0;
1643 	else {
1644 		struct pci_dev *pf_pdev;
1645 
1646 		/* pdev will be returned if device is not a vf */
1647 		pf_pdev = pci_physfn(pdev);
1648 		info->pfsid = pci_dev_id(pf_pdev);
1649 	}
1650 
1651 #ifdef CONFIG_INTEL_IOMMU_SVM
1652 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1653 	   the device if you enable PASID support after ATS support is
1654 	   undefined. So always enable PASID support on devices which
1655 	   have it, even if we can't yet know if we're ever going to
1656 	   use it. */
1657 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1658 		info->pasid_enabled = 1;
1659 
1660 	if (info->pri_supported &&
1661 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1662 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1663 		info->pri_enabled = 1;
1664 #endif
1665 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1666 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1667 		info->ats_enabled = 1;
1668 		domain_update_iotlb(info->domain);
1669 		info->ats_qdep = pci_ats_queue_depth(pdev);
1670 	}
1671 }
1672 
1673 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1674 {
1675 	struct pci_dev *pdev;
1676 
1677 	assert_spin_locked(&device_domain_lock);
1678 
1679 	if (!dev_is_pci(info->dev))
1680 		return;
1681 
1682 	pdev = to_pci_dev(info->dev);
1683 
1684 	if (info->ats_enabled) {
1685 		pci_disable_ats(pdev);
1686 		info->ats_enabled = 0;
1687 		domain_update_iotlb(info->domain);
1688 	}
1689 #ifdef CONFIG_INTEL_IOMMU_SVM
1690 	if (info->pri_enabled) {
1691 		pci_disable_pri(pdev);
1692 		info->pri_enabled = 0;
1693 	}
1694 	if (info->pasid_enabled) {
1695 		pci_disable_pasid(pdev);
1696 		info->pasid_enabled = 0;
1697 	}
1698 #endif
1699 }
1700 
1701 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1702 				    u64 addr, unsigned int mask)
1703 {
1704 	u16 sid, qdep;
1705 
1706 	if (!info || !info->ats_enabled)
1707 		return;
1708 
1709 	sid = info->bus << 8 | info->devfn;
1710 	qdep = info->ats_qdep;
1711 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1712 			   qdep, addr, mask);
1713 }
1714 
1715 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1716 				  u64 addr, unsigned mask)
1717 {
1718 	unsigned long flags;
1719 	struct device_domain_info *info;
1720 	struct subdev_domain_info *sinfo;
1721 
1722 	if (!domain->has_iotlb_device)
1723 		return;
1724 
1725 	spin_lock_irqsave(&device_domain_lock, flags);
1726 	list_for_each_entry(info, &domain->devices, link)
1727 		__iommu_flush_dev_iotlb(info, addr, mask);
1728 
1729 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1730 		info = get_domain_info(sinfo->pdev);
1731 		__iommu_flush_dev_iotlb(info, addr, mask);
1732 	}
1733 	spin_unlock_irqrestore(&device_domain_lock, flags);
1734 }
1735 
1736 static void domain_flush_piotlb(struct intel_iommu *iommu,
1737 				struct dmar_domain *domain,
1738 				u64 addr, unsigned long npages, bool ih)
1739 {
1740 	u16 did = domain->iommu_did[iommu->seq_id];
1741 
1742 	if (domain->default_pasid)
1743 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1744 				addr, npages, ih);
1745 
1746 	if (!list_empty(&domain->devices))
1747 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1748 }
1749 
1750 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1751 				  struct dmar_domain *domain,
1752 				  unsigned long pfn, unsigned int pages,
1753 				  int ih, int map)
1754 {
1755 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1756 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1757 	u16 did = domain->iommu_did[iommu->seq_id];
1758 
1759 	BUG_ON(pages == 0);
1760 
1761 	if (ih)
1762 		ih = 1 << 6;
1763 
1764 	if (domain_use_first_level(domain)) {
1765 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1766 	} else {
1767 		/*
1768 		 * Fallback to domain selective flush if no PSI support or
1769 		 * the size is too big. PSI requires page size to be 2 ^ x,
1770 		 * and the base address is naturally aligned to the size.
1771 		 */
1772 		if (!cap_pgsel_inv(iommu->cap) ||
1773 		    mask > cap_max_amask_val(iommu->cap))
1774 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1775 							DMA_TLB_DSI_FLUSH);
1776 		else
1777 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1778 							DMA_TLB_PSI_FLUSH);
1779 	}
1780 
1781 	/*
1782 	 * In caching mode, changes of pages from non-present to present require
1783 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1784 	 */
1785 	if (!cap_caching_mode(iommu->cap) || !map)
1786 		iommu_flush_dev_iotlb(domain, addr, mask);
1787 }
1788 
1789 /* Notification for newly created mappings */
1790 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1791 					struct dmar_domain *domain,
1792 					unsigned long pfn, unsigned int pages)
1793 {
1794 	/*
1795 	 * It's a non-present to present mapping. Only flush if caching mode
1796 	 * and second level.
1797 	 */
1798 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1799 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1800 	else
1801 		iommu_flush_write_buffer(iommu);
1802 }
1803 
1804 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1805 {
1806 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1807 	int idx;
1808 
1809 	for_each_domain_iommu(idx, dmar_domain) {
1810 		struct intel_iommu *iommu = g_iommus[idx];
1811 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1812 
1813 		if (domain_use_first_level(dmar_domain))
1814 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1815 		else
1816 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1817 						 DMA_TLB_DSI_FLUSH);
1818 
1819 		if (!cap_caching_mode(iommu->cap))
1820 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1821 					      0, MAX_AGAW_PFN_WIDTH);
1822 	}
1823 }
1824 
1825 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1826 {
1827 	u32 pmen;
1828 	unsigned long flags;
1829 
1830 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1831 		return;
1832 
1833 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1834 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1835 	pmen &= ~DMA_PMEN_EPM;
1836 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1837 
1838 	/* wait for the protected region status bit to clear */
1839 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1840 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1841 
1842 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1843 }
1844 
1845 static void iommu_enable_translation(struct intel_iommu *iommu)
1846 {
1847 	u32 sts;
1848 	unsigned long flags;
1849 
1850 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1851 	iommu->gcmd |= DMA_GCMD_TE;
1852 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1853 
1854 	/* Make sure hardware complete it */
1855 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1856 		      readl, (sts & DMA_GSTS_TES), sts);
1857 
1858 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1859 }
1860 
1861 static void iommu_disable_translation(struct intel_iommu *iommu)
1862 {
1863 	u32 sts;
1864 	unsigned long flag;
1865 
1866 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1867 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1868 		return;
1869 
1870 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1871 	iommu->gcmd &= ~DMA_GCMD_TE;
1872 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1873 
1874 	/* Make sure hardware complete it */
1875 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1876 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1877 
1878 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1879 }
1880 
1881 static int iommu_init_domains(struct intel_iommu *iommu)
1882 {
1883 	u32 ndomains, nlongs;
1884 	size_t size;
1885 
1886 	ndomains = cap_ndoms(iommu->cap);
1887 	pr_debug("%s: Number of Domains supported <%d>\n",
1888 		 iommu->name, ndomains);
1889 	nlongs = BITS_TO_LONGS(ndomains);
1890 
1891 	spin_lock_init(&iommu->lock);
1892 
1893 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1894 	if (!iommu->domain_ids)
1895 		return -ENOMEM;
1896 
1897 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1898 	iommu->domains = kzalloc(size, GFP_KERNEL);
1899 
1900 	if (iommu->domains) {
1901 		size = 256 * sizeof(struct dmar_domain *);
1902 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1903 	}
1904 
1905 	if (!iommu->domains || !iommu->domains[0]) {
1906 		pr_err("%s: Allocating domain array failed\n",
1907 		       iommu->name);
1908 		kfree(iommu->domain_ids);
1909 		kfree(iommu->domains);
1910 		iommu->domain_ids = NULL;
1911 		iommu->domains    = NULL;
1912 		return -ENOMEM;
1913 	}
1914 
1915 	/*
1916 	 * If Caching mode is set, then invalid translations are tagged
1917 	 * with domain-id 0, hence we need to pre-allocate it. We also
1918 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1919 	 * make sure it is not used for a real domain.
1920 	 */
1921 	set_bit(0, iommu->domain_ids);
1922 
1923 	/*
1924 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1925 	 * entry for first-level or pass-through translation modes should
1926 	 * be programmed with a domain id different from those used for
1927 	 * second-level or nested translation. We reserve a domain id for
1928 	 * this purpose.
1929 	 */
1930 	if (sm_supported(iommu))
1931 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1932 
1933 	return 0;
1934 }
1935 
1936 static void disable_dmar_iommu(struct intel_iommu *iommu)
1937 {
1938 	struct device_domain_info *info, *tmp;
1939 	unsigned long flags;
1940 
1941 	if (!iommu->domains || !iommu->domain_ids)
1942 		return;
1943 
1944 	spin_lock_irqsave(&device_domain_lock, flags);
1945 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1946 		if (info->iommu != iommu)
1947 			continue;
1948 
1949 		if (!info->dev || !info->domain)
1950 			continue;
1951 
1952 		__dmar_remove_one_dev_info(info);
1953 	}
1954 	spin_unlock_irqrestore(&device_domain_lock, flags);
1955 
1956 	if (iommu->gcmd & DMA_GCMD_TE)
1957 		iommu_disable_translation(iommu);
1958 }
1959 
1960 static void free_dmar_iommu(struct intel_iommu *iommu)
1961 {
1962 	if ((iommu->domains) && (iommu->domain_ids)) {
1963 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1964 		int i;
1965 
1966 		for (i = 0; i < elems; i++)
1967 			kfree(iommu->domains[i]);
1968 		kfree(iommu->domains);
1969 		kfree(iommu->domain_ids);
1970 		iommu->domains = NULL;
1971 		iommu->domain_ids = NULL;
1972 	}
1973 
1974 	g_iommus[iommu->seq_id] = NULL;
1975 
1976 	/* free context mapping */
1977 	free_context_table(iommu);
1978 
1979 #ifdef CONFIG_INTEL_IOMMU_SVM
1980 	if (pasid_supported(iommu)) {
1981 		if (ecap_prs(iommu->ecap))
1982 			intel_svm_finish_prq(iommu);
1983 	}
1984 	if (vccap_pasid(iommu->vccap))
1985 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1986 
1987 #endif
1988 }
1989 
1990 /*
1991  * Check and return whether first level is used by default for
1992  * DMA translation.
1993  */
1994 static bool first_level_by_default(unsigned int type)
1995 {
1996 	/* Only SL is available in legacy mode */
1997 	if (!scalable_mode_support())
1998 		return false;
1999 
2000 	/* Only level (either FL or SL) is available, just use it */
2001 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
2002 		return intel_cap_flts_sanity();
2003 
2004 	/* Both levels are available, decide it based on domain type */
2005 	return type != IOMMU_DOMAIN_UNMANAGED;
2006 }
2007 
2008 static struct dmar_domain *alloc_domain(unsigned int type)
2009 {
2010 	struct dmar_domain *domain;
2011 
2012 	domain = alloc_domain_mem();
2013 	if (!domain)
2014 		return NULL;
2015 
2016 	memset(domain, 0, sizeof(*domain));
2017 	domain->nid = NUMA_NO_NODE;
2018 	if (first_level_by_default(type))
2019 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
2020 	domain->has_iotlb_device = false;
2021 	INIT_LIST_HEAD(&domain->devices);
2022 	INIT_LIST_HEAD(&domain->subdevices);
2023 
2024 	return domain;
2025 }
2026 
2027 /* Must be called with iommu->lock */
2028 static int domain_attach_iommu(struct dmar_domain *domain,
2029 			       struct intel_iommu *iommu)
2030 {
2031 	unsigned long ndomains;
2032 	int num;
2033 
2034 	assert_spin_locked(&device_domain_lock);
2035 	assert_spin_locked(&iommu->lock);
2036 
2037 	domain->iommu_refcnt[iommu->seq_id] += 1;
2038 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
2039 		ndomains = cap_ndoms(iommu->cap);
2040 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
2041 
2042 		if (num >= ndomains) {
2043 			pr_err("%s: No free domain ids\n", iommu->name);
2044 			domain->iommu_refcnt[iommu->seq_id] -= 1;
2045 			return -ENOSPC;
2046 		}
2047 
2048 		set_bit(num, iommu->domain_ids);
2049 		set_iommu_domain(iommu, num, domain);
2050 
2051 		domain->iommu_did[iommu->seq_id] = num;
2052 		domain->nid			 = iommu->node;
2053 
2054 		domain_update_iommu_cap(domain);
2055 	}
2056 
2057 	return 0;
2058 }
2059 
2060 static void domain_detach_iommu(struct dmar_domain *domain,
2061 				struct intel_iommu *iommu)
2062 {
2063 	int num;
2064 
2065 	assert_spin_locked(&device_domain_lock);
2066 	assert_spin_locked(&iommu->lock);
2067 
2068 	domain->iommu_refcnt[iommu->seq_id] -= 1;
2069 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2070 		num = domain->iommu_did[iommu->seq_id];
2071 		clear_bit(num, iommu->domain_ids);
2072 		set_iommu_domain(iommu, num, NULL);
2073 
2074 		domain_update_iommu_cap(domain);
2075 		domain->iommu_did[iommu->seq_id] = 0;
2076 	}
2077 }
2078 
2079 static inline int guestwidth_to_adjustwidth(int gaw)
2080 {
2081 	int agaw;
2082 	int r = (gaw - 12) % 9;
2083 
2084 	if (r == 0)
2085 		agaw = gaw;
2086 	else
2087 		agaw = gaw + 9 - r;
2088 	if (agaw > 64)
2089 		agaw = 64;
2090 	return agaw;
2091 }
2092 
2093 static void domain_exit(struct dmar_domain *domain)
2094 {
2095 
2096 	/* Remove associated devices and clear attached or cached domains */
2097 	domain_remove_dev_info(domain);
2098 
2099 	if (domain->pgd) {
2100 		struct page *freelist;
2101 
2102 		freelist = domain_unmap(domain, 0,
2103 					DOMAIN_MAX_PFN(domain->gaw), NULL);
2104 		dma_free_pagelist(freelist);
2105 	}
2106 
2107 	free_domain_mem(domain);
2108 }
2109 
2110 /*
2111  * Get the PASID directory size for scalable mode context entry.
2112  * Value of X in the PDTS field of a scalable mode context entry
2113  * indicates PASID directory with 2^(X + 7) entries.
2114  */
2115 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2116 {
2117 	int pds, max_pde;
2118 
2119 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2120 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2121 	if (pds < 7)
2122 		return 0;
2123 
2124 	return pds - 7;
2125 }
2126 
2127 /*
2128  * Set the RID_PASID field of a scalable mode context entry. The
2129  * IOMMU hardware will use the PASID value set in this field for
2130  * DMA translations of DMA requests without PASID.
2131  */
2132 static inline void
2133 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2134 {
2135 	context->hi |= pasid & ((1 << 20) - 1);
2136 }
2137 
2138 /*
2139  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2140  * entry.
2141  */
2142 static inline void context_set_sm_dte(struct context_entry *context)
2143 {
2144 	context->lo |= (1 << 2);
2145 }
2146 
2147 /*
2148  * Set the PRE(Page Request Enable) field of a scalable mode context
2149  * entry.
2150  */
2151 static inline void context_set_sm_pre(struct context_entry *context)
2152 {
2153 	context->lo |= (1 << 4);
2154 }
2155 
2156 /* Convert value to context PASID directory size field coding. */
2157 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2158 
2159 static int domain_context_mapping_one(struct dmar_domain *domain,
2160 				      struct intel_iommu *iommu,
2161 				      struct pasid_table *table,
2162 				      u8 bus, u8 devfn)
2163 {
2164 	u16 did = domain->iommu_did[iommu->seq_id];
2165 	int translation = CONTEXT_TT_MULTI_LEVEL;
2166 	struct device_domain_info *info = NULL;
2167 	struct context_entry *context;
2168 	unsigned long flags;
2169 	int ret;
2170 
2171 	WARN_ON(did == 0);
2172 
2173 	if (hw_pass_through && domain_type_is_si(domain))
2174 		translation = CONTEXT_TT_PASS_THROUGH;
2175 
2176 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2177 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2178 
2179 	BUG_ON(!domain->pgd);
2180 
2181 	spin_lock_irqsave(&device_domain_lock, flags);
2182 	spin_lock(&iommu->lock);
2183 
2184 	ret = -ENOMEM;
2185 	context = iommu_context_addr(iommu, bus, devfn, 1);
2186 	if (!context)
2187 		goto out_unlock;
2188 
2189 	ret = 0;
2190 	if (context_present(context))
2191 		goto out_unlock;
2192 
2193 	/*
2194 	 * For kdump cases, old valid entries may be cached due to the
2195 	 * in-flight DMA and copied pgtable, but there is no unmapping
2196 	 * behaviour for them, thus we need an explicit cache flush for
2197 	 * the newly-mapped device. For kdump, at this point, the device
2198 	 * is supposed to finish reset at its driver probe stage, so no
2199 	 * in-flight DMA will exist, and we don't need to worry anymore
2200 	 * hereafter.
2201 	 */
2202 	if (context_copied(context)) {
2203 		u16 did_old = context_domain_id(context);
2204 
2205 		if (did_old < cap_ndoms(iommu->cap)) {
2206 			iommu->flush.flush_context(iommu, did_old,
2207 						   (((u16)bus) << 8) | devfn,
2208 						   DMA_CCMD_MASK_NOBIT,
2209 						   DMA_CCMD_DEVICE_INVL);
2210 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2211 						 DMA_TLB_DSI_FLUSH);
2212 		}
2213 	}
2214 
2215 	context_clear_entry(context);
2216 
2217 	if (sm_supported(iommu)) {
2218 		unsigned long pds;
2219 
2220 		WARN_ON(!table);
2221 
2222 		/* Setup the PASID DIR pointer: */
2223 		pds = context_get_sm_pds(table);
2224 		context->lo = (u64)virt_to_phys(table->table) |
2225 				context_pdts(pds);
2226 
2227 		/* Setup the RID_PASID field: */
2228 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2229 
2230 		/*
2231 		 * Setup the Device-TLB enable bit and Page request
2232 		 * Enable bit:
2233 		 */
2234 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2235 		if (info && info->ats_supported)
2236 			context_set_sm_dte(context);
2237 		if (info && info->pri_supported)
2238 			context_set_sm_pre(context);
2239 	} else {
2240 		struct dma_pte *pgd = domain->pgd;
2241 		int agaw;
2242 
2243 		context_set_domain_id(context, did);
2244 
2245 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2246 			/*
2247 			 * Skip top levels of page tables for iommu which has
2248 			 * less agaw than default. Unnecessary for PT mode.
2249 			 */
2250 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2251 				ret = -ENOMEM;
2252 				pgd = phys_to_virt(dma_pte_addr(pgd));
2253 				if (!dma_pte_present(pgd))
2254 					goto out_unlock;
2255 			}
2256 
2257 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2258 			if (info && info->ats_supported)
2259 				translation = CONTEXT_TT_DEV_IOTLB;
2260 			else
2261 				translation = CONTEXT_TT_MULTI_LEVEL;
2262 
2263 			context_set_address_root(context, virt_to_phys(pgd));
2264 			context_set_address_width(context, agaw);
2265 		} else {
2266 			/*
2267 			 * In pass through mode, AW must be programmed to
2268 			 * indicate the largest AGAW value supported by
2269 			 * hardware. And ASR is ignored by hardware.
2270 			 */
2271 			context_set_address_width(context, iommu->msagaw);
2272 		}
2273 
2274 		context_set_translation_type(context, translation);
2275 	}
2276 
2277 	context_set_fault_enable(context);
2278 	context_set_present(context);
2279 	if (!ecap_coherent(iommu->ecap))
2280 		clflush_cache_range(context, sizeof(*context));
2281 
2282 	/*
2283 	 * It's a non-present to present mapping. If hardware doesn't cache
2284 	 * non-present entry we only need to flush the write-buffer. If the
2285 	 * _does_ cache non-present entries, then it does so in the special
2286 	 * domain #0, which we have to flush:
2287 	 */
2288 	if (cap_caching_mode(iommu->cap)) {
2289 		iommu->flush.flush_context(iommu, 0,
2290 					   (((u16)bus) << 8) | devfn,
2291 					   DMA_CCMD_MASK_NOBIT,
2292 					   DMA_CCMD_DEVICE_INVL);
2293 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2294 	} else {
2295 		iommu_flush_write_buffer(iommu);
2296 	}
2297 	iommu_enable_dev_iotlb(info);
2298 
2299 	ret = 0;
2300 
2301 out_unlock:
2302 	spin_unlock(&iommu->lock);
2303 	spin_unlock_irqrestore(&device_domain_lock, flags);
2304 
2305 	return ret;
2306 }
2307 
2308 struct domain_context_mapping_data {
2309 	struct dmar_domain *domain;
2310 	struct intel_iommu *iommu;
2311 	struct pasid_table *table;
2312 };
2313 
2314 static int domain_context_mapping_cb(struct pci_dev *pdev,
2315 				     u16 alias, void *opaque)
2316 {
2317 	struct domain_context_mapping_data *data = opaque;
2318 
2319 	return domain_context_mapping_one(data->domain, data->iommu,
2320 					  data->table, PCI_BUS_NUM(alias),
2321 					  alias & 0xff);
2322 }
2323 
2324 static int
2325 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2326 {
2327 	struct domain_context_mapping_data data;
2328 	struct pasid_table *table;
2329 	struct intel_iommu *iommu;
2330 	u8 bus, devfn;
2331 
2332 	iommu = device_to_iommu(dev, &bus, &devfn);
2333 	if (!iommu)
2334 		return -ENODEV;
2335 
2336 	table = intel_pasid_get_table(dev);
2337 
2338 	if (!dev_is_pci(dev))
2339 		return domain_context_mapping_one(domain, iommu, table,
2340 						  bus, devfn);
2341 
2342 	data.domain = domain;
2343 	data.iommu = iommu;
2344 	data.table = table;
2345 
2346 	return pci_for_each_dma_alias(to_pci_dev(dev),
2347 				      &domain_context_mapping_cb, &data);
2348 }
2349 
2350 static int domain_context_mapped_cb(struct pci_dev *pdev,
2351 				    u16 alias, void *opaque)
2352 {
2353 	struct intel_iommu *iommu = opaque;
2354 
2355 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2356 }
2357 
2358 static int domain_context_mapped(struct device *dev)
2359 {
2360 	struct intel_iommu *iommu;
2361 	u8 bus, devfn;
2362 
2363 	iommu = device_to_iommu(dev, &bus, &devfn);
2364 	if (!iommu)
2365 		return -ENODEV;
2366 
2367 	if (!dev_is_pci(dev))
2368 		return device_context_mapped(iommu, bus, devfn);
2369 
2370 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2371 				       domain_context_mapped_cb, iommu);
2372 }
2373 
2374 /* Returns a number of VTD pages, but aligned to MM page size */
2375 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2376 					    size_t size)
2377 {
2378 	host_addr &= ~PAGE_MASK;
2379 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2380 }
2381 
2382 /* Return largest possible superpage level for a given mapping */
2383 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2384 					  unsigned long iov_pfn,
2385 					  unsigned long phy_pfn,
2386 					  unsigned long pages)
2387 {
2388 	int support, level = 1;
2389 	unsigned long pfnmerge;
2390 
2391 	support = domain->iommu_superpage;
2392 
2393 	/* To use a large page, the virtual *and* physical addresses
2394 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2395 	   of them will mean we have to use smaller pages. So just
2396 	   merge them and check both at once. */
2397 	pfnmerge = iov_pfn | phy_pfn;
2398 
2399 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2400 		pages >>= VTD_STRIDE_SHIFT;
2401 		if (!pages)
2402 			break;
2403 		pfnmerge >>= VTD_STRIDE_SHIFT;
2404 		level++;
2405 		support--;
2406 	}
2407 	return level;
2408 }
2409 
2410 /*
2411  * Ensure that old small page tables are removed to make room for superpage(s).
2412  * We're going to add new large pages, so make sure we don't remove their parent
2413  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2414  */
2415 static void switch_to_super_page(struct dmar_domain *domain,
2416 				 unsigned long start_pfn,
2417 				 unsigned long end_pfn, int level)
2418 {
2419 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2420 	struct dma_pte *pte = NULL;
2421 	int i;
2422 
2423 	while (start_pfn <= end_pfn) {
2424 		if (!pte)
2425 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2426 
2427 		if (dma_pte_present(pte)) {
2428 			dma_pte_free_pagetable(domain, start_pfn,
2429 					       start_pfn + lvl_pages - 1,
2430 					       level + 1);
2431 
2432 			for_each_domain_iommu(i, domain)
2433 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2434 						      start_pfn, lvl_pages,
2435 						      0, 0);
2436 		}
2437 
2438 		pte++;
2439 		start_pfn += lvl_pages;
2440 		if (first_pte_in_page(pte))
2441 			pte = NULL;
2442 	}
2443 }
2444 
2445 static int
2446 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2447 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2448 {
2449 	struct dma_pte *first_pte = NULL, *pte = NULL;
2450 	unsigned int largepage_lvl = 0;
2451 	unsigned long lvl_pages = 0;
2452 	phys_addr_t pteval;
2453 	u64 attr;
2454 
2455 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2456 
2457 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2458 		return -EINVAL;
2459 
2460 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2461 	attr |= DMA_FL_PTE_PRESENT;
2462 	if (domain_use_first_level(domain)) {
2463 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2464 		if (prot & DMA_PTE_WRITE)
2465 			attr |= DMA_FL_PTE_DIRTY;
2466 	}
2467 
2468 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2469 
2470 	while (nr_pages > 0) {
2471 		uint64_t tmp;
2472 
2473 		if (!pte) {
2474 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2475 					phys_pfn, nr_pages);
2476 
2477 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2478 			if (!pte)
2479 				return -ENOMEM;
2480 			first_pte = pte;
2481 
2482 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2483 
2484 			/* It is large page*/
2485 			if (largepage_lvl > 1) {
2486 				unsigned long end_pfn;
2487 				unsigned long pages_to_remove;
2488 
2489 				pteval |= DMA_PTE_LARGE_PAGE;
2490 				pages_to_remove = min_t(unsigned long, nr_pages,
2491 							nr_pte_to_next_page(pte) * lvl_pages);
2492 				end_pfn = iov_pfn + pages_to_remove - 1;
2493 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2494 			} else {
2495 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2496 			}
2497 
2498 		}
2499 		/* We don't need lock here, nobody else
2500 		 * touches the iova range
2501 		 */
2502 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2503 		if (tmp) {
2504 			static int dumps = 5;
2505 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2506 				iov_pfn, tmp, (unsigned long long)pteval);
2507 			if (dumps) {
2508 				dumps--;
2509 				debug_dma_dump_mappings(NULL);
2510 			}
2511 			WARN_ON(1);
2512 		}
2513 
2514 		nr_pages -= lvl_pages;
2515 		iov_pfn += lvl_pages;
2516 		phys_pfn += lvl_pages;
2517 		pteval += lvl_pages * VTD_PAGE_SIZE;
2518 
2519 		/* If the next PTE would be the first in a new page, then we
2520 		 * need to flush the cache on the entries we've just written.
2521 		 * And then we'll need to recalculate 'pte', so clear it and
2522 		 * let it get set again in the if (!pte) block above.
2523 		 *
2524 		 * If we're done (!nr_pages) we need to flush the cache too.
2525 		 *
2526 		 * Also if we've been setting superpages, we may need to
2527 		 * recalculate 'pte' and switch back to smaller pages for the
2528 		 * end of the mapping, if the trailing size is not enough to
2529 		 * use another superpage (i.e. nr_pages < lvl_pages).
2530 		 */
2531 		pte++;
2532 		if (!nr_pages || first_pte_in_page(pte) ||
2533 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2534 			domain_flush_cache(domain, first_pte,
2535 					   (void *)pte - (void *)first_pte);
2536 			pte = NULL;
2537 		}
2538 	}
2539 
2540 	return 0;
2541 }
2542 
2543 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2544 {
2545 	struct intel_iommu *iommu = info->iommu;
2546 	struct context_entry *context;
2547 	unsigned long flags;
2548 	u16 did_old;
2549 
2550 	if (!iommu)
2551 		return;
2552 
2553 	spin_lock_irqsave(&iommu->lock, flags);
2554 	context = iommu_context_addr(iommu, bus, devfn, 0);
2555 	if (!context) {
2556 		spin_unlock_irqrestore(&iommu->lock, flags);
2557 		return;
2558 	}
2559 
2560 	if (sm_supported(iommu)) {
2561 		if (hw_pass_through && domain_type_is_si(info->domain))
2562 			did_old = FLPT_DEFAULT_DID;
2563 		else
2564 			did_old = info->domain->iommu_did[iommu->seq_id];
2565 	} else {
2566 		did_old = context_domain_id(context);
2567 	}
2568 
2569 	context_clear_entry(context);
2570 	__iommu_flush_cache(iommu, context, sizeof(*context));
2571 	spin_unlock_irqrestore(&iommu->lock, flags);
2572 	iommu->flush.flush_context(iommu,
2573 				   did_old,
2574 				   (((u16)bus) << 8) | devfn,
2575 				   DMA_CCMD_MASK_NOBIT,
2576 				   DMA_CCMD_DEVICE_INVL);
2577 
2578 	if (sm_supported(iommu))
2579 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2580 
2581 	iommu->flush.flush_iotlb(iommu,
2582 				 did_old,
2583 				 0,
2584 				 0,
2585 				 DMA_TLB_DSI_FLUSH);
2586 
2587 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2588 }
2589 
2590 static inline void unlink_domain_info(struct device_domain_info *info)
2591 {
2592 	assert_spin_locked(&device_domain_lock);
2593 	list_del(&info->link);
2594 	list_del(&info->global);
2595 	if (info->dev)
2596 		dev_iommu_priv_set(info->dev, NULL);
2597 }
2598 
2599 static void domain_remove_dev_info(struct dmar_domain *domain)
2600 {
2601 	struct device_domain_info *info, *tmp;
2602 	unsigned long flags;
2603 
2604 	spin_lock_irqsave(&device_domain_lock, flags);
2605 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2606 		__dmar_remove_one_dev_info(info);
2607 	spin_unlock_irqrestore(&device_domain_lock, flags);
2608 }
2609 
2610 struct dmar_domain *find_domain(struct device *dev)
2611 {
2612 	struct device_domain_info *info;
2613 
2614 	if (unlikely(!dev || !dev->iommu))
2615 		return NULL;
2616 
2617 	if (unlikely(attach_deferred(dev)))
2618 		return NULL;
2619 
2620 	/* No lock here, assumes no domain exit in normal case */
2621 	info = get_domain_info(dev);
2622 	if (likely(info))
2623 		return info->domain;
2624 
2625 	return NULL;
2626 }
2627 
2628 static inline struct device_domain_info *
2629 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2630 {
2631 	struct device_domain_info *info;
2632 
2633 	list_for_each_entry(info, &device_domain_list, global)
2634 		if (info->segment == segment && info->bus == bus &&
2635 		    info->devfn == devfn)
2636 			return info;
2637 
2638 	return NULL;
2639 }
2640 
2641 static int domain_setup_first_level(struct intel_iommu *iommu,
2642 				    struct dmar_domain *domain,
2643 				    struct device *dev,
2644 				    u32 pasid)
2645 {
2646 	struct dma_pte *pgd = domain->pgd;
2647 	int agaw, level;
2648 	int flags = 0;
2649 
2650 	/*
2651 	 * Skip top levels of page tables for iommu which has
2652 	 * less agaw than default. Unnecessary for PT mode.
2653 	 */
2654 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2655 		pgd = phys_to_virt(dma_pte_addr(pgd));
2656 		if (!dma_pte_present(pgd))
2657 			return -ENOMEM;
2658 	}
2659 
2660 	level = agaw_to_level(agaw);
2661 	if (level != 4 && level != 5)
2662 		return -EINVAL;
2663 
2664 	if (pasid != PASID_RID2PASID)
2665 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2666 	if (level == 5)
2667 		flags |= PASID_FLAG_FL5LP;
2668 
2669 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2670 		flags |= PASID_FLAG_PAGE_SNOOP;
2671 
2672 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2673 					     domain->iommu_did[iommu->seq_id],
2674 					     flags);
2675 }
2676 
2677 static bool dev_is_real_dma_subdevice(struct device *dev)
2678 {
2679 	return dev && dev_is_pci(dev) &&
2680 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2681 }
2682 
2683 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2684 						    int bus, int devfn,
2685 						    struct device *dev,
2686 						    struct dmar_domain *domain)
2687 {
2688 	struct dmar_domain *found = NULL;
2689 	struct device_domain_info *info;
2690 	unsigned long flags;
2691 	int ret;
2692 
2693 	info = alloc_devinfo_mem();
2694 	if (!info)
2695 		return NULL;
2696 
2697 	if (!dev_is_real_dma_subdevice(dev)) {
2698 		info->bus = bus;
2699 		info->devfn = devfn;
2700 		info->segment = iommu->segment;
2701 	} else {
2702 		struct pci_dev *pdev = to_pci_dev(dev);
2703 
2704 		info->bus = pdev->bus->number;
2705 		info->devfn = pdev->devfn;
2706 		info->segment = pci_domain_nr(pdev->bus);
2707 	}
2708 
2709 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2710 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2711 	info->ats_qdep = 0;
2712 	info->dev = dev;
2713 	info->domain = domain;
2714 	info->iommu = iommu;
2715 	info->pasid_table = NULL;
2716 	info->auxd_enabled = 0;
2717 	INIT_LIST_HEAD(&info->subdevices);
2718 
2719 	if (dev && dev_is_pci(dev)) {
2720 		struct pci_dev *pdev = to_pci_dev(info->dev);
2721 
2722 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2723 		    pci_ats_supported(pdev) &&
2724 		    dmar_find_matched_atsr_unit(pdev))
2725 			info->ats_supported = 1;
2726 
2727 		if (sm_supported(iommu)) {
2728 			if (pasid_supported(iommu)) {
2729 				int features = pci_pasid_features(pdev);
2730 				if (features >= 0)
2731 					info->pasid_supported = features | 1;
2732 			}
2733 
2734 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2735 			    pci_pri_supported(pdev))
2736 				info->pri_supported = 1;
2737 		}
2738 	}
2739 
2740 	spin_lock_irqsave(&device_domain_lock, flags);
2741 	if (dev)
2742 		found = find_domain(dev);
2743 
2744 	if (!found) {
2745 		struct device_domain_info *info2;
2746 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2747 						       info->devfn);
2748 		if (info2) {
2749 			found      = info2->domain;
2750 			info2->dev = dev;
2751 		}
2752 	}
2753 
2754 	if (found) {
2755 		spin_unlock_irqrestore(&device_domain_lock, flags);
2756 		free_devinfo_mem(info);
2757 		/* Caller must free the original domain */
2758 		return found;
2759 	}
2760 
2761 	spin_lock(&iommu->lock);
2762 	ret = domain_attach_iommu(domain, iommu);
2763 	spin_unlock(&iommu->lock);
2764 
2765 	if (ret) {
2766 		spin_unlock_irqrestore(&device_domain_lock, flags);
2767 		free_devinfo_mem(info);
2768 		return NULL;
2769 	}
2770 
2771 	list_add(&info->link, &domain->devices);
2772 	list_add(&info->global, &device_domain_list);
2773 	if (dev)
2774 		dev_iommu_priv_set(dev, info);
2775 	spin_unlock_irqrestore(&device_domain_lock, flags);
2776 
2777 	/* PASID table is mandatory for a PCI device in scalable mode. */
2778 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2779 		ret = intel_pasid_alloc_table(dev);
2780 		if (ret) {
2781 			dev_err(dev, "PASID table allocation failed\n");
2782 			dmar_remove_one_dev_info(dev);
2783 			return NULL;
2784 		}
2785 
2786 		/* Setup the PASID entry for requests without PASID: */
2787 		spin_lock_irqsave(&iommu->lock, flags);
2788 		if (hw_pass_through && domain_type_is_si(domain))
2789 			ret = intel_pasid_setup_pass_through(iommu, domain,
2790 					dev, PASID_RID2PASID);
2791 		else if (domain_use_first_level(domain))
2792 			ret = domain_setup_first_level(iommu, domain, dev,
2793 					PASID_RID2PASID);
2794 		else
2795 			ret = intel_pasid_setup_second_level(iommu, domain,
2796 					dev, PASID_RID2PASID);
2797 		spin_unlock_irqrestore(&iommu->lock, flags);
2798 		if (ret) {
2799 			dev_err(dev, "Setup RID2PASID failed\n");
2800 			dmar_remove_one_dev_info(dev);
2801 			return NULL;
2802 		}
2803 	}
2804 
2805 	if (dev && domain_context_mapping(domain, dev)) {
2806 		dev_err(dev, "Domain context map failed\n");
2807 		dmar_remove_one_dev_info(dev);
2808 		return NULL;
2809 	}
2810 
2811 	return domain;
2812 }
2813 
2814 static int iommu_domain_identity_map(struct dmar_domain *domain,
2815 				     unsigned long first_vpfn,
2816 				     unsigned long last_vpfn)
2817 {
2818 	/*
2819 	 * RMRR range might have overlap with physical memory range,
2820 	 * clear it first
2821 	 */
2822 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2823 
2824 	return __domain_mapping(domain, first_vpfn,
2825 				first_vpfn, last_vpfn - first_vpfn + 1,
2826 				DMA_PTE_READ|DMA_PTE_WRITE);
2827 }
2828 
2829 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2830 
2831 static int __init si_domain_init(int hw)
2832 {
2833 	struct dmar_rmrr_unit *rmrr;
2834 	struct device *dev;
2835 	int i, nid, ret;
2836 
2837 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2838 	if (!si_domain)
2839 		return -EFAULT;
2840 
2841 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2842 		domain_exit(si_domain);
2843 		return -EFAULT;
2844 	}
2845 
2846 	if (hw)
2847 		return 0;
2848 
2849 	for_each_online_node(nid) {
2850 		unsigned long start_pfn, end_pfn;
2851 		int i;
2852 
2853 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2854 			ret = iommu_domain_identity_map(si_domain,
2855 					mm_to_dma_pfn(start_pfn),
2856 					mm_to_dma_pfn(end_pfn));
2857 			if (ret)
2858 				return ret;
2859 		}
2860 	}
2861 
2862 	/*
2863 	 * Identity map the RMRRs so that devices with RMRRs could also use
2864 	 * the si_domain.
2865 	 */
2866 	for_each_rmrr_units(rmrr) {
2867 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2868 					  i, dev) {
2869 			unsigned long long start = rmrr->base_address;
2870 			unsigned long long end = rmrr->end_address;
2871 
2872 			if (WARN_ON(end < start ||
2873 				    end >> agaw_to_width(si_domain->agaw)))
2874 				continue;
2875 
2876 			ret = iommu_domain_identity_map(si_domain,
2877 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2878 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2879 			if (ret)
2880 				return ret;
2881 		}
2882 	}
2883 
2884 	return 0;
2885 }
2886 
2887 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2888 {
2889 	struct dmar_domain *ndomain;
2890 	struct intel_iommu *iommu;
2891 	u8 bus, devfn;
2892 
2893 	iommu = device_to_iommu(dev, &bus, &devfn);
2894 	if (!iommu)
2895 		return -ENODEV;
2896 
2897 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2898 	if (ndomain != domain)
2899 		return -EBUSY;
2900 
2901 	return 0;
2902 }
2903 
2904 static bool device_has_rmrr(struct device *dev)
2905 {
2906 	struct dmar_rmrr_unit *rmrr;
2907 	struct device *tmp;
2908 	int i;
2909 
2910 	rcu_read_lock();
2911 	for_each_rmrr_units(rmrr) {
2912 		/*
2913 		 * Return TRUE if this RMRR contains the device that
2914 		 * is passed in.
2915 		 */
2916 		for_each_active_dev_scope(rmrr->devices,
2917 					  rmrr->devices_cnt, i, tmp)
2918 			if (tmp == dev ||
2919 			    is_downstream_to_pci_bridge(dev, tmp)) {
2920 				rcu_read_unlock();
2921 				return true;
2922 			}
2923 	}
2924 	rcu_read_unlock();
2925 	return false;
2926 }
2927 
2928 /**
2929  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2930  * is relaxable (ie. is allowed to be not enforced under some conditions)
2931  * @dev: device handle
2932  *
2933  * We assume that PCI USB devices with RMRRs have them largely
2934  * for historical reasons and that the RMRR space is not actively used post
2935  * boot.  This exclusion may change if vendors begin to abuse it.
2936  *
2937  * The same exception is made for graphics devices, with the requirement that
2938  * any use of the RMRR regions will be torn down before assigning the device
2939  * to a guest.
2940  *
2941  * Return: true if the RMRR is relaxable, false otherwise
2942  */
2943 static bool device_rmrr_is_relaxable(struct device *dev)
2944 {
2945 	struct pci_dev *pdev;
2946 
2947 	if (!dev_is_pci(dev))
2948 		return false;
2949 
2950 	pdev = to_pci_dev(dev);
2951 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2952 		return true;
2953 	else
2954 		return false;
2955 }
2956 
2957 /*
2958  * There are a couple cases where we need to restrict the functionality of
2959  * devices associated with RMRRs.  The first is when evaluating a device for
2960  * identity mapping because problems exist when devices are moved in and out
2961  * of domains and their respective RMRR information is lost.  This means that
2962  * a device with associated RMRRs will never be in a "passthrough" domain.
2963  * The second is use of the device through the IOMMU API.  This interface
2964  * expects to have full control of the IOVA space for the device.  We cannot
2965  * satisfy both the requirement that RMRR access is maintained and have an
2966  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2967  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2968  * We therefore prevent devices associated with an RMRR from participating in
2969  * the IOMMU API, which eliminates them from device assignment.
2970  *
2971  * In both cases, devices which have relaxable RMRRs are not concerned by this
2972  * restriction. See device_rmrr_is_relaxable comment.
2973  */
2974 static bool device_is_rmrr_locked(struct device *dev)
2975 {
2976 	if (!device_has_rmrr(dev))
2977 		return false;
2978 
2979 	if (device_rmrr_is_relaxable(dev))
2980 		return false;
2981 
2982 	return true;
2983 }
2984 
2985 /*
2986  * Return the required default domain type for a specific device.
2987  *
2988  * @dev: the device in query
2989  * @startup: true if this is during early boot
2990  *
2991  * Returns:
2992  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2993  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2994  *  - 0: both identity and dynamic domains work for this device
2995  */
2996 static int device_def_domain_type(struct device *dev)
2997 {
2998 	if (dev_is_pci(dev)) {
2999 		struct pci_dev *pdev = to_pci_dev(dev);
3000 
3001 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3002 			return IOMMU_DOMAIN_IDENTITY;
3003 
3004 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3005 			return IOMMU_DOMAIN_IDENTITY;
3006 	}
3007 
3008 	return 0;
3009 }
3010 
3011 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3012 {
3013 	/*
3014 	 * Start from the sane iommu hardware state.
3015 	 * If the queued invalidation is already initialized by us
3016 	 * (for example, while enabling interrupt-remapping) then
3017 	 * we got the things already rolling from a sane state.
3018 	 */
3019 	if (!iommu->qi) {
3020 		/*
3021 		 * Clear any previous faults.
3022 		 */
3023 		dmar_fault(-1, iommu);
3024 		/*
3025 		 * Disable queued invalidation if supported and already enabled
3026 		 * before OS handover.
3027 		 */
3028 		dmar_disable_qi(iommu);
3029 	}
3030 
3031 	if (dmar_enable_qi(iommu)) {
3032 		/*
3033 		 * Queued Invalidate not enabled, use Register Based Invalidate
3034 		 */
3035 		iommu->flush.flush_context = __iommu_flush_context;
3036 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3037 		pr_info("%s: Using Register based invalidation\n",
3038 			iommu->name);
3039 	} else {
3040 		iommu->flush.flush_context = qi_flush_context;
3041 		iommu->flush.flush_iotlb = qi_flush_iotlb;
3042 		pr_info("%s: Using Queued invalidation\n", iommu->name);
3043 	}
3044 }
3045 
3046 static int copy_context_table(struct intel_iommu *iommu,
3047 			      struct root_entry *old_re,
3048 			      struct context_entry **tbl,
3049 			      int bus, bool ext)
3050 {
3051 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3052 	struct context_entry *new_ce = NULL, ce;
3053 	struct context_entry *old_ce = NULL;
3054 	struct root_entry re;
3055 	phys_addr_t old_ce_phys;
3056 
3057 	tbl_idx = ext ? bus * 2 : bus;
3058 	memcpy(&re, old_re, sizeof(re));
3059 
3060 	for (devfn = 0; devfn < 256; devfn++) {
3061 		/* First calculate the correct index */
3062 		idx = (ext ? devfn * 2 : devfn) % 256;
3063 
3064 		if (idx == 0) {
3065 			/* First save what we may have and clean up */
3066 			if (new_ce) {
3067 				tbl[tbl_idx] = new_ce;
3068 				__iommu_flush_cache(iommu, new_ce,
3069 						    VTD_PAGE_SIZE);
3070 				pos = 1;
3071 			}
3072 
3073 			if (old_ce)
3074 				memunmap(old_ce);
3075 
3076 			ret = 0;
3077 			if (devfn < 0x80)
3078 				old_ce_phys = root_entry_lctp(&re);
3079 			else
3080 				old_ce_phys = root_entry_uctp(&re);
3081 
3082 			if (!old_ce_phys) {
3083 				if (ext && devfn == 0) {
3084 					/* No LCTP, try UCTP */
3085 					devfn = 0x7f;
3086 					continue;
3087 				} else {
3088 					goto out;
3089 				}
3090 			}
3091 
3092 			ret = -ENOMEM;
3093 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3094 					MEMREMAP_WB);
3095 			if (!old_ce)
3096 				goto out;
3097 
3098 			new_ce = alloc_pgtable_page(iommu->node);
3099 			if (!new_ce)
3100 				goto out_unmap;
3101 
3102 			ret = 0;
3103 		}
3104 
3105 		/* Now copy the context entry */
3106 		memcpy(&ce, old_ce + idx, sizeof(ce));
3107 
3108 		if (!__context_present(&ce))
3109 			continue;
3110 
3111 		did = context_domain_id(&ce);
3112 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3113 			set_bit(did, iommu->domain_ids);
3114 
3115 		/*
3116 		 * We need a marker for copied context entries. This
3117 		 * marker needs to work for the old format as well as
3118 		 * for extended context entries.
3119 		 *
3120 		 * Bit 67 of the context entry is used. In the old
3121 		 * format this bit is available to software, in the
3122 		 * extended format it is the PGE bit, but PGE is ignored
3123 		 * by HW if PASIDs are disabled (and thus still
3124 		 * available).
3125 		 *
3126 		 * So disable PASIDs first and then mark the entry
3127 		 * copied. This means that we don't copy PASID
3128 		 * translations from the old kernel, but this is fine as
3129 		 * faults there are not fatal.
3130 		 */
3131 		context_clear_pasid_enable(&ce);
3132 		context_set_copied(&ce);
3133 
3134 		new_ce[idx] = ce;
3135 	}
3136 
3137 	tbl[tbl_idx + pos] = new_ce;
3138 
3139 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3140 
3141 out_unmap:
3142 	memunmap(old_ce);
3143 
3144 out:
3145 	return ret;
3146 }
3147 
3148 static int copy_translation_tables(struct intel_iommu *iommu)
3149 {
3150 	struct context_entry **ctxt_tbls;
3151 	struct root_entry *old_rt;
3152 	phys_addr_t old_rt_phys;
3153 	int ctxt_table_entries;
3154 	unsigned long flags;
3155 	u64 rtaddr_reg;
3156 	int bus, ret;
3157 	bool new_ext, ext;
3158 
3159 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3160 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3161 	new_ext    = !!ecap_ecs(iommu->ecap);
3162 
3163 	/*
3164 	 * The RTT bit can only be changed when translation is disabled,
3165 	 * but disabling translation means to open a window for data
3166 	 * corruption. So bail out and don't copy anything if we would
3167 	 * have to change the bit.
3168 	 */
3169 	if (new_ext != ext)
3170 		return -EINVAL;
3171 
3172 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3173 	if (!old_rt_phys)
3174 		return -EINVAL;
3175 
3176 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3177 	if (!old_rt)
3178 		return -ENOMEM;
3179 
3180 	/* This is too big for the stack - allocate it from slab */
3181 	ctxt_table_entries = ext ? 512 : 256;
3182 	ret = -ENOMEM;
3183 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3184 	if (!ctxt_tbls)
3185 		goto out_unmap;
3186 
3187 	for (bus = 0; bus < 256; bus++) {
3188 		ret = copy_context_table(iommu, &old_rt[bus],
3189 					 ctxt_tbls, bus, ext);
3190 		if (ret) {
3191 			pr_err("%s: Failed to copy context table for bus %d\n",
3192 				iommu->name, bus);
3193 			continue;
3194 		}
3195 	}
3196 
3197 	spin_lock_irqsave(&iommu->lock, flags);
3198 
3199 	/* Context tables are copied, now write them to the root_entry table */
3200 	for (bus = 0; bus < 256; bus++) {
3201 		int idx = ext ? bus * 2 : bus;
3202 		u64 val;
3203 
3204 		if (ctxt_tbls[idx]) {
3205 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3206 			iommu->root_entry[bus].lo = val;
3207 		}
3208 
3209 		if (!ext || !ctxt_tbls[idx + 1])
3210 			continue;
3211 
3212 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3213 		iommu->root_entry[bus].hi = val;
3214 	}
3215 
3216 	spin_unlock_irqrestore(&iommu->lock, flags);
3217 
3218 	kfree(ctxt_tbls);
3219 
3220 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3221 
3222 	ret = 0;
3223 
3224 out_unmap:
3225 	memunmap(old_rt);
3226 
3227 	return ret;
3228 }
3229 
3230 #ifdef CONFIG_INTEL_IOMMU_SVM
3231 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3232 {
3233 	struct intel_iommu *iommu = data;
3234 	ioasid_t ioasid;
3235 
3236 	if (!iommu)
3237 		return INVALID_IOASID;
3238 	/*
3239 	 * VT-d virtual command interface always uses the full 20 bit
3240 	 * PASID range. Host can partition guest PASID range based on
3241 	 * policies but it is out of guest's control.
3242 	 */
3243 	if (min < PASID_MIN || max > intel_pasid_max_id)
3244 		return INVALID_IOASID;
3245 
3246 	if (vcmd_alloc_pasid(iommu, &ioasid))
3247 		return INVALID_IOASID;
3248 
3249 	return ioasid;
3250 }
3251 
3252 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3253 {
3254 	struct intel_iommu *iommu = data;
3255 
3256 	if (!iommu)
3257 		return;
3258 	/*
3259 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3260 	 * We can only free the PASID when all the devices are unbound.
3261 	 */
3262 	if (ioasid_find(NULL, ioasid, NULL)) {
3263 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3264 		return;
3265 	}
3266 	vcmd_free_pasid(iommu, ioasid);
3267 }
3268 
3269 static void register_pasid_allocator(struct intel_iommu *iommu)
3270 {
3271 	/*
3272 	 * If we are running in the host, no need for custom allocator
3273 	 * in that PASIDs are allocated from the host system-wide.
3274 	 */
3275 	if (!cap_caching_mode(iommu->cap))
3276 		return;
3277 
3278 	if (!sm_supported(iommu)) {
3279 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3280 		return;
3281 	}
3282 
3283 	/*
3284 	 * Register a custom PASID allocator if we are running in a guest,
3285 	 * guest PASID must be obtained via virtual command interface.
3286 	 * There can be multiple vIOMMUs in each guest but only one allocator
3287 	 * is active. All vIOMMU allocators will eventually be calling the same
3288 	 * host allocator.
3289 	 */
3290 	if (!vccap_pasid(iommu->vccap))
3291 		return;
3292 
3293 	pr_info("Register custom PASID allocator\n");
3294 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3295 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3296 	iommu->pasid_allocator.pdata = (void *)iommu;
3297 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3298 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3299 		/*
3300 		 * Disable scalable mode on this IOMMU if there
3301 		 * is no custom allocator. Mixing SM capable vIOMMU
3302 		 * and non-SM vIOMMU are not supported.
3303 		 */
3304 		intel_iommu_sm = 0;
3305 	}
3306 }
3307 #endif
3308 
3309 static int __init init_dmars(void)
3310 {
3311 	struct dmar_drhd_unit *drhd;
3312 	struct intel_iommu *iommu;
3313 	int ret;
3314 
3315 	/*
3316 	 * for each drhd
3317 	 *    allocate root
3318 	 *    initialize and program root entry to not present
3319 	 * endfor
3320 	 */
3321 	for_each_drhd_unit(drhd) {
3322 		/*
3323 		 * lock not needed as this is only incremented in the single
3324 		 * threaded kernel __init code path all other access are read
3325 		 * only
3326 		 */
3327 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3328 			g_num_of_iommus++;
3329 			continue;
3330 		}
3331 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3332 	}
3333 
3334 	/* Preallocate enough resources for IOMMU hot-addition */
3335 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3336 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3337 
3338 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3339 			GFP_KERNEL);
3340 	if (!g_iommus) {
3341 		ret = -ENOMEM;
3342 		goto error;
3343 	}
3344 
3345 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3346 	if (ret)
3347 		goto free_iommu;
3348 
3349 	for_each_iommu(iommu, drhd) {
3350 		if (drhd->ignored) {
3351 			iommu_disable_translation(iommu);
3352 			continue;
3353 		}
3354 
3355 		/*
3356 		 * Find the max pasid size of all IOMMU's in the system.
3357 		 * We need to ensure the system pasid table is no bigger
3358 		 * than the smallest supported.
3359 		 */
3360 		if (pasid_supported(iommu)) {
3361 			u32 temp = 2 << ecap_pss(iommu->ecap);
3362 
3363 			intel_pasid_max_id = min_t(u32, temp,
3364 						   intel_pasid_max_id);
3365 		}
3366 
3367 		g_iommus[iommu->seq_id] = iommu;
3368 
3369 		intel_iommu_init_qi(iommu);
3370 
3371 		ret = iommu_init_domains(iommu);
3372 		if (ret)
3373 			goto free_iommu;
3374 
3375 		init_translation_status(iommu);
3376 
3377 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3378 			iommu_disable_translation(iommu);
3379 			clear_translation_pre_enabled(iommu);
3380 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3381 				iommu->name);
3382 		}
3383 
3384 		/*
3385 		 * TBD:
3386 		 * we could share the same root & context tables
3387 		 * among all IOMMU's. Need to Split it later.
3388 		 */
3389 		ret = iommu_alloc_root_entry(iommu);
3390 		if (ret)
3391 			goto free_iommu;
3392 
3393 		if (translation_pre_enabled(iommu)) {
3394 			pr_info("Translation already enabled - trying to copy translation structures\n");
3395 
3396 			ret = copy_translation_tables(iommu);
3397 			if (ret) {
3398 				/*
3399 				 * We found the IOMMU with translation
3400 				 * enabled - but failed to copy over the
3401 				 * old root-entry table. Try to proceed
3402 				 * by disabling translation now and
3403 				 * allocating a clean root-entry table.
3404 				 * This might cause DMAR faults, but
3405 				 * probably the dump will still succeed.
3406 				 */
3407 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3408 				       iommu->name);
3409 				iommu_disable_translation(iommu);
3410 				clear_translation_pre_enabled(iommu);
3411 			} else {
3412 				pr_info("Copied translation tables from previous kernel for %s\n",
3413 					iommu->name);
3414 			}
3415 		}
3416 
3417 		if (!ecap_pass_through(iommu->ecap))
3418 			hw_pass_through = 0;
3419 		intel_svm_check(iommu);
3420 	}
3421 
3422 	/*
3423 	 * Now that qi is enabled on all iommus, set the root entry and flush
3424 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3425 	 * flush_context function will loop forever and the boot hangs.
3426 	 */
3427 	for_each_active_iommu(iommu, drhd) {
3428 		iommu_flush_write_buffer(iommu);
3429 #ifdef CONFIG_INTEL_IOMMU_SVM
3430 		register_pasid_allocator(iommu);
3431 #endif
3432 		iommu_set_root_entry(iommu);
3433 	}
3434 
3435 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3436 	dmar_map_gfx = 0;
3437 #endif
3438 
3439 	if (!dmar_map_gfx)
3440 		iommu_identity_mapping |= IDENTMAP_GFX;
3441 
3442 	check_tylersburg_isoch();
3443 
3444 	ret = si_domain_init(hw_pass_through);
3445 	if (ret)
3446 		goto free_iommu;
3447 
3448 	/*
3449 	 * for each drhd
3450 	 *   enable fault log
3451 	 *   global invalidate context cache
3452 	 *   global invalidate iotlb
3453 	 *   enable translation
3454 	 */
3455 	for_each_iommu(iommu, drhd) {
3456 		if (drhd->ignored) {
3457 			/*
3458 			 * we always have to disable PMRs or DMA may fail on
3459 			 * this device
3460 			 */
3461 			if (force_on)
3462 				iommu_disable_protect_mem_regions(iommu);
3463 			continue;
3464 		}
3465 
3466 		iommu_flush_write_buffer(iommu);
3467 
3468 #ifdef CONFIG_INTEL_IOMMU_SVM
3469 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3470 			/*
3471 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3472 			 * could cause possible lock race condition.
3473 			 */
3474 			up_write(&dmar_global_lock);
3475 			ret = intel_svm_enable_prq(iommu);
3476 			down_write(&dmar_global_lock);
3477 			if (ret)
3478 				goto free_iommu;
3479 		}
3480 #endif
3481 		ret = dmar_set_interrupt(iommu);
3482 		if (ret)
3483 			goto free_iommu;
3484 	}
3485 
3486 	return 0;
3487 
3488 free_iommu:
3489 	for_each_active_iommu(iommu, drhd) {
3490 		disable_dmar_iommu(iommu);
3491 		free_dmar_iommu(iommu);
3492 	}
3493 
3494 	kfree(g_iommus);
3495 
3496 error:
3497 	return ret;
3498 }
3499 
3500 static inline int iommu_domain_cache_init(void)
3501 {
3502 	int ret = 0;
3503 
3504 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3505 					 sizeof(struct dmar_domain),
3506 					 0,
3507 					 SLAB_HWCACHE_ALIGN,
3508 
3509 					 NULL);
3510 	if (!iommu_domain_cache) {
3511 		pr_err("Couldn't create iommu_domain cache\n");
3512 		ret = -ENOMEM;
3513 	}
3514 
3515 	return ret;
3516 }
3517 
3518 static inline int iommu_devinfo_cache_init(void)
3519 {
3520 	int ret = 0;
3521 
3522 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3523 					 sizeof(struct device_domain_info),
3524 					 0,
3525 					 SLAB_HWCACHE_ALIGN,
3526 					 NULL);
3527 	if (!iommu_devinfo_cache) {
3528 		pr_err("Couldn't create devinfo cache\n");
3529 		ret = -ENOMEM;
3530 	}
3531 
3532 	return ret;
3533 }
3534 
3535 static int __init iommu_init_mempool(void)
3536 {
3537 	int ret;
3538 	ret = iova_cache_get();
3539 	if (ret)
3540 		return ret;
3541 
3542 	ret = iommu_domain_cache_init();
3543 	if (ret)
3544 		goto domain_error;
3545 
3546 	ret = iommu_devinfo_cache_init();
3547 	if (!ret)
3548 		return ret;
3549 
3550 	kmem_cache_destroy(iommu_domain_cache);
3551 domain_error:
3552 	iova_cache_put();
3553 
3554 	return -ENOMEM;
3555 }
3556 
3557 static void __init iommu_exit_mempool(void)
3558 {
3559 	kmem_cache_destroy(iommu_devinfo_cache);
3560 	kmem_cache_destroy(iommu_domain_cache);
3561 	iova_cache_put();
3562 }
3563 
3564 static void __init init_no_remapping_devices(void)
3565 {
3566 	struct dmar_drhd_unit *drhd;
3567 	struct device *dev;
3568 	int i;
3569 
3570 	for_each_drhd_unit(drhd) {
3571 		if (!drhd->include_all) {
3572 			for_each_active_dev_scope(drhd->devices,
3573 						  drhd->devices_cnt, i, dev)
3574 				break;
3575 			/* ignore DMAR unit if no devices exist */
3576 			if (i == drhd->devices_cnt)
3577 				drhd->ignored = 1;
3578 		}
3579 	}
3580 
3581 	for_each_active_drhd_unit(drhd) {
3582 		if (drhd->include_all)
3583 			continue;
3584 
3585 		for_each_active_dev_scope(drhd->devices,
3586 					  drhd->devices_cnt, i, dev)
3587 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3588 				break;
3589 		if (i < drhd->devices_cnt)
3590 			continue;
3591 
3592 		/* This IOMMU has *only* gfx devices. Either bypass it or
3593 		   set the gfx_mapped flag, as appropriate */
3594 		drhd->gfx_dedicated = 1;
3595 		if (!dmar_map_gfx)
3596 			drhd->ignored = 1;
3597 	}
3598 }
3599 
3600 #ifdef CONFIG_SUSPEND
3601 static int init_iommu_hw(void)
3602 {
3603 	struct dmar_drhd_unit *drhd;
3604 	struct intel_iommu *iommu = NULL;
3605 
3606 	for_each_active_iommu(iommu, drhd)
3607 		if (iommu->qi)
3608 			dmar_reenable_qi(iommu);
3609 
3610 	for_each_iommu(iommu, drhd) {
3611 		if (drhd->ignored) {
3612 			/*
3613 			 * we always have to disable PMRs or DMA may fail on
3614 			 * this device
3615 			 */
3616 			if (force_on)
3617 				iommu_disable_protect_mem_regions(iommu);
3618 			continue;
3619 		}
3620 
3621 		iommu_flush_write_buffer(iommu);
3622 		iommu_set_root_entry(iommu);
3623 		iommu_enable_translation(iommu);
3624 		iommu_disable_protect_mem_regions(iommu);
3625 	}
3626 
3627 	return 0;
3628 }
3629 
3630 static void iommu_flush_all(void)
3631 {
3632 	struct dmar_drhd_unit *drhd;
3633 	struct intel_iommu *iommu;
3634 
3635 	for_each_active_iommu(iommu, drhd) {
3636 		iommu->flush.flush_context(iommu, 0, 0, 0,
3637 					   DMA_CCMD_GLOBAL_INVL);
3638 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3639 					 DMA_TLB_GLOBAL_FLUSH);
3640 	}
3641 }
3642 
3643 static int iommu_suspend(void)
3644 {
3645 	struct dmar_drhd_unit *drhd;
3646 	struct intel_iommu *iommu = NULL;
3647 	unsigned long flag;
3648 
3649 	for_each_active_iommu(iommu, drhd) {
3650 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3651 					     GFP_KERNEL);
3652 		if (!iommu->iommu_state)
3653 			goto nomem;
3654 	}
3655 
3656 	iommu_flush_all();
3657 
3658 	for_each_active_iommu(iommu, drhd) {
3659 		iommu_disable_translation(iommu);
3660 
3661 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3662 
3663 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3664 			readl(iommu->reg + DMAR_FECTL_REG);
3665 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3666 			readl(iommu->reg + DMAR_FEDATA_REG);
3667 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3668 			readl(iommu->reg + DMAR_FEADDR_REG);
3669 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3670 			readl(iommu->reg + DMAR_FEUADDR_REG);
3671 
3672 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3673 	}
3674 	return 0;
3675 
3676 nomem:
3677 	for_each_active_iommu(iommu, drhd)
3678 		kfree(iommu->iommu_state);
3679 
3680 	return -ENOMEM;
3681 }
3682 
3683 static void iommu_resume(void)
3684 {
3685 	struct dmar_drhd_unit *drhd;
3686 	struct intel_iommu *iommu = NULL;
3687 	unsigned long flag;
3688 
3689 	if (init_iommu_hw()) {
3690 		if (force_on)
3691 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3692 		else
3693 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3694 		return;
3695 	}
3696 
3697 	for_each_active_iommu(iommu, drhd) {
3698 
3699 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3700 
3701 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3702 			iommu->reg + DMAR_FECTL_REG);
3703 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3704 			iommu->reg + DMAR_FEDATA_REG);
3705 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3706 			iommu->reg + DMAR_FEADDR_REG);
3707 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3708 			iommu->reg + DMAR_FEUADDR_REG);
3709 
3710 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3711 	}
3712 
3713 	for_each_active_iommu(iommu, drhd)
3714 		kfree(iommu->iommu_state);
3715 }
3716 
3717 static struct syscore_ops iommu_syscore_ops = {
3718 	.resume		= iommu_resume,
3719 	.suspend	= iommu_suspend,
3720 };
3721 
3722 static void __init init_iommu_pm_ops(void)
3723 {
3724 	register_syscore_ops(&iommu_syscore_ops);
3725 }
3726 
3727 #else
3728 static inline void init_iommu_pm_ops(void) {}
3729 #endif	/* CONFIG_PM */
3730 
3731 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3732 {
3733 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3734 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3735 	    rmrr->end_address <= rmrr->base_address ||
3736 	    arch_rmrr_sanity_check(rmrr))
3737 		return -EINVAL;
3738 
3739 	return 0;
3740 }
3741 
3742 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3743 {
3744 	struct acpi_dmar_reserved_memory *rmrr;
3745 	struct dmar_rmrr_unit *rmrru;
3746 
3747 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3748 	if (rmrr_sanity_check(rmrr)) {
3749 		pr_warn(FW_BUG
3750 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3751 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3752 			   rmrr->base_address, rmrr->end_address,
3753 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3754 			   dmi_get_system_info(DMI_BIOS_VERSION),
3755 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3756 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3757 	}
3758 
3759 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3760 	if (!rmrru)
3761 		goto out;
3762 
3763 	rmrru->hdr = header;
3764 
3765 	rmrru->base_address = rmrr->base_address;
3766 	rmrru->end_address = rmrr->end_address;
3767 
3768 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3769 				((void *)rmrr) + rmrr->header.length,
3770 				&rmrru->devices_cnt);
3771 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3772 		goto free_rmrru;
3773 
3774 	list_add(&rmrru->list, &dmar_rmrr_units);
3775 
3776 	return 0;
3777 free_rmrru:
3778 	kfree(rmrru);
3779 out:
3780 	return -ENOMEM;
3781 }
3782 
3783 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3784 {
3785 	struct dmar_atsr_unit *atsru;
3786 	struct acpi_dmar_atsr *tmp;
3787 
3788 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3789 				dmar_rcu_check()) {
3790 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3791 		if (atsr->segment != tmp->segment)
3792 			continue;
3793 		if (atsr->header.length != tmp->header.length)
3794 			continue;
3795 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3796 			return atsru;
3797 	}
3798 
3799 	return NULL;
3800 }
3801 
3802 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3803 {
3804 	struct acpi_dmar_atsr *atsr;
3805 	struct dmar_atsr_unit *atsru;
3806 
3807 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3808 		return 0;
3809 
3810 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3811 	atsru = dmar_find_atsr(atsr);
3812 	if (atsru)
3813 		return 0;
3814 
3815 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3816 	if (!atsru)
3817 		return -ENOMEM;
3818 
3819 	/*
3820 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3821 	 * copy the memory content because the memory buffer will be freed
3822 	 * on return.
3823 	 */
3824 	atsru->hdr = (void *)(atsru + 1);
3825 	memcpy(atsru->hdr, hdr, hdr->length);
3826 	atsru->include_all = atsr->flags & 0x1;
3827 	if (!atsru->include_all) {
3828 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3829 				(void *)atsr + atsr->header.length,
3830 				&atsru->devices_cnt);
3831 		if (atsru->devices_cnt && atsru->devices == NULL) {
3832 			kfree(atsru);
3833 			return -ENOMEM;
3834 		}
3835 	}
3836 
3837 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3838 
3839 	return 0;
3840 }
3841 
3842 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3843 {
3844 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3845 	kfree(atsru);
3846 }
3847 
3848 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3849 {
3850 	struct acpi_dmar_atsr *atsr;
3851 	struct dmar_atsr_unit *atsru;
3852 
3853 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3854 	atsru = dmar_find_atsr(atsr);
3855 	if (atsru) {
3856 		list_del_rcu(&atsru->list);
3857 		synchronize_rcu();
3858 		intel_iommu_free_atsr(atsru);
3859 	}
3860 
3861 	return 0;
3862 }
3863 
3864 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3865 {
3866 	int i;
3867 	struct device *dev;
3868 	struct acpi_dmar_atsr *atsr;
3869 	struct dmar_atsr_unit *atsru;
3870 
3871 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3872 	atsru = dmar_find_atsr(atsr);
3873 	if (!atsru)
3874 		return 0;
3875 
3876 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3877 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3878 					  i, dev)
3879 			return -EBUSY;
3880 	}
3881 
3882 	return 0;
3883 }
3884 
3885 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3886 {
3887 	struct dmar_satc_unit *satcu;
3888 	struct acpi_dmar_satc *tmp;
3889 
3890 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3891 				dmar_rcu_check()) {
3892 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3893 		if (satc->segment != tmp->segment)
3894 			continue;
3895 		if (satc->header.length != tmp->header.length)
3896 			continue;
3897 		if (memcmp(satc, tmp, satc->header.length) == 0)
3898 			return satcu;
3899 	}
3900 
3901 	return NULL;
3902 }
3903 
3904 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3905 {
3906 	struct acpi_dmar_satc *satc;
3907 	struct dmar_satc_unit *satcu;
3908 
3909 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3910 		return 0;
3911 
3912 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3913 	satcu = dmar_find_satc(satc);
3914 	if (satcu)
3915 		return 0;
3916 
3917 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3918 	if (!satcu)
3919 		return -ENOMEM;
3920 
3921 	satcu->hdr = (void *)(satcu + 1);
3922 	memcpy(satcu->hdr, hdr, hdr->length);
3923 	satcu->atc_required = satc->flags & 0x1;
3924 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3925 					      (void *)satc + satc->header.length,
3926 					      &satcu->devices_cnt);
3927 	if (satcu->devices_cnt && !satcu->devices) {
3928 		kfree(satcu);
3929 		return -ENOMEM;
3930 	}
3931 	list_add_rcu(&satcu->list, &dmar_satc_units);
3932 
3933 	return 0;
3934 }
3935 
3936 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3937 {
3938 	int sp, ret;
3939 	struct intel_iommu *iommu = dmaru->iommu;
3940 
3941 	if (g_iommus[iommu->seq_id])
3942 		return 0;
3943 
3944 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3945 	if (ret)
3946 		goto out;
3947 
3948 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3949 		pr_warn("%s: Doesn't support hardware pass through.\n",
3950 			iommu->name);
3951 		return -ENXIO;
3952 	}
3953 	if (!ecap_sc_support(iommu->ecap) &&
3954 	    domain_update_iommu_snooping(iommu)) {
3955 		pr_warn("%s: Doesn't support snooping.\n",
3956 			iommu->name);
3957 		return -ENXIO;
3958 	}
3959 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3960 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3961 		pr_warn("%s: Doesn't support large page.\n",
3962 			iommu->name);
3963 		return -ENXIO;
3964 	}
3965 
3966 	/*
3967 	 * Disable translation if already enabled prior to OS handover.
3968 	 */
3969 	if (iommu->gcmd & DMA_GCMD_TE)
3970 		iommu_disable_translation(iommu);
3971 
3972 	g_iommus[iommu->seq_id] = iommu;
3973 	ret = iommu_init_domains(iommu);
3974 	if (ret == 0)
3975 		ret = iommu_alloc_root_entry(iommu);
3976 	if (ret)
3977 		goto out;
3978 
3979 	intel_svm_check(iommu);
3980 
3981 	if (dmaru->ignored) {
3982 		/*
3983 		 * we always have to disable PMRs or DMA may fail on this device
3984 		 */
3985 		if (force_on)
3986 			iommu_disable_protect_mem_regions(iommu);
3987 		return 0;
3988 	}
3989 
3990 	intel_iommu_init_qi(iommu);
3991 	iommu_flush_write_buffer(iommu);
3992 
3993 #ifdef CONFIG_INTEL_IOMMU_SVM
3994 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3995 		ret = intel_svm_enable_prq(iommu);
3996 		if (ret)
3997 			goto disable_iommu;
3998 	}
3999 #endif
4000 	ret = dmar_set_interrupt(iommu);
4001 	if (ret)
4002 		goto disable_iommu;
4003 
4004 	iommu_set_root_entry(iommu);
4005 	iommu_enable_translation(iommu);
4006 
4007 	iommu_disable_protect_mem_regions(iommu);
4008 	return 0;
4009 
4010 disable_iommu:
4011 	disable_dmar_iommu(iommu);
4012 out:
4013 	free_dmar_iommu(iommu);
4014 	return ret;
4015 }
4016 
4017 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4018 {
4019 	int ret = 0;
4020 	struct intel_iommu *iommu = dmaru->iommu;
4021 
4022 	if (!intel_iommu_enabled)
4023 		return 0;
4024 	if (iommu == NULL)
4025 		return -EINVAL;
4026 
4027 	if (insert) {
4028 		ret = intel_iommu_add(dmaru);
4029 	} else {
4030 		disable_dmar_iommu(iommu);
4031 		free_dmar_iommu(iommu);
4032 	}
4033 
4034 	return ret;
4035 }
4036 
4037 static void intel_iommu_free_dmars(void)
4038 {
4039 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4040 	struct dmar_atsr_unit *atsru, *atsr_n;
4041 	struct dmar_satc_unit *satcu, *satc_n;
4042 
4043 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4044 		list_del(&rmrru->list);
4045 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4046 		kfree(rmrru);
4047 	}
4048 
4049 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4050 		list_del(&atsru->list);
4051 		intel_iommu_free_atsr(atsru);
4052 	}
4053 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
4054 		list_del(&satcu->list);
4055 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
4056 		kfree(satcu);
4057 	}
4058 }
4059 
4060 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4061 {
4062 	int i, ret = 1;
4063 	struct pci_bus *bus;
4064 	struct pci_dev *bridge = NULL;
4065 	struct device *tmp;
4066 	struct acpi_dmar_atsr *atsr;
4067 	struct dmar_atsr_unit *atsru;
4068 
4069 	dev = pci_physfn(dev);
4070 	for (bus = dev->bus; bus; bus = bus->parent) {
4071 		bridge = bus->self;
4072 		/* If it's an integrated device, allow ATS */
4073 		if (!bridge)
4074 			return 1;
4075 		/* Connected via non-PCIe: no ATS */
4076 		if (!pci_is_pcie(bridge) ||
4077 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4078 			return 0;
4079 		/* If we found the root port, look it up in the ATSR */
4080 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4081 			break;
4082 	}
4083 
4084 	rcu_read_lock();
4085 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4086 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4087 		if (atsr->segment != pci_domain_nr(dev->bus))
4088 			continue;
4089 
4090 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4091 			if (tmp == &bridge->dev)
4092 				goto out;
4093 
4094 		if (atsru->include_all)
4095 			goto out;
4096 	}
4097 	ret = 0;
4098 out:
4099 	rcu_read_unlock();
4100 
4101 	return ret;
4102 }
4103 
4104 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4105 {
4106 	int ret;
4107 	struct dmar_rmrr_unit *rmrru;
4108 	struct dmar_atsr_unit *atsru;
4109 	struct dmar_satc_unit *satcu;
4110 	struct acpi_dmar_atsr *atsr;
4111 	struct acpi_dmar_reserved_memory *rmrr;
4112 	struct acpi_dmar_satc *satc;
4113 
4114 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4115 		return 0;
4116 
4117 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4118 		rmrr = container_of(rmrru->hdr,
4119 				    struct acpi_dmar_reserved_memory, header);
4120 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4121 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4122 				((void *)rmrr) + rmrr->header.length,
4123 				rmrr->segment, rmrru->devices,
4124 				rmrru->devices_cnt);
4125 			if (ret < 0)
4126 				return ret;
4127 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4128 			dmar_remove_dev_scope(info, rmrr->segment,
4129 				rmrru->devices, rmrru->devices_cnt);
4130 		}
4131 	}
4132 
4133 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4134 		if (atsru->include_all)
4135 			continue;
4136 
4137 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4138 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4139 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4140 					(void *)atsr + atsr->header.length,
4141 					atsr->segment, atsru->devices,
4142 					atsru->devices_cnt);
4143 			if (ret > 0)
4144 				break;
4145 			else if (ret < 0)
4146 				return ret;
4147 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4148 			if (dmar_remove_dev_scope(info, atsr->segment,
4149 					atsru->devices, atsru->devices_cnt))
4150 				break;
4151 		}
4152 	}
4153 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4154 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4155 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4156 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4157 					(void *)satc + satc->header.length,
4158 					satc->segment, satcu->devices,
4159 					satcu->devices_cnt);
4160 			if (ret > 0)
4161 				break;
4162 			else if (ret < 0)
4163 				return ret;
4164 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4165 			if (dmar_remove_dev_scope(info, satc->segment,
4166 					satcu->devices, satcu->devices_cnt))
4167 				break;
4168 		}
4169 	}
4170 
4171 	return 0;
4172 }
4173 
4174 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4175 				       unsigned long val, void *v)
4176 {
4177 	struct memory_notify *mhp = v;
4178 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4179 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4180 			mhp->nr_pages - 1);
4181 
4182 	switch (val) {
4183 	case MEM_GOING_ONLINE:
4184 		if (iommu_domain_identity_map(si_domain,
4185 					      start_vpfn, last_vpfn)) {
4186 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4187 				start_vpfn, last_vpfn);
4188 			return NOTIFY_BAD;
4189 		}
4190 		break;
4191 
4192 	case MEM_OFFLINE:
4193 	case MEM_CANCEL_ONLINE:
4194 		{
4195 			struct dmar_drhd_unit *drhd;
4196 			struct intel_iommu *iommu;
4197 			struct page *freelist;
4198 
4199 			freelist = domain_unmap(si_domain,
4200 						start_vpfn, last_vpfn,
4201 						NULL);
4202 
4203 			rcu_read_lock();
4204 			for_each_active_iommu(iommu, drhd)
4205 				iommu_flush_iotlb_psi(iommu, si_domain,
4206 					start_vpfn, mhp->nr_pages,
4207 					!freelist, 0);
4208 			rcu_read_unlock();
4209 			dma_free_pagelist(freelist);
4210 		}
4211 		break;
4212 	}
4213 
4214 	return NOTIFY_OK;
4215 }
4216 
4217 static struct notifier_block intel_iommu_memory_nb = {
4218 	.notifier_call = intel_iommu_memory_notifier,
4219 	.priority = 0
4220 };
4221 
4222 static void intel_disable_iommus(void)
4223 {
4224 	struct intel_iommu *iommu = NULL;
4225 	struct dmar_drhd_unit *drhd;
4226 
4227 	for_each_iommu(iommu, drhd)
4228 		iommu_disable_translation(iommu);
4229 }
4230 
4231 void intel_iommu_shutdown(void)
4232 {
4233 	struct dmar_drhd_unit *drhd;
4234 	struct intel_iommu *iommu = NULL;
4235 
4236 	if (no_iommu || dmar_disabled)
4237 		return;
4238 
4239 	down_write(&dmar_global_lock);
4240 
4241 	/* Disable PMRs explicitly here. */
4242 	for_each_iommu(iommu, drhd)
4243 		iommu_disable_protect_mem_regions(iommu);
4244 
4245 	/* Make sure the IOMMUs are switched off */
4246 	intel_disable_iommus();
4247 
4248 	up_write(&dmar_global_lock);
4249 }
4250 
4251 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4252 {
4253 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4254 
4255 	return container_of(iommu_dev, struct intel_iommu, iommu);
4256 }
4257 
4258 static ssize_t version_show(struct device *dev,
4259 			    struct device_attribute *attr, char *buf)
4260 {
4261 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4262 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4263 	return sprintf(buf, "%d:%d\n",
4264 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4265 }
4266 static DEVICE_ATTR_RO(version);
4267 
4268 static ssize_t address_show(struct device *dev,
4269 			    struct device_attribute *attr, char *buf)
4270 {
4271 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4272 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4273 }
4274 static DEVICE_ATTR_RO(address);
4275 
4276 static ssize_t cap_show(struct device *dev,
4277 			struct device_attribute *attr, char *buf)
4278 {
4279 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4280 	return sprintf(buf, "%llx\n", iommu->cap);
4281 }
4282 static DEVICE_ATTR_RO(cap);
4283 
4284 static ssize_t ecap_show(struct device *dev,
4285 			 struct device_attribute *attr, char *buf)
4286 {
4287 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4288 	return sprintf(buf, "%llx\n", iommu->ecap);
4289 }
4290 static DEVICE_ATTR_RO(ecap);
4291 
4292 static ssize_t domains_supported_show(struct device *dev,
4293 				      struct device_attribute *attr, char *buf)
4294 {
4295 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4296 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4297 }
4298 static DEVICE_ATTR_RO(domains_supported);
4299 
4300 static ssize_t domains_used_show(struct device *dev,
4301 				 struct device_attribute *attr, char *buf)
4302 {
4303 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4304 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4305 						  cap_ndoms(iommu->cap)));
4306 }
4307 static DEVICE_ATTR_RO(domains_used);
4308 
4309 static struct attribute *intel_iommu_attrs[] = {
4310 	&dev_attr_version.attr,
4311 	&dev_attr_address.attr,
4312 	&dev_attr_cap.attr,
4313 	&dev_attr_ecap.attr,
4314 	&dev_attr_domains_supported.attr,
4315 	&dev_attr_domains_used.attr,
4316 	NULL,
4317 };
4318 
4319 static struct attribute_group intel_iommu_group = {
4320 	.name = "intel-iommu",
4321 	.attrs = intel_iommu_attrs,
4322 };
4323 
4324 const struct attribute_group *intel_iommu_groups[] = {
4325 	&intel_iommu_group,
4326 	NULL,
4327 };
4328 
4329 static inline bool has_external_pci(void)
4330 {
4331 	struct pci_dev *pdev = NULL;
4332 
4333 	for_each_pci_dev(pdev)
4334 		if (pdev->external_facing)
4335 			return true;
4336 
4337 	return false;
4338 }
4339 
4340 static int __init platform_optin_force_iommu(void)
4341 {
4342 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4343 		return 0;
4344 
4345 	if (no_iommu || dmar_disabled)
4346 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4347 
4348 	/*
4349 	 * If Intel-IOMMU is disabled by default, we will apply identity
4350 	 * map for all devices except those marked as being untrusted.
4351 	 */
4352 	if (dmar_disabled)
4353 		iommu_set_default_passthrough(false);
4354 
4355 	dmar_disabled = 0;
4356 	no_iommu = 0;
4357 
4358 	return 1;
4359 }
4360 
4361 static int __init probe_acpi_namespace_devices(void)
4362 {
4363 	struct dmar_drhd_unit *drhd;
4364 	/* To avoid a -Wunused-but-set-variable warning. */
4365 	struct intel_iommu *iommu __maybe_unused;
4366 	struct device *dev;
4367 	int i, ret = 0;
4368 
4369 	for_each_active_iommu(iommu, drhd) {
4370 		for_each_active_dev_scope(drhd->devices,
4371 					  drhd->devices_cnt, i, dev) {
4372 			struct acpi_device_physical_node *pn;
4373 			struct iommu_group *group;
4374 			struct acpi_device *adev;
4375 
4376 			if (dev->bus != &acpi_bus_type)
4377 				continue;
4378 
4379 			adev = to_acpi_device(dev);
4380 			mutex_lock(&adev->physical_node_lock);
4381 			list_for_each_entry(pn,
4382 					    &adev->physical_node_list, node) {
4383 				group = iommu_group_get(pn->dev);
4384 				if (group) {
4385 					iommu_group_put(group);
4386 					continue;
4387 				}
4388 
4389 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4390 				ret = iommu_probe_device(pn->dev);
4391 				if (ret)
4392 					break;
4393 			}
4394 			mutex_unlock(&adev->physical_node_lock);
4395 
4396 			if (ret)
4397 				return ret;
4398 		}
4399 	}
4400 
4401 	return 0;
4402 }
4403 
4404 int __init intel_iommu_init(void)
4405 {
4406 	int ret = -ENODEV;
4407 	struct dmar_drhd_unit *drhd;
4408 	struct intel_iommu *iommu;
4409 
4410 	/*
4411 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4412 	 * opt in, so enforce that.
4413 	 */
4414 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4415 		    platform_optin_force_iommu();
4416 
4417 	if (iommu_init_mempool()) {
4418 		if (force_on)
4419 			panic("tboot: Failed to initialize iommu memory\n");
4420 		return -ENOMEM;
4421 	}
4422 
4423 	down_write(&dmar_global_lock);
4424 	if (dmar_table_init()) {
4425 		if (force_on)
4426 			panic("tboot: Failed to initialize DMAR table\n");
4427 		goto out_free_dmar;
4428 	}
4429 
4430 	if (dmar_dev_scope_init() < 0) {
4431 		if (force_on)
4432 			panic("tboot: Failed to initialize DMAR device scope\n");
4433 		goto out_free_dmar;
4434 	}
4435 
4436 	up_write(&dmar_global_lock);
4437 
4438 	/*
4439 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4440 	 * complain later when we register it under the lock.
4441 	 */
4442 	dmar_register_bus_notifier();
4443 
4444 	down_write(&dmar_global_lock);
4445 
4446 	if (!no_iommu)
4447 		intel_iommu_debugfs_init();
4448 
4449 	if (no_iommu || dmar_disabled) {
4450 		/*
4451 		 * We exit the function here to ensure IOMMU's remapping and
4452 		 * mempool aren't setup, which means that the IOMMU's PMRs
4453 		 * won't be disabled via the call to init_dmars(). So disable
4454 		 * it explicitly here. The PMRs were setup by tboot prior to
4455 		 * calling SENTER, but the kernel is expected to reset/tear
4456 		 * down the PMRs.
4457 		 */
4458 		if (intel_iommu_tboot_noforce) {
4459 			for_each_iommu(iommu, drhd)
4460 				iommu_disable_protect_mem_regions(iommu);
4461 		}
4462 
4463 		/*
4464 		 * Make sure the IOMMUs are switched off, even when we
4465 		 * boot into a kexec kernel and the previous kernel left
4466 		 * them enabled
4467 		 */
4468 		intel_disable_iommus();
4469 		goto out_free_dmar;
4470 	}
4471 
4472 	if (list_empty(&dmar_rmrr_units))
4473 		pr_info("No RMRR found\n");
4474 
4475 	if (list_empty(&dmar_atsr_units))
4476 		pr_info("No ATSR found\n");
4477 
4478 	if (list_empty(&dmar_satc_units))
4479 		pr_info("No SATC found\n");
4480 
4481 	if (dmar_map_gfx)
4482 		intel_iommu_gfx_mapped = 1;
4483 
4484 	init_no_remapping_devices();
4485 
4486 	ret = init_dmars();
4487 	if (ret) {
4488 		if (force_on)
4489 			panic("tboot: Failed to initialize DMARs\n");
4490 		pr_err("Initialization failed\n");
4491 		goto out_free_dmar;
4492 	}
4493 	up_write(&dmar_global_lock);
4494 
4495 	init_iommu_pm_ops();
4496 
4497 	down_read(&dmar_global_lock);
4498 	for_each_active_iommu(iommu, drhd) {
4499 		/*
4500 		 * The flush queue implementation does not perform
4501 		 * page-selective invalidations that are required for efficient
4502 		 * TLB flushes in virtual environments.  The benefit of batching
4503 		 * is likely to be much lower than the overhead of synchronizing
4504 		 * the virtual and physical IOMMU page-tables.
4505 		 */
4506 		if (cap_caching_mode(iommu->cap)) {
4507 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4508 			iommu_set_dma_strict();
4509 		}
4510 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4511 				       intel_iommu_groups,
4512 				       "%s", iommu->name);
4513 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4514 	}
4515 	up_read(&dmar_global_lock);
4516 
4517 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4518 	if (si_domain && !hw_pass_through)
4519 		register_memory_notifier(&intel_iommu_memory_nb);
4520 
4521 	down_read(&dmar_global_lock);
4522 	if (probe_acpi_namespace_devices())
4523 		pr_warn("ACPI name space devices didn't probe correctly\n");
4524 
4525 	/* Finally, we enable the DMA remapping hardware. */
4526 	for_each_iommu(iommu, drhd) {
4527 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4528 			iommu_enable_translation(iommu);
4529 
4530 		iommu_disable_protect_mem_regions(iommu);
4531 	}
4532 	up_read(&dmar_global_lock);
4533 
4534 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4535 
4536 	intel_iommu_enabled = 1;
4537 
4538 	return 0;
4539 
4540 out_free_dmar:
4541 	intel_iommu_free_dmars();
4542 	up_write(&dmar_global_lock);
4543 	iommu_exit_mempool();
4544 	return ret;
4545 }
4546 
4547 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4548 {
4549 	struct device_domain_info *info = opaque;
4550 
4551 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4552 	return 0;
4553 }
4554 
4555 /*
4556  * NB - intel-iommu lacks any sort of reference counting for the users of
4557  * dependent devices.  If multiple endpoints have intersecting dependent
4558  * devices, unbinding the driver from any one of them will possibly leave
4559  * the others unable to operate.
4560  */
4561 static void domain_context_clear(struct device_domain_info *info)
4562 {
4563 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4564 		return;
4565 
4566 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4567 			       &domain_context_clear_one_cb, info);
4568 }
4569 
4570 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4571 {
4572 	struct dmar_domain *domain;
4573 	struct intel_iommu *iommu;
4574 	unsigned long flags;
4575 
4576 	assert_spin_locked(&device_domain_lock);
4577 
4578 	if (WARN_ON(!info))
4579 		return;
4580 
4581 	iommu = info->iommu;
4582 	domain = info->domain;
4583 
4584 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4585 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4586 			intel_pasid_tear_down_entry(iommu, info->dev,
4587 					PASID_RID2PASID, false);
4588 
4589 		iommu_disable_dev_iotlb(info);
4590 		domain_context_clear(info);
4591 		intel_pasid_free_table(info->dev);
4592 	}
4593 
4594 	unlink_domain_info(info);
4595 
4596 	spin_lock_irqsave(&iommu->lock, flags);
4597 	domain_detach_iommu(domain, iommu);
4598 	spin_unlock_irqrestore(&iommu->lock, flags);
4599 
4600 	free_devinfo_mem(info);
4601 }
4602 
4603 static void dmar_remove_one_dev_info(struct device *dev)
4604 {
4605 	struct device_domain_info *info;
4606 	unsigned long flags;
4607 
4608 	spin_lock_irqsave(&device_domain_lock, flags);
4609 	info = get_domain_info(dev);
4610 	if (info)
4611 		__dmar_remove_one_dev_info(info);
4612 	spin_unlock_irqrestore(&device_domain_lock, flags);
4613 }
4614 
4615 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4616 {
4617 	int adjust_width;
4618 
4619 	/* calculate AGAW */
4620 	domain->gaw = guest_width;
4621 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4622 	domain->agaw = width_to_agaw(adjust_width);
4623 
4624 	domain->iommu_coherency = false;
4625 	domain->iommu_snooping = false;
4626 	domain->iommu_superpage = 0;
4627 	domain->max_addr = 0;
4628 
4629 	/* always allocate the top pgd */
4630 	domain->pgd = alloc_pgtable_page(domain->nid);
4631 	if (!domain->pgd)
4632 		return -ENOMEM;
4633 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4634 	return 0;
4635 }
4636 
4637 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4638 {
4639 	struct dmar_domain *dmar_domain;
4640 	struct iommu_domain *domain;
4641 
4642 	switch (type) {
4643 	case IOMMU_DOMAIN_DMA:
4644 	case IOMMU_DOMAIN_DMA_FQ:
4645 	case IOMMU_DOMAIN_UNMANAGED:
4646 		dmar_domain = alloc_domain(type);
4647 		if (!dmar_domain) {
4648 			pr_err("Can't allocate dmar_domain\n");
4649 			return NULL;
4650 		}
4651 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4652 			pr_err("Domain initialization failed\n");
4653 			domain_exit(dmar_domain);
4654 			return NULL;
4655 		}
4656 
4657 		domain = &dmar_domain->domain;
4658 		domain->geometry.aperture_start = 0;
4659 		domain->geometry.aperture_end   =
4660 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4661 		domain->geometry.force_aperture = true;
4662 
4663 		return domain;
4664 	case IOMMU_DOMAIN_IDENTITY:
4665 		return &si_domain->domain;
4666 	default:
4667 		return NULL;
4668 	}
4669 
4670 	return NULL;
4671 }
4672 
4673 static void intel_iommu_domain_free(struct iommu_domain *domain)
4674 {
4675 	if (domain != &si_domain->domain)
4676 		domain_exit(to_dmar_domain(domain));
4677 }
4678 
4679 /*
4680  * Check whether a @domain could be attached to the @dev through the
4681  * aux-domain attach/detach APIs.
4682  */
4683 static inline bool
4684 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4685 {
4686 	struct device_domain_info *info = get_domain_info(dev);
4687 
4688 	return info && info->auxd_enabled &&
4689 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4690 }
4691 
4692 static inline struct subdev_domain_info *
4693 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4694 {
4695 	struct subdev_domain_info *sinfo;
4696 
4697 	if (!list_empty(&domain->subdevices)) {
4698 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4699 			if (sinfo->pdev == dev)
4700 				return sinfo;
4701 		}
4702 	}
4703 
4704 	return NULL;
4705 }
4706 
4707 static int auxiliary_link_device(struct dmar_domain *domain,
4708 				 struct device *dev)
4709 {
4710 	struct device_domain_info *info = get_domain_info(dev);
4711 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4712 
4713 	assert_spin_locked(&device_domain_lock);
4714 	if (WARN_ON(!info))
4715 		return -EINVAL;
4716 
4717 	if (!sinfo) {
4718 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4719 		if (!sinfo)
4720 			return -ENOMEM;
4721 		sinfo->domain = domain;
4722 		sinfo->pdev = dev;
4723 		list_add(&sinfo->link_phys, &info->subdevices);
4724 		list_add(&sinfo->link_domain, &domain->subdevices);
4725 	}
4726 
4727 	return ++sinfo->users;
4728 }
4729 
4730 static int auxiliary_unlink_device(struct dmar_domain *domain,
4731 				   struct device *dev)
4732 {
4733 	struct device_domain_info *info = get_domain_info(dev);
4734 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4735 	int ret;
4736 
4737 	assert_spin_locked(&device_domain_lock);
4738 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4739 		return -EINVAL;
4740 
4741 	ret = --sinfo->users;
4742 	if (!ret) {
4743 		list_del(&sinfo->link_phys);
4744 		list_del(&sinfo->link_domain);
4745 		kfree(sinfo);
4746 	}
4747 
4748 	return ret;
4749 }
4750 
4751 static int aux_domain_add_dev(struct dmar_domain *domain,
4752 			      struct device *dev)
4753 {
4754 	int ret;
4755 	unsigned long flags;
4756 	struct intel_iommu *iommu;
4757 
4758 	iommu = device_to_iommu(dev, NULL, NULL);
4759 	if (!iommu)
4760 		return -ENODEV;
4761 
4762 	if (domain->default_pasid <= 0) {
4763 		u32 pasid;
4764 
4765 		/* No private data needed for the default pasid */
4766 		pasid = ioasid_alloc(NULL, PASID_MIN,
4767 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4768 				     NULL);
4769 		if (pasid == INVALID_IOASID) {
4770 			pr_err("Can't allocate default pasid\n");
4771 			return -ENODEV;
4772 		}
4773 		domain->default_pasid = pasid;
4774 	}
4775 
4776 	spin_lock_irqsave(&device_domain_lock, flags);
4777 	ret = auxiliary_link_device(domain, dev);
4778 	if (ret <= 0)
4779 		goto link_failed;
4780 
4781 	/*
4782 	 * Subdevices from the same physical device can be attached to the
4783 	 * same domain. For such cases, only the first subdevice attachment
4784 	 * needs to go through the full steps in this function. So if ret >
4785 	 * 1, just goto out.
4786 	 */
4787 	if (ret > 1)
4788 		goto out;
4789 
4790 	/*
4791 	 * iommu->lock must be held to attach domain to iommu and setup the
4792 	 * pasid entry for second level translation.
4793 	 */
4794 	spin_lock(&iommu->lock);
4795 	ret = domain_attach_iommu(domain, iommu);
4796 	if (ret)
4797 		goto attach_failed;
4798 
4799 	/* Setup the PASID entry for mediated devices: */
4800 	if (domain_use_first_level(domain))
4801 		ret = domain_setup_first_level(iommu, domain, dev,
4802 					       domain->default_pasid);
4803 	else
4804 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4805 						     domain->default_pasid);
4806 	if (ret)
4807 		goto table_failed;
4808 
4809 	spin_unlock(&iommu->lock);
4810 out:
4811 	spin_unlock_irqrestore(&device_domain_lock, flags);
4812 
4813 	return 0;
4814 
4815 table_failed:
4816 	domain_detach_iommu(domain, iommu);
4817 attach_failed:
4818 	spin_unlock(&iommu->lock);
4819 	auxiliary_unlink_device(domain, dev);
4820 link_failed:
4821 	spin_unlock_irqrestore(&device_domain_lock, flags);
4822 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4823 		ioasid_put(domain->default_pasid);
4824 
4825 	return ret;
4826 }
4827 
4828 static void aux_domain_remove_dev(struct dmar_domain *domain,
4829 				  struct device *dev)
4830 {
4831 	struct device_domain_info *info;
4832 	struct intel_iommu *iommu;
4833 	unsigned long flags;
4834 
4835 	if (!is_aux_domain(dev, &domain->domain))
4836 		return;
4837 
4838 	spin_lock_irqsave(&device_domain_lock, flags);
4839 	info = get_domain_info(dev);
4840 	iommu = info->iommu;
4841 
4842 	if (!auxiliary_unlink_device(domain, dev)) {
4843 		spin_lock(&iommu->lock);
4844 		intel_pasid_tear_down_entry(iommu, dev,
4845 					    domain->default_pasid, false);
4846 		domain_detach_iommu(domain, iommu);
4847 		spin_unlock(&iommu->lock);
4848 	}
4849 
4850 	spin_unlock_irqrestore(&device_domain_lock, flags);
4851 
4852 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4853 		ioasid_put(domain->default_pasid);
4854 }
4855 
4856 static int prepare_domain_attach_device(struct iommu_domain *domain,
4857 					struct device *dev)
4858 {
4859 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4860 	struct intel_iommu *iommu;
4861 	int addr_width;
4862 
4863 	iommu = device_to_iommu(dev, NULL, NULL);
4864 	if (!iommu)
4865 		return -ENODEV;
4866 
4867 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4868 	    !ecap_nest(iommu->ecap)) {
4869 		dev_err(dev, "%s: iommu not support nested translation\n",
4870 			iommu->name);
4871 		return -EINVAL;
4872 	}
4873 
4874 	/* check if this iommu agaw is sufficient for max mapped address */
4875 	addr_width = agaw_to_width(iommu->agaw);
4876 	if (addr_width > cap_mgaw(iommu->cap))
4877 		addr_width = cap_mgaw(iommu->cap);
4878 
4879 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4880 		dev_err(dev, "%s: iommu width (%d) is not "
4881 		        "sufficient for the mapped address (%llx)\n",
4882 		        __func__, addr_width, dmar_domain->max_addr);
4883 		return -EFAULT;
4884 	}
4885 	dmar_domain->gaw = addr_width;
4886 
4887 	/*
4888 	 * Knock out extra levels of page tables if necessary
4889 	 */
4890 	while (iommu->agaw < dmar_domain->agaw) {
4891 		struct dma_pte *pte;
4892 
4893 		pte = dmar_domain->pgd;
4894 		if (dma_pte_present(pte)) {
4895 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4896 			free_pgtable_page(pte);
4897 		}
4898 		dmar_domain->agaw--;
4899 	}
4900 
4901 	return 0;
4902 }
4903 
4904 static int intel_iommu_attach_device(struct iommu_domain *domain,
4905 				     struct device *dev)
4906 {
4907 	int ret;
4908 
4909 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4910 	    device_is_rmrr_locked(dev)) {
4911 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4912 		return -EPERM;
4913 	}
4914 
4915 	if (is_aux_domain(dev, domain))
4916 		return -EPERM;
4917 
4918 	/* normally dev is not mapped */
4919 	if (unlikely(domain_context_mapped(dev))) {
4920 		struct dmar_domain *old_domain;
4921 
4922 		old_domain = find_domain(dev);
4923 		if (old_domain)
4924 			dmar_remove_one_dev_info(dev);
4925 	}
4926 
4927 	ret = prepare_domain_attach_device(domain, dev);
4928 	if (ret)
4929 		return ret;
4930 
4931 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4932 }
4933 
4934 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4935 					 struct device *dev)
4936 {
4937 	int ret;
4938 
4939 	if (!is_aux_domain(dev, domain))
4940 		return -EPERM;
4941 
4942 	ret = prepare_domain_attach_device(domain, dev);
4943 	if (ret)
4944 		return ret;
4945 
4946 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4947 }
4948 
4949 static void intel_iommu_detach_device(struct iommu_domain *domain,
4950 				      struct device *dev)
4951 {
4952 	dmar_remove_one_dev_info(dev);
4953 }
4954 
4955 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4956 					  struct device *dev)
4957 {
4958 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4959 }
4960 
4961 #ifdef CONFIG_INTEL_IOMMU_SVM
4962 /*
4963  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4964  * VT-d granularity. Invalidation is typically included in the unmap operation
4965  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4966  * owns the first level page tables. Invalidations of translation caches in the
4967  * guest are trapped and passed down to the host.
4968  *
4969  * vIOMMU in the guest will only expose first level page tables, therefore
4970  * we do not support IOTLB granularity for request without PASID (second level).
4971  *
4972  * For example, to find the VT-d granularity encoding for IOTLB
4973  * type and page selective granularity within PASID:
4974  * X: indexed by iommu cache type
4975  * Y: indexed by enum iommu_inv_granularity
4976  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4977  */
4978 
4979 static const int
4980 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4981 	/*
4982 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4983 	 * page selective (address granularity)
4984 	 */
4985 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4986 	/* PASID based dev TLBs */
4987 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4988 	/* PASID cache */
4989 	{-EINVAL, -EINVAL, -EINVAL}
4990 };
4991 
4992 static inline int to_vtd_granularity(int type, int granu)
4993 {
4994 	return inv_type_granu_table[type][granu];
4995 }
4996 
4997 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4998 {
4999 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5000 
5001 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5002 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
5003 	 * granu size in contiguous memory.
5004 	 */
5005 	return order_base_2(nr_pages);
5006 }
5007 
5008 static int
5009 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5010 			   struct iommu_cache_invalidate_info *inv_info)
5011 {
5012 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5013 	struct device_domain_info *info;
5014 	struct intel_iommu *iommu;
5015 	unsigned long flags;
5016 	int cache_type;
5017 	u8 bus, devfn;
5018 	u16 did, sid;
5019 	int ret = 0;
5020 	u64 size = 0;
5021 
5022 	if (!inv_info || !dmar_domain)
5023 		return -EINVAL;
5024 
5025 	if (!dev || !dev_is_pci(dev))
5026 		return -ENODEV;
5027 
5028 	iommu = device_to_iommu(dev, &bus, &devfn);
5029 	if (!iommu)
5030 		return -ENODEV;
5031 
5032 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5033 		return -EINVAL;
5034 
5035 	spin_lock_irqsave(&device_domain_lock, flags);
5036 	spin_lock(&iommu->lock);
5037 	info = get_domain_info(dev);
5038 	if (!info) {
5039 		ret = -EINVAL;
5040 		goto out_unlock;
5041 	}
5042 	did = dmar_domain->iommu_did[iommu->seq_id];
5043 	sid = PCI_DEVID(bus, devfn);
5044 
5045 	/* Size is only valid in address selective invalidation */
5046 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5047 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5048 				   inv_info->granu.addr_info.nb_granules);
5049 
5050 	for_each_set_bit(cache_type,
5051 			 (unsigned long *)&inv_info->cache,
5052 			 IOMMU_CACHE_INV_TYPE_NR) {
5053 		int granu = 0;
5054 		u64 pasid = 0;
5055 		u64 addr = 0;
5056 
5057 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5058 		if (granu == -EINVAL) {
5059 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5060 					   cache_type, inv_info->granularity);
5061 			break;
5062 		}
5063 
5064 		/*
5065 		 * PASID is stored in different locations based on the
5066 		 * granularity.
5067 		 */
5068 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5069 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5070 			pasid = inv_info->granu.pasid_info.pasid;
5071 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5072 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5073 			pasid = inv_info->granu.addr_info.pasid;
5074 
5075 		switch (BIT(cache_type)) {
5076 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5077 			/* HW will ignore LSB bits based on address mask */
5078 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5079 			    size &&
5080 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5081 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5082 						   inv_info->granu.addr_info.addr, size);
5083 			}
5084 
5085 			/*
5086 			 * If granu is PASID-selective, address is ignored.
5087 			 * We use npages = -1 to indicate that.
5088 			 */
5089 			qi_flush_piotlb(iommu, did, pasid,
5090 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5091 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5092 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5093 
5094 			if (!info->ats_enabled)
5095 				break;
5096 			/*
5097 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5098 			 * in the guest may assume IOTLB flush is inclusive,
5099 			 * which is more efficient.
5100 			 */
5101 			fallthrough;
5102 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5103 			/*
5104 			 * PASID based device TLB invalidation does not support
5105 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5106 			 * IOMMU_INV_GRANU_ADDR.
5107 			 * The equivalent of that is we set the size to be the
5108 			 * entire range of 64 bit. User only provides PASID info
5109 			 * without address info. So we set addr to 0.
5110 			 */
5111 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5112 				size = 64 - VTD_PAGE_SHIFT;
5113 				addr = 0;
5114 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5115 				addr = inv_info->granu.addr_info.addr;
5116 			}
5117 
5118 			if (info->ats_enabled)
5119 				qi_flush_dev_iotlb_pasid(iommu, sid,
5120 						info->pfsid, pasid,
5121 						info->ats_qdep, addr,
5122 						size);
5123 			else
5124 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5125 			break;
5126 		default:
5127 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5128 					    cache_type);
5129 			ret = -EINVAL;
5130 		}
5131 	}
5132 out_unlock:
5133 	spin_unlock(&iommu->lock);
5134 	spin_unlock_irqrestore(&device_domain_lock, flags);
5135 
5136 	return ret;
5137 }
5138 #endif
5139 
5140 static int intel_iommu_map(struct iommu_domain *domain,
5141 			   unsigned long iova, phys_addr_t hpa,
5142 			   size_t size, int iommu_prot, gfp_t gfp)
5143 {
5144 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5145 	u64 max_addr;
5146 	int prot = 0;
5147 
5148 	if (iommu_prot & IOMMU_READ)
5149 		prot |= DMA_PTE_READ;
5150 	if (iommu_prot & IOMMU_WRITE)
5151 		prot |= DMA_PTE_WRITE;
5152 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5153 		prot |= DMA_PTE_SNP;
5154 
5155 	max_addr = iova + size;
5156 	if (dmar_domain->max_addr < max_addr) {
5157 		u64 end;
5158 
5159 		/* check if minimum agaw is sufficient for mapped address */
5160 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5161 		if (end < max_addr) {
5162 			pr_err("%s: iommu width (%d) is not "
5163 			       "sufficient for the mapped address (%llx)\n",
5164 			       __func__, dmar_domain->gaw, max_addr);
5165 			return -EFAULT;
5166 		}
5167 		dmar_domain->max_addr = max_addr;
5168 	}
5169 	/* Round up size to next multiple of PAGE_SIZE, if it and
5170 	   the low bits of hpa would take us onto the next page */
5171 	size = aligned_nrpages(hpa, size);
5172 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5173 				hpa >> VTD_PAGE_SHIFT, size, prot);
5174 }
5175 
5176 static int intel_iommu_map_pages(struct iommu_domain *domain,
5177 				 unsigned long iova, phys_addr_t paddr,
5178 				 size_t pgsize, size_t pgcount,
5179 				 int prot, gfp_t gfp, size_t *mapped)
5180 {
5181 	unsigned long pgshift = __ffs(pgsize);
5182 	size_t size = pgcount << pgshift;
5183 	int ret;
5184 
5185 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5186 		return -EINVAL;
5187 
5188 	if (!IS_ALIGNED(iova | paddr, pgsize))
5189 		return -EINVAL;
5190 
5191 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5192 	if (!ret && mapped)
5193 		*mapped = size;
5194 
5195 	return ret;
5196 }
5197 
5198 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5199 				unsigned long iova, size_t size,
5200 				struct iommu_iotlb_gather *gather)
5201 {
5202 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5203 	unsigned long start_pfn, last_pfn;
5204 	int level = 0;
5205 
5206 	/* Cope with horrid API which requires us to unmap more than the
5207 	   size argument if it happens to be a large-page mapping. */
5208 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5209 
5210 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5211 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5212 
5213 	start_pfn = iova >> VTD_PAGE_SHIFT;
5214 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5215 
5216 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5217 					last_pfn, gather->freelist);
5218 
5219 	if (dmar_domain->max_addr == iova + size)
5220 		dmar_domain->max_addr = iova;
5221 
5222 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5223 
5224 	return size;
5225 }
5226 
5227 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5228 				      unsigned long iova,
5229 				      size_t pgsize, size_t pgcount,
5230 				      struct iommu_iotlb_gather *gather)
5231 {
5232 	unsigned long pgshift = __ffs(pgsize);
5233 	size_t size = pgcount << pgshift;
5234 
5235 	return intel_iommu_unmap(domain, iova, size, gather);
5236 }
5237 
5238 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5239 				 struct iommu_iotlb_gather *gather)
5240 {
5241 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5242 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5243 	size_t size = gather->end - gather->start;
5244 	unsigned long start_pfn;
5245 	unsigned long nrpages;
5246 	int iommu_id;
5247 
5248 	nrpages = aligned_nrpages(gather->start, size);
5249 	start_pfn = mm_to_dma_pfn(iova_pfn);
5250 
5251 	for_each_domain_iommu(iommu_id, dmar_domain)
5252 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5253 				      start_pfn, nrpages, !gather->freelist, 0);
5254 
5255 	dma_free_pagelist(gather->freelist);
5256 }
5257 
5258 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5259 					    dma_addr_t iova)
5260 {
5261 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5262 	struct dma_pte *pte;
5263 	int level = 0;
5264 	u64 phys = 0;
5265 
5266 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5267 	if (pte && dma_pte_present(pte))
5268 		phys = dma_pte_addr(pte) +
5269 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5270 						VTD_PAGE_SHIFT) - 1));
5271 
5272 	return phys;
5273 }
5274 
5275 static bool intel_iommu_capable(enum iommu_cap cap)
5276 {
5277 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5278 		return domain_update_iommu_snooping(NULL);
5279 	if (cap == IOMMU_CAP_INTR_REMAP)
5280 		return irq_remapping_enabled == 1;
5281 
5282 	return false;
5283 }
5284 
5285 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5286 {
5287 	struct intel_iommu *iommu;
5288 
5289 	iommu = device_to_iommu(dev, NULL, NULL);
5290 	if (!iommu)
5291 		return ERR_PTR(-ENODEV);
5292 
5293 	if (translation_pre_enabled(iommu))
5294 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5295 
5296 	return &iommu->iommu;
5297 }
5298 
5299 static void intel_iommu_release_device(struct device *dev)
5300 {
5301 	struct intel_iommu *iommu;
5302 
5303 	iommu = device_to_iommu(dev, NULL, NULL);
5304 	if (!iommu)
5305 		return;
5306 
5307 	dmar_remove_one_dev_info(dev);
5308 
5309 	set_dma_ops(dev, NULL);
5310 }
5311 
5312 static void intel_iommu_probe_finalize(struct device *dev)
5313 {
5314 	set_dma_ops(dev, NULL);
5315 	iommu_setup_dma_ops(dev, 0, U64_MAX);
5316 }
5317 
5318 static void intel_iommu_get_resv_regions(struct device *device,
5319 					 struct list_head *head)
5320 {
5321 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5322 	struct iommu_resv_region *reg;
5323 	struct dmar_rmrr_unit *rmrr;
5324 	struct device *i_dev;
5325 	int i;
5326 
5327 	down_read(&dmar_global_lock);
5328 	for_each_rmrr_units(rmrr) {
5329 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5330 					  i, i_dev) {
5331 			struct iommu_resv_region *resv;
5332 			enum iommu_resv_type type;
5333 			size_t length;
5334 
5335 			if (i_dev != device &&
5336 			    !is_downstream_to_pci_bridge(device, i_dev))
5337 				continue;
5338 
5339 			length = rmrr->end_address - rmrr->base_address + 1;
5340 
5341 			type = device_rmrr_is_relaxable(device) ?
5342 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5343 
5344 			resv = iommu_alloc_resv_region(rmrr->base_address,
5345 						       length, prot, type);
5346 			if (!resv)
5347 				break;
5348 
5349 			list_add_tail(&resv->list, head);
5350 		}
5351 	}
5352 	up_read(&dmar_global_lock);
5353 
5354 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5355 	if (dev_is_pci(device)) {
5356 		struct pci_dev *pdev = to_pci_dev(device);
5357 
5358 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5359 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5360 						   IOMMU_RESV_DIRECT_RELAXABLE);
5361 			if (reg)
5362 				list_add_tail(&reg->list, head);
5363 		}
5364 	}
5365 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5366 
5367 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5368 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5369 				      0, IOMMU_RESV_MSI);
5370 	if (!reg)
5371 		return;
5372 	list_add_tail(&reg->list, head);
5373 }
5374 
5375 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5376 {
5377 	struct device_domain_info *info;
5378 	struct context_entry *context;
5379 	struct dmar_domain *domain;
5380 	unsigned long flags;
5381 	u64 ctx_lo;
5382 	int ret;
5383 
5384 	domain = find_domain(dev);
5385 	if (!domain)
5386 		return -EINVAL;
5387 
5388 	spin_lock_irqsave(&device_domain_lock, flags);
5389 	spin_lock(&iommu->lock);
5390 
5391 	ret = -EINVAL;
5392 	info = get_domain_info(dev);
5393 	if (!info || !info->pasid_supported)
5394 		goto out;
5395 
5396 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5397 	if (WARN_ON(!context))
5398 		goto out;
5399 
5400 	ctx_lo = context[0].lo;
5401 
5402 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5403 		ctx_lo |= CONTEXT_PASIDE;
5404 		context[0].lo = ctx_lo;
5405 		wmb();
5406 		iommu->flush.flush_context(iommu,
5407 					   domain->iommu_did[iommu->seq_id],
5408 					   PCI_DEVID(info->bus, info->devfn),
5409 					   DMA_CCMD_MASK_NOBIT,
5410 					   DMA_CCMD_DEVICE_INVL);
5411 	}
5412 
5413 	/* Enable PASID support in the device, if it wasn't already */
5414 	if (!info->pasid_enabled)
5415 		iommu_enable_dev_iotlb(info);
5416 
5417 	ret = 0;
5418 
5419  out:
5420 	spin_unlock(&iommu->lock);
5421 	spin_unlock_irqrestore(&device_domain_lock, flags);
5422 
5423 	return ret;
5424 }
5425 
5426 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5427 {
5428 	if (dev_is_pci(dev))
5429 		return pci_device_group(dev);
5430 	return generic_device_group(dev);
5431 }
5432 
5433 static int intel_iommu_enable_auxd(struct device *dev)
5434 {
5435 	struct device_domain_info *info;
5436 	struct intel_iommu *iommu;
5437 	unsigned long flags;
5438 	int ret;
5439 
5440 	iommu = device_to_iommu(dev, NULL, NULL);
5441 	if (!iommu || dmar_disabled)
5442 		return -EINVAL;
5443 
5444 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5445 		return -EINVAL;
5446 
5447 	ret = intel_iommu_enable_pasid(iommu, dev);
5448 	if (ret)
5449 		return -ENODEV;
5450 
5451 	spin_lock_irqsave(&device_domain_lock, flags);
5452 	info = get_domain_info(dev);
5453 	info->auxd_enabled = 1;
5454 	spin_unlock_irqrestore(&device_domain_lock, flags);
5455 
5456 	return 0;
5457 }
5458 
5459 static int intel_iommu_disable_auxd(struct device *dev)
5460 {
5461 	struct device_domain_info *info;
5462 	unsigned long flags;
5463 
5464 	spin_lock_irqsave(&device_domain_lock, flags);
5465 	info = get_domain_info(dev);
5466 	if (!WARN_ON(!info))
5467 		info->auxd_enabled = 0;
5468 	spin_unlock_irqrestore(&device_domain_lock, flags);
5469 
5470 	return 0;
5471 }
5472 
5473 static int intel_iommu_enable_sva(struct device *dev)
5474 {
5475 	struct device_domain_info *info = get_domain_info(dev);
5476 	struct intel_iommu *iommu;
5477 	int ret;
5478 
5479 	if (!info || dmar_disabled)
5480 		return -EINVAL;
5481 
5482 	iommu = info->iommu;
5483 	if (!iommu)
5484 		return -EINVAL;
5485 
5486 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5487 		return -ENODEV;
5488 
5489 	if (intel_iommu_enable_pasid(iommu, dev))
5490 		return -ENODEV;
5491 
5492 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5493 		return -EINVAL;
5494 
5495 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5496 	if (!ret)
5497 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5498 
5499 	return ret;
5500 }
5501 
5502 static int intel_iommu_disable_sva(struct device *dev)
5503 {
5504 	struct device_domain_info *info = get_domain_info(dev);
5505 	struct intel_iommu *iommu = info->iommu;
5506 	int ret;
5507 
5508 	ret = iommu_unregister_device_fault_handler(dev);
5509 	if (!ret)
5510 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5511 
5512 	return ret;
5513 }
5514 
5515 static int intel_iommu_enable_iopf(struct device *dev)
5516 {
5517 	struct device_domain_info *info = get_domain_info(dev);
5518 
5519 	if (info && info->pri_supported)
5520 		return 0;
5521 
5522 	return -ENODEV;
5523 }
5524 
5525 static int
5526 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5527 {
5528 	switch (feat) {
5529 	case IOMMU_DEV_FEAT_AUX:
5530 		return intel_iommu_enable_auxd(dev);
5531 
5532 	case IOMMU_DEV_FEAT_IOPF:
5533 		return intel_iommu_enable_iopf(dev);
5534 
5535 	case IOMMU_DEV_FEAT_SVA:
5536 		return intel_iommu_enable_sva(dev);
5537 
5538 	default:
5539 		return -ENODEV;
5540 	}
5541 }
5542 
5543 static int
5544 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5545 {
5546 	switch (feat) {
5547 	case IOMMU_DEV_FEAT_AUX:
5548 		return intel_iommu_disable_auxd(dev);
5549 
5550 	case IOMMU_DEV_FEAT_IOPF:
5551 		return 0;
5552 
5553 	case IOMMU_DEV_FEAT_SVA:
5554 		return intel_iommu_disable_sva(dev);
5555 
5556 	default:
5557 		return -ENODEV;
5558 	}
5559 }
5560 
5561 static bool
5562 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5563 {
5564 	struct device_domain_info *info = get_domain_info(dev);
5565 
5566 	if (feat == IOMMU_DEV_FEAT_AUX)
5567 		return scalable_mode_support() && info && info->auxd_enabled;
5568 
5569 	return false;
5570 }
5571 
5572 static int
5573 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5574 {
5575 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5576 
5577 	return dmar_domain->default_pasid > 0 ?
5578 			dmar_domain->default_pasid : -EINVAL;
5579 }
5580 
5581 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5582 					   struct device *dev)
5583 {
5584 	return attach_deferred(dev);
5585 }
5586 
5587 static int
5588 intel_iommu_enable_nesting(struct iommu_domain *domain)
5589 {
5590 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5591 	unsigned long flags;
5592 	int ret = -ENODEV;
5593 
5594 	spin_lock_irqsave(&device_domain_lock, flags);
5595 	if (list_empty(&dmar_domain->devices)) {
5596 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5597 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5598 		ret = 0;
5599 	}
5600 	spin_unlock_irqrestore(&device_domain_lock, flags);
5601 
5602 	return ret;
5603 }
5604 
5605 /*
5606  * Check that the device does not live on an external facing PCI port that is
5607  * marked as untrusted. Such devices should not be able to apply quirks and
5608  * thus not be able to bypass the IOMMU restrictions.
5609  */
5610 static bool risky_device(struct pci_dev *pdev)
5611 {
5612 	if (pdev->untrusted) {
5613 		pci_info(pdev,
5614 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5615 			 pdev->vendor, pdev->device);
5616 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5617 		return true;
5618 	}
5619 	return false;
5620 }
5621 
5622 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5623 				       unsigned long iova, size_t size)
5624 {
5625 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5626 	unsigned long pages = aligned_nrpages(iova, size);
5627 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5628 	struct intel_iommu *iommu;
5629 	int iommu_id;
5630 
5631 	for_each_domain_iommu(iommu_id, dmar_domain) {
5632 		iommu = g_iommus[iommu_id];
5633 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5634 	}
5635 }
5636 
5637 const struct iommu_ops intel_iommu_ops = {
5638 	.capable		= intel_iommu_capable,
5639 	.domain_alloc		= intel_iommu_domain_alloc,
5640 	.domain_free		= intel_iommu_domain_free,
5641 	.enable_nesting		= intel_iommu_enable_nesting,
5642 	.attach_dev		= intel_iommu_attach_device,
5643 	.detach_dev		= intel_iommu_detach_device,
5644 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5645 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5646 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5647 	.map_pages		= intel_iommu_map_pages,
5648 	.unmap_pages		= intel_iommu_unmap_pages,
5649 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5650 	.flush_iotlb_all        = intel_flush_iotlb_all,
5651 	.iotlb_sync		= intel_iommu_tlb_sync,
5652 	.iova_to_phys		= intel_iommu_iova_to_phys,
5653 	.probe_device		= intel_iommu_probe_device,
5654 	.probe_finalize		= intel_iommu_probe_finalize,
5655 	.release_device		= intel_iommu_release_device,
5656 	.get_resv_regions	= intel_iommu_get_resv_regions,
5657 	.put_resv_regions	= generic_iommu_put_resv_regions,
5658 	.device_group		= intel_iommu_device_group,
5659 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5660 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5661 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5662 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5663 	.def_domain_type	= device_def_domain_type,
5664 	.pgsize_bitmap		= SZ_4K,
5665 #ifdef CONFIG_INTEL_IOMMU_SVM
5666 	.cache_invalidate	= intel_iommu_sva_invalidate,
5667 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5668 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5669 	.sva_bind		= intel_svm_bind,
5670 	.sva_unbind		= intel_svm_unbind,
5671 	.sva_get_pasid		= intel_svm_get_pasid,
5672 	.page_response		= intel_svm_page_response,
5673 #endif
5674 };
5675 
5676 static void quirk_iommu_igfx(struct pci_dev *dev)
5677 {
5678 	if (risky_device(dev))
5679 		return;
5680 
5681 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5682 	dmar_map_gfx = 0;
5683 }
5684 
5685 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5693 
5694 /* Broadwell igfx malfunctions with dmar */
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5719 
5720 static void quirk_iommu_rwbf(struct pci_dev *dev)
5721 {
5722 	if (risky_device(dev))
5723 		return;
5724 
5725 	/*
5726 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5727 	 * but needs it. Same seems to hold for the desktop versions.
5728 	 */
5729 	pci_info(dev, "Forcing write-buffer flush capability\n");
5730 	rwbf_quirk = 1;
5731 }
5732 
5733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5736 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5737 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5738 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5739 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5740 
5741 #define GGC 0x52
5742 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5743 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5744 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5745 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5746 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5747 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5748 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5749 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5750 
5751 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5752 {
5753 	unsigned short ggc;
5754 
5755 	if (risky_device(dev))
5756 		return;
5757 
5758 	if (pci_read_config_word(dev, GGC, &ggc))
5759 		return;
5760 
5761 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5762 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5763 		dmar_map_gfx = 0;
5764 	} else if (dmar_map_gfx) {
5765 		/* we have to ensure the gfx device is idle before we flush */
5766 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5767 		iommu_set_dma_strict();
5768 	}
5769 }
5770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5774 
5775 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5776 {
5777 	unsigned short ver;
5778 
5779 	if (!IS_GFX_DEVICE(dev))
5780 		return;
5781 
5782 	ver = (dev->device >> 8) & 0xff;
5783 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5784 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5785 	    ver != 0x9a)
5786 		return;
5787 
5788 	if (risky_device(dev))
5789 		return;
5790 
5791 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5792 	iommu_skip_te_disable = 1;
5793 }
5794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5795 
5796 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5797    ISOCH DMAR unit for the Azalia sound device, but not give it any
5798    TLB entries, which causes it to deadlock. Check for that.  We do
5799    this in a function called from init_dmars(), instead of in a PCI
5800    quirk, because we don't want to print the obnoxious "BIOS broken"
5801    message if VT-d is actually disabled.
5802 */
5803 static void __init check_tylersburg_isoch(void)
5804 {
5805 	struct pci_dev *pdev;
5806 	uint32_t vtisochctrl;
5807 
5808 	/* If there's no Azalia in the system anyway, forget it. */
5809 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5810 	if (!pdev)
5811 		return;
5812 
5813 	if (risky_device(pdev)) {
5814 		pci_dev_put(pdev);
5815 		return;
5816 	}
5817 
5818 	pci_dev_put(pdev);
5819 
5820 	/* System Management Registers. Might be hidden, in which case
5821 	   we can't do the sanity check. But that's OK, because the
5822 	   known-broken BIOSes _don't_ actually hide it, so far. */
5823 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5824 	if (!pdev)
5825 		return;
5826 
5827 	if (risky_device(pdev)) {
5828 		pci_dev_put(pdev);
5829 		return;
5830 	}
5831 
5832 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5833 		pci_dev_put(pdev);
5834 		return;
5835 	}
5836 
5837 	pci_dev_put(pdev);
5838 
5839 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5840 	if (vtisochctrl & 1)
5841 		return;
5842 
5843 	/* Drop all bits other than the number of TLB entries */
5844 	vtisochctrl &= 0x1c;
5845 
5846 	/* If we have the recommended number of TLB entries (16), fine. */
5847 	if (vtisochctrl == 0x10)
5848 		return;
5849 
5850 	/* Zero TLB entries? You get to ride the short bus to school. */
5851 	if (!vtisochctrl) {
5852 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5853 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5854 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5855 		     dmi_get_system_info(DMI_BIOS_VERSION),
5856 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5857 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5858 		return;
5859 	}
5860 
5861 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5862 	       vtisochctrl);
5863 }
5864