xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 4b7ead03)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and
96  * that the mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
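/*
 * Illustrative sketch only (kept compiled out, not part of the driver;
 * the helper name below is hypothetical): with the bitmap above set to
 * ~0xFFFUL, every power-of-two size of at least 4KiB has its bit set,
 * so 4KiB, 2MiB and 1GiB mappings are all advertised alike.
 */
#if 0
static void __maybe_unused intel_iommu_pgsizes_example(void)
{
	unsigned long pgsizes = INTEL_IOMMU_PGSIZES;

	/* all of these bits are set in ~0xFFFUL */
	WARN_ON(!(pgsizes & (1UL << 12)));	/* 4KiB */
	WARN_ON(!(pgsizes & (1UL << 21)));	/* 2MiB */
	WARN_ON(!(pgsizes & (1UL << 30)));	/* 1GiB */
}
#endif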
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
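/*
 * Worked example for the agaw helpers above (informational only): a
 * 48-bit address width gives width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2,
 * which corresponds to agaw_to_level(2) = 4 page-table levels and
 * agaw_to_width(2) = 48; a 57-bit width maps to agaw 3 and 5-level paging.
 */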
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
179  * (used when the kernel is launched with TXT).
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
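/*
 * Summary of the bits touched by the accessors above, as derived from
 * the code itself: lo bit 0 is the present bit, lo bits 3:2 hold the
 * translation type, lo bit 11 is the PASID enable bit and the
 * page-aligned upper bits of lo hold the context-table address root;
 * hi bits 2:0 hold the address width, hi bit 3 marks an entry copied
 * from a previous kernel, and hi bits 23:8 hold the domain id.
 */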
289 
290 /*
291  * This domain is a static identity mapping domain.
292  *	1. This domain creates a static 1:1 mapping to all usable memory.
293  *	2. It maps to each iommu if successful.
294  *	3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322 
323 #define for_each_rmrr_units(rmrr) \
324 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325 
326 /* number of IOMMUs in the system, used to size the g_iommus array */
327 static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343 
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349 
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352 
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int iommu_skip_te_disable;
359 
360 #define IDENTMAP_GFX		2
361 #define IDENTMAP_AZALIA		4
362 
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
365 
366 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
367 struct device_domain_info *get_domain_info(struct device *dev)
368 {
369 	struct device_domain_info *info;
370 
371 	if (!dev)
372 		return NULL;
373 
374 	info = dev_iommu_priv_get(dev);
375 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
376 		return NULL;
377 
378 	return info;
379 }
380 
381 DEFINE_SPINLOCK(device_domain_lock);
382 static LIST_HEAD(device_domain_list);
383 
384 /*
385  * Iterate over elements in device_domain_list and call the specified
386  * callback @fn against each element.
387  */
388 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
389 				     void *data), void *data)
390 {
391 	int ret = 0;
392 	unsigned long flags;
393 	struct device_domain_info *info;
394 
395 	spin_lock_irqsave(&device_domain_lock, flags);
396 	list_for_each_entry(info, &device_domain_list, global) {
397 		ret = fn(info, data);
398 		if (ret) {
399 			spin_unlock_irqrestore(&device_domain_lock, flags);
400 			return ret;
401 		}
402 	}
403 	spin_unlock_irqrestore(&device_domain_lock, flags);
404 
405 	return 0;
406 }
407 
408 const struct iommu_ops intel_iommu_ops;
409 
410 static bool translation_pre_enabled(struct intel_iommu *iommu)
411 {
412 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
413 }
414 
415 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
416 {
417 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
418 }
419 
420 static void init_translation_status(struct intel_iommu *iommu)
421 {
422 	u32 gsts;
423 
424 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
425 	if (gsts & DMA_GSTS_TES)
426 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
427 }
428 
429 static int __init intel_iommu_setup(char *str)
430 {
431 	if (!str)
432 		return -EINVAL;
433 	while (*str) {
434 		if (!strncmp(str, "on", 2)) {
435 			dmar_disabled = 0;
436 			pr_info("IOMMU enabled\n");
437 		} else if (!strncmp(str, "off", 3)) {
438 			dmar_disabled = 1;
439 			no_platform_optin = 1;
440 			pr_info("IOMMU disabled\n");
441 		} else if (!strncmp(str, "igfx_off", 8)) {
442 			dmar_map_gfx = 0;
443 			pr_info("Disable GFX device mapping\n");
444 		} else if (!strncmp(str, "forcedac", 8)) {
445 			pr_info("Forcing DAC for PCI devices\n");
446 			dmar_forcedac = 1;
447 		} else if (!strncmp(str, "strict", 6)) {
448 			pr_info("Disable batched IOTLB flush\n");
449 			intel_iommu_strict = 1;
450 		} else if (!strncmp(str, "sp_off", 6)) {
451 			pr_info("Disable supported super page\n");
452 			intel_iommu_superpage = 0;
453 		} else if (!strncmp(str, "sm_on", 5)) {
454 			pr_info("Intel-IOMMU: scalable mode supported\n");
455 			intel_iommu_sm = 1;
456 		} else if (!strncmp(str, "tboot_noforce", 13)) {
457 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
458 			intel_iommu_tboot_noforce = 1;
459 		}
460 
461 		str += strcspn(str, ",");
462 		while (*str == ',')
463 			str++;
464 	}
465 	return 0;
466 }
467 __setup("intel_iommu=", intel_iommu_setup);
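/*
 * Example usage (illustrative): the options parsed above are combined as
 * a comma-separated list on the kernel command line, e.g.
 * "intel_iommu=on,sm_on" or "intel_iommu=off".
 */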
468 
469 static struct kmem_cache *iommu_domain_cache;
470 static struct kmem_cache *iommu_devinfo_cache;
471 
472 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 {
474 	struct dmar_domain **domains;
475 	int idx = did >> 8;
476 
477 	domains = iommu->domains[idx];
478 	if (!domains)
479 		return NULL;
480 
481 	return domains[did & 0xff];
482 }
483 
484 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
485 			     struct dmar_domain *domain)
486 {
487 	struct dmar_domain **domains;
488 	int idx = did >> 8;
489 
490 	if (!iommu->domains[idx]) {
491 		size_t size = 256 * sizeof(struct dmar_domain *);
492 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
493 	}
494 
495 	domains = iommu->domains[idx];
496 	if (WARN_ON(!domains))
497 		return;
498 	else
499 		domains[did & 0xff] = domain;
500 }
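/*
 * Worked example (informational): domains are kept in a two-level array
 * of 256-entry pages indexed by domain id, so did 0x1234 resolves to
 * iommu->domains[0x12][0x34] in the helpers above.
 */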
501 
502 void *alloc_pgtable_page(int node)
503 {
504 	struct page *page;
505 	void *vaddr = NULL;
506 
507 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 	if (page)
509 		vaddr = page_address(page);
510 	return vaddr;
511 }
512 
513 void free_pgtable_page(void *vaddr)
514 {
515 	free_page((unsigned long)vaddr);
516 }
517 
518 static inline void *alloc_domain_mem(void)
519 {
520 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
521 }
522 
523 static void free_domain_mem(void *vaddr)
524 {
525 	kmem_cache_free(iommu_domain_cache, vaddr);
526 }
527 
528 static inline void *alloc_devinfo_mem(void)
529 {
530 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
531 }
532 
533 static inline void free_devinfo_mem(void *vaddr)
534 {
535 	kmem_cache_free(iommu_devinfo_cache, vaddr);
536 }
537 
538 static inline int domain_type_is_si(struct dmar_domain *domain)
539 {
540 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
541 }
542 
543 static inline bool domain_use_first_level(struct dmar_domain *domain)
544 {
545 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
546 }
547 
548 static inline int domain_pfn_supported(struct dmar_domain *domain,
549 				       unsigned long pfn)
550 {
551 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
552 
553 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
554 }
555 
556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 {
558 	unsigned long sagaw;
559 	int agaw = -1;
560 
561 	sagaw = cap_sagaw(iommu->cap);
562 	for (agaw = width_to_agaw(max_gaw);
563 	     agaw >= 0; agaw--) {
564 		if (test_bit(agaw, &sagaw))
565 			break;
566 	}
567 
568 	return agaw;
569 }
570 
571 /*
572  * Calculate max SAGAW for each iommu.
573  */
574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
575 {
576 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
577 }
578 
579 /*
580  * Calculate agaw for each iommu.
581  * "SAGAW" may differ across iommus; use a default agaw, and fall back
582  * to a smaller supported agaw for iommus that don't support the default.
583  */
584 int iommu_calculate_agaw(struct intel_iommu *iommu)
585 {
586 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
587 }
588 
589 /* This function only returns a single iommu in a domain */
590 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
591 {
592 	int iommu_id;
593 
594 	/* si_domain and vm domain should not get here. */
595 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
596 		return NULL;
597 
598 	for_each_domain_iommu(iommu_id, domain)
599 		break;
600 
601 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
602 		return NULL;
603 
604 	return g_iommus[iommu_id];
605 }
606 
607 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
608 {
609 	return sm_supported(iommu) ?
610 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
611 }
612 
613 static void domain_update_iommu_coherency(struct dmar_domain *domain)
614 {
615 	struct dmar_drhd_unit *drhd;
616 	struct intel_iommu *iommu;
617 	bool found = false;
618 	int i;
619 
620 	domain->iommu_coherency = 1;
621 
622 	for_each_domain_iommu(i, domain) {
623 		found = true;
624 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
625 			domain->iommu_coherency = 0;
626 			break;
627 		}
628 	}
629 	if (found)
630 		return;
631 
632 	/* No hardware attached; use lowest common denominator */
633 	rcu_read_lock();
634 	for_each_active_iommu(iommu, drhd) {
635 		if (!iommu_paging_structure_coherency(iommu)) {
636 			domain->iommu_coherency = 0;
637 			break;
638 		}
639 	}
640 	rcu_read_unlock();
641 }
642 
643 static int domain_update_iommu_snooping(struct intel_iommu *skip)
644 {
645 	struct dmar_drhd_unit *drhd;
646 	struct intel_iommu *iommu;
647 	int ret = 1;
648 
649 	rcu_read_lock();
650 	for_each_active_iommu(iommu, drhd) {
651 		if (iommu != skip) {
652 			if (!ecap_sc_support(iommu->ecap)) {
653 				ret = 0;
654 				break;
655 			}
656 		}
657 	}
658 	rcu_read_unlock();
659 
660 	return ret;
661 }
662 
663 static int domain_update_iommu_superpage(struct dmar_domain *domain,
664 					 struct intel_iommu *skip)
665 {
666 	struct dmar_drhd_unit *drhd;
667 	struct intel_iommu *iommu;
668 	int mask = 0x3;
669 
670 	if (!intel_iommu_superpage) {
671 		return 0;
672 	}
673 
674 	/* set iommu_superpage to the smallest common denominator */
675 	rcu_read_lock();
676 	for_each_active_iommu(iommu, drhd) {
677 		if (iommu != skip) {
678 			if (domain && domain_use_first_level(domain)) {
679 				if (!cap_fl1gp_support(iommu->cap))
680 					mask = 0x1;
681 			} else {
682 				mask &= cap_super_page_val(iommu->cap);
683 			}
684 
685 			if (!mask)
686 				break;
687 		}
688 	}
689 	rcu_read_unlock();
690 
691 	return fls(mask);
692 }
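/*
 * The value computed above is the number of supported superpage levels:
 * fls(0) = 0 means 4KiB pages only, fls(0x1) = 1 adds 2MiB and
 * fls(0x3) = 2 adds 1GiB, assuming the low two bits of the SLLPS
 * capability mean 2MiB and 1GiB respectively.
 */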
693 
694 static int domain_update_device_node(struct dmar_domain *domain)
695 {
696 	struct device_domain_info *info;
697 	int nid = NUMA_NO_NODE;
698 
699 	assert_spin_locked(&device_domain_lock);
700 
701 	if (list_empty(&domain->devices))
702 		return NUMA_NO_NODE;
703 
704 	list_for_each_entry(info, &domain->devices, link) {
705 		if (!info->dev)
706 			continue;
707 
708 		/*
709 		 * There could possibly be multiple device numa nodes as devices
710 		 * within the same domain may sit behind different IOMMUs. There
711 		 * isn't a perfect answer in such a situation, so we pick the
712 		 * first node we find (first come, first served).
713 		 */
714 		nid = dev_to_node(info->dev);
715 		if (nid != NUMA_NO_NODE)
716 			break;
717 	}
718 
719 	return nid;
720 }
721 
722 static void domain_update_iotlb(struct dmar_domain *domain);
723 
724 /* Some capabilities may be different across iommus */
725 static void domain_update_iommu_cap(struct dmar_domain *domain)
726 {
727 	domain_update_iommu_coherency(domain);
728 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
729 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
730 
731 	/*
732 	 * If RHSA is missing, we should default to the device numa domain
733 	 * as a fallback.
734 	 */
735 	if (domain->nid == NUMA_NO_NODE)
736 		domain->nid = domain_update_device_node(domain);
737 
738 	/*
739 	 * First-level translation restricts the input-address to a
740 	 * canonical address (i.e., address bits 63:N have the same
741 	 * value as address bit [N-1], where N is 48-bits with 4-level
742 	 * paging and 57-bits with 5-level paging). Hence, skip bit
743 	 * [N-1].
744 	 */
745 	if (domain_use_first_level(domain))
746 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
747 	else
748 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
749 
750 	domain_update_iotlb(domain);
751 }
752 
753 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
754 					 u8 devfn, int alloc)
755 {
756 	struct root_entry *root = &iommu->root_entry[bus];
757 	struct context_entry *context;
758 	u64 *entry;
759 
760 	entry = &root->lo;
761 	if (sm_supported(iommu)) {
762 		if (devfn >= 0x80) {
763 			devfn -= 0x80;
764 			entry = &root->hi;
765 		}
766 		devfn *= 2;
767 	}
768 	if (*entry & 1)
769 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
770 	else {
771 		unsigned long phy_addr;
772 		if (!alloc)
773 			return NULL;
774 
775 		context = alloc_pgtable_page(iommu->node);
776 		if (!context)
777 			return NULL;
778 
779 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
780 		phy_addr = virt_to_phys((void *)context);
781 		*entry = phy_addr | 1;
782 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
783 	}
784 	return &context[devfn];
785 }
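/*
 * Note on the layout handled above: with scalable mode each root entry
 * is split in two, devfns 0x00-0x7f using the context table referenced
 * from root->lo and devfns 0x80-0xff the one referenced from root->hi;
 * the "devfn *= 2" accounts for scalable-mode context entries taking
 * twice the space of legacy ones.
 */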
786 
787 static bool attach_deferred(struct device *dev)
788 {
789 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
790 }
791 
792 /**
793  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
794  *				 sub-hierarchy of a candidate PCI-PCI bridge
795  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
796  * @bridge: the candidate PCI-PCI bridge
797  *
798  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
799  */
800 static bool
801 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
802 {
803 	struct pci_dev *pdev, *pbridge;
804 
805 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
806 		return false;
807 
808 	pdev = to_pci_dev(dev);
809 	pbridge = to_pci_dev(bridge);
810 
811 	if (pbridge->subordinate &&
812 	    pbridge->subordinate->number <= pdev->bus->number &&
813 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
814 		return true;
815 
816 	return false;
817 }
818 
819 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
820 {
821 	struct dmar_drhd_unit *drhd;
822 	u32 vtbar;
823 	int rc;
824 
825 	/* We know that this device on this chipset has its own IOMMU.
826 	 * If we find it under a different IOMMU, then the BIOS is lying
827 	 * to us. Hope that the IOMMU for this device is actually
828 	 * disabled, and it needs no translation...
829 	 */
830 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
831 	if (rc) {
832 		/* "can't" happen */
833 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
834 		return false;
835 	}
836 	vtbar &= 0xffff0000;
837 
838 	/* we know that this iommu should be at offset 0xa000 from vtbar */
839 	drhd = dmar_find_matched_drhd_unit(pdev);
840 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
841 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
842 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
843 		return true;
844 	}
845 
846 	return false;
847 }
848 
849 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
850 {
851 	if (!iommu || iommu->drhd->ignored)
852 		return true;
853 
854 	if (dev_is_pci(dev)) {
855 		struct pci_dev *pdev = to_pci_dev(dev);
856 
857 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
858 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
859 		    quirk_ioat_snb_local_iommu(pdev))
860 			return true;
861 	}
862 
863 	return false;
864 }
865 
866 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
867 {
868 	struct dmar_drhd_unit *drhd = NULL;
869 	struct pci_dev *pdev = NULL;
870 	struct intel_iommu *iommu;
871 	struct device *tmp;
872 	u16 segment = 0;
873 	int i;
874 
875 	if (!dev)
876 		return NULL;
877 
878 	if (dev_is_pci(dev)) {
879 		struct pci_dev *pf_pdev;
880 
881 		pdev = pci_real_dma_dev(to_pci_dev(dev));
882 
883 		/* VFs aren't listed in scope tables; we need to look up
884 		 * the PF instead to find the IOMMU. */
885 		pf_pdev = pci_physfn(pdev);
886 		dev = &pf_pdev->dev;
887 		segment = pci_domain_nr(pdev->bus);
888 	} else if (has_acpi_companion(dev))
889 		dev = &ACPI_COMPANION(dev)->dev;
890 
891 	rcu_read_lock();
892 	for_each_iommu(iommu, drhd) {
893 		if (pdev && segment != drhd->segment)
894 			continue;
895 
896 		for_each_active_dev_scope(drhd->devices,
897 					  drhd->devices_cnt, i, tmp) {
898 			if (tmp == dev) {
899 				/* For a VF use its original BDF# not that of the PF
900 				 * which we used for the IOMMU lookup. Strictly speaking
901 				 * we could do this for all PCI devices; we only need to
902 				 * get the BDF# from the scope table for ACPI matches. */
903 				if (pdev && pdev->is_virtfn)
904 					goto got_pdev;
905 
906 				if (bus && devfn) {
907 					*bus = drhd->devices[i].bus;
908 					*devfn = drhd->devices[i].devfn;
909 				}
910 				goto out;
911 			}
912 
913 			if (is_downstream_to_pci_bridge(dev, tmp))
914 				goto got_pdev;
915 		}
916 
917 		if (pdev && drhd->include_all) {
918 		got_pdev:
919 			if (bus && devfn) {
920 				*bus = pdev->bus->number;
921 				*devfn = pdev->devfn;
922 			}
923 			goto out;
924 		}
925 	}
926 	iommu = NULL;
927  out:
928 	if (iommu_is_dummy(iommu, dev))
929 		iommu = NULL;
930 
931 	rcu_read_unlock();
932 
933 	return iommu;
934 }
935 
936 static void domain_flush_cache(struct dmar_domain *domain,
937 			       void *addr, int size)
938 {
939 	if (!domain->iommu_coherency)
940 		clflush_cache_range(addr, size);
941 }
942 
943 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
944 {
945 	struct context_entry *context;
946 	int ret = 0;
947 	unsigned long flags;
948 
949 	spin_lock_irqsave(&iommu->lock, flags);
950 	context = iommu_context_addr(iommu, bus, devfn, 0);
951 	if (context)
952 		ret = context_present(context);
953 	spin_unlock_irqrestore(&iommu->lock, flags);
954 	return ret;
955 }
956 
957 static void free_context_table(struct intel_iommu *iommu)
958 {
959 	int i;
960 	unsigned long flags;
961 	struct context_entry *context;
962 
963 	spin_lock_irqsave(&iommu->lock, flags);
964 	if (!iommu->root_entry) {
965 		goto out;
966 	}
967 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
968 		context = iommu_context_addr(iommu, i, 0, 0);
969 		if (context)
970 			free_pgtable_page(context);
971 
972 		if (!sm_supported(iommu))
973 			continue;
974 
975 		context = iommu_context_addr(iommu, i, 0x80, 0);
976 		if (context)
977 			free_pgtable_page(context);
978 
979 	}
980 	free_pgtable_page(iommu->root_entry);
981 	iommu->root_entry = NULL;
982 out:
983 	spin_unlock_irqrestore(&iommu->lock, flags);
984 }
985 
986 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
987 				      unsigned long pfn, int *target_level)
988 {
989 	struct dma_pte *parent, *pte;
990 	int level = agaw_to_level(domain->agaw);
991 	int offset;
992 
993 	BUG_ON(!domain->pgd);
994 
995 	if (!domain_pfn_supported(domain, pfn))
996 		/* Address beyond IOMMU's addressing capabilities. */
997 		return NULL;
998 
999 	parent = domain->pgd;
1000 
1001 	while (1) {
1002 		void *tmp_page;
1003 
1004 		offset = pfn_level_offset(pfn, level);
1005 		pte = &parent[offset];
1006 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1007 			break;
1008 		if (level == *target_level)
1009 			break;
1010 
1011 		if (!dma_pte_present(pte)) {
1012 			uint64_t pteval;
1013 
1014 			tmp_page = alloc_pgtable_page(domain->nid);
1015 
1016 			if (!tmp_page)
1017 				return NULL;
1018 
1019 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1020 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1021 			if (domain_use_first_level(domain))
1022 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1023 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1024 				/* Someone else set it while we were thinking; use theirs. */
1025 				free_pgtable_page(tmp_page);
1026 			else
1027 				domain_flush_cache(domain, pte, sizeof(*pte));
1028 		}
1029 		if (level == 1)
1030 			break;
1031 
1032 		parent = phys_to_virt(dma_pte_addr(pte));
1033 		level--;
1034 	}
1035 
1036 	if (!*target_level)
1037 		*target_level = level;
1038 
1039 	return pte;
1040 }
1041 
1042 /* return the pte for an address at a specific level */
1043 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1044 					 unsigned long pfn,
1045 					 int level, int *large_page)
1046 {
1047 	struct dma_pte *parent, *pte;
1048 	int total = agaw_to_level(domain->agaw);
1049 	int offset;
1050 
1051 	parent = domain->pgd;
1052 	while (level <= total) {
1053 		offset = pfn_level_offset(pfn, total);
1054 		pte = &parent[offset];
1055 		if (level == total)
1056 			return pte;
1057 
1058 		if (!dma_pte_present(pte)) {
1059 			*large_page = total;
1060 			break;
1061 		}
1062 
1063 		if (dma_pte_superpage(pte)) {
1064 			*large_page = total;
1065 			return pte;
1066 		}
1067 
1068 		parent = phys_to_virt(dma_pte_addr(pte));
1069 		total--;
1070 	}
1071 	return NULL;
1072 }
1073 
1074 /* clear last level pte; a tlb flush should follow */
1075 static void dma_pte_clear_range(struct dmar_domain *domain,
1076 				unsigned long start_pfn,
1077 				unsigned long last_pfn)
1078 {
1079 	unsigned int large_page;
1080 	struct dma_pte *first_pte, *pte;
1081 
1082 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1083 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1084 	BUG_ON(start_pfn > last_pfn);
1085 
1086 	/* we don't need lock here; nobody else touches the iova range */
1087 	do {
1088 		large_page = 1;
1089 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1090 		if (!pte) {
1091 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1092 			continue;
1093 		}
1094 		do {
1095 			dma_clear_pte(pte);
1096 			start_pfn += lvl_to_nr_pages(large_page);
1097 			pte++;
1098 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1099 
1100 		domain_flush_cache(domain, first_pte,
1101 				   (void *)pte - (void *)first_pte);
1102 
1103 	} while (start_pfn && start_pfn <= last_pfn);
1104 }
1105 
1106 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1107 			       int retain_level, struct dma_pte *pte,
1108 			       unsigned long pfn, unsigned long start_pfn,
1109 			       unsigned long last_pfn)
1110 {
1111 	pfn = max(start_pfn, pfn);
1112 	pte = &pte[pfn_level_offset(pfn, level)];
1113 
1114 	do {
1115 		unsigned long level_pfn;
1116 		struct dma_pte *level_pte;
1117 
1118 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1119 			goto next;
1120 
1121 		level_pfn = pfn & level_mask(level);
1122 		level_pte = phys_to_virt(dma_pte_addr(pte));
1123 
1124 		if (level > 2) {
1125 			dma_pte_free_level(domain, level - 1, retain_level,
1126 					   level_pte, level_pfn, start_pfn,
1127 					   last_pfn);
1128 		}
1129 
1130 		/*
1131 		 * Free the page table if we're below the level we want to
1132 		 * retain and the range covers the entire table.
1133 		 */
1134 		if (level < retain_level && !(start_pfn > level_pfn ||
1135 		      last_pfn < level_pfn + level_size(level) - 1)) {
1136 			dma_clear_pte(pte);
1137 			domain_flush_cache(domain, pte, sizeof(*pte));
1138 			free_pgtable_page(level_pte);
1139 		}
1140 next:
1141 		pfn += level_size(level);
1142 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1143 }
1144 
1145 /*
1146  * clear last level (leaf) ptes and free page table pages below the
1147  * level we wish to keep intact.
1148  */
1149 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1150 				   unsigned long start_pfn,
1151 				   unsigned long last_pfn,
1152 				   int retain_level)
1153 {
1154 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1155 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1156 	BUG_ON(start_pfn > last_pfn);
1157 
1158 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1159 
1160 	/* We don't need lock here; nobody else touches the iova range */
1161 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1162 			   domain->pgd, 0, start_pfn, last_pfn);
1163 
1164 	/* free pgd */
1165 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1166 		free_pgtable_page(domain->pgd);
1167 		domain->pgd = NULL;
1168 	}
1169 }
1170 
1171 /* When a page at a given level is being unlinked from its parent, we don't
1172    need to *modify* it at all. All we need to do is make a list of all the
1173    pages which can be freed just as soon as we've flushed the IOTLB and we
1174    know the hardware page-walk will no longer touch them.
1175    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1176    be freed. */
1177 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1178 					    int level, struct dma_pte *pte,
1179 					    struct page *freelist)
1180 {
1181 	struct page *pg;
1182 
1183 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1184 	pg->freelist = freelist;
1185 	freelist = pg;
1186 
1187 	if (level == 1)
1188 		return freelist;
1189 
1190 	pte = page_address(pg);
1191 	do {
1192 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1193 			freelist = dma_pte_list_pagetables(domain, level - 1,
1194 							   pte, freelist);
1195 		pte++;
1196 	} while (!first_pte_in_page(pte));
1197 
1198 	return freelist;
1199 }
1200 
1201 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1202 					struct dma_pte *pte, unsigned long pfn,
1203 					unsigned long start_pfn,
1204 					unsigned long last_pfn,
1205 					struct page *freelist)
1206 {
1207 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1208 
1209 	pfn = max(start_pfn, pfn);
1210 	pte = &pte[pfn_level_offset(pfn, level)];
1211 
1212 	do {
1213 		unsigned long level_pfn;
1214 
1215 		if (!dma_pte_present(pte))
1216 			goto next;
1217 
1218 		level_pfn = pfn & level_mask(level);
1219 
1220 		/* If range covers entire pagetable, free it */
1221 		if (start_pfn <= level_pfn &&
1222 		    last_pfn >= level_pfn + level_size(level) - 1) {
1223 			/* These subordinate page tables are going away entirely. Don't
1224 			   bother to clear them; we're just going to *free* them. */
1225 			if (level > 1 && !dma_pte_superpage(pte))
1226 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1227 
1228 			dma_clear_pte(pte);
1229 			if (!first_pte)
1230 				first_pte = pte;
1231 			last_pte = pte;
1232 		} else if (level > 1) {
1233 			/* Recurse down into a level that isn't *entirely* obsolete */
1234 			freelist = dma_pte_clear_level(domain, level - 1,
1235 						       phys_to_virt(dma_pte_addr(pte)),
1236 						       level_pfn, start_pfn, last_pfn,
1237 						       freelist);
1238 		}
1239 next:
1240 		pfn += level_size(level);
1241 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1242 
1243 	if (first_pte)
1244 		domain_flush_cache(domain, first_pte,
1245 				   (void *)++last_pte - (void *)first_pte);
1246 
1247 	return freelist;
1248 }
1249 
1250 /* We can't just free the pages because the IOMMU may still be walking
1251    the page tables, and may have cached the intermediate levels. The
1252    pages can only be freed after the IOTLB flush has been done. */
1253 static struct page *domain_unmap(struct dmar_domain *domain,
1254 				 unsigned long start_pfn,
1255 				 unsigned long last_pfn,
1256 				 struct page *freelist)
1257 {
1258 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1259 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1260 	BUG_ON(start_pfn > last_pfn);
1261 
1262 	/* we don't need lock here; nobody else touches the iova range */
1263 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1264 				       domain->pgd, 0, start_pfn, last_pfn,
1265 				       freelist);
1266 
1267 	/* free pgd */
1268 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1269 		struct page *pgd_page = virt_to_page(domain->pgd);
1270 		pgd_page->freelist = freelist;
1271 		freelist = pgd_page;
1272 
1273 		domain->pgd = NULL;
1274 	}
1275 
1276 	return freelist;
1277 }
1278 
1279 static void dma_free_pagelist(struct page *freelist)
1280 {
1281 	struct page *pg;
1282 
1283 	while ((pg = freelist)) {
1284 		freelist = pg->freelist;
1285 		free_pgtable_page(page_address(pg));
1286 	}
1287 }
1288 
1289 /* iommu handling */
1290 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1291 {
1292 	struct root_entry *root;
1293 	unsigned long flags;
1294 
1295 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1296 	if (!root) {
1297 		pr_err("Allocating root entry for %s failed\n",
1298 			iommu->name);
1299 		return -ENOMEM;
1300 	}
1301 
1302 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1303 
1304 	spin_lock_irqsave(&iommu->lock, flags);
1305 	iommu->root_entry = root;
1306 	spin_unlock_irqrestore(&iommu->lock, flags);
1307 
1308 	return 0;
1309 }
1310 
1311 static void iommu_set_root_entry(struct intel_iommu *iommu)
1312 {
1313 	u64 addr;
1314 	u32 sts;
1315 	unsigned long flag;
1316 
1317 	addr = virt_to_phys(iommu->root_entry);
1318 	if (sm_supported(iommu))
1319 		addr |= DMA_RTADDR_SMT;
1320 
1321 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1323 
1324 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1325 
1326 	/* Make sure hardware completes it */
1327 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1328 		      readl, (sts & DMA_GSTS_RTPS), sts);
1329 
1330 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1331 }
1332 
1333 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1334 {
1335 	u32 val;
1336 	unsigned long flag;
1337 
1338 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1339 		return;
1340 
1341 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1342 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1343 
1344 	/* Make sure hardware completes it */
1345 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1347 
1348 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 }
1350 
1351 /* return value determines if we need a write buffer flush */
1352 static void __iommu_flush_context(struct intel_iommu *iommu,
1353 				  u16 did, u16 source_id, u8 function_mask,
1354 				  u64 type)
1355 {
1356 	u64 val = 0;
1357 	unsigned long flag;
1358 
1359 	switch (type) {
1360 	case DMA_CCMD_GLOBAL_INVL:
1361 		val = DMA_CCMD_GLOBAL_INVL;
1362 		break;
1363 	case DMA_CCMD_DOMAIN_INVL:
1364 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1365 		break;
1366 	case DMA_CCMD_DEVICE_INVL:
1367 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1368 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1369 		break;
1370 	default:
1371 		BUG();
1372 	}
1373 	val |= DMA_CCMD_ICC;
1374 
1375 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1376 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1377 
1378 	/* Make sure hardware completes it */
1379 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1380 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1381 
1382 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1383 }
1384 
1385 /* return value determines if we need a write buffer flush */
1386 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1387 				u64 addr, unsigned int size_order, u64 type)
1388 {
1389 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1390 	u64 val = 0, val_iva = 0;
1391 	unsigned long flag;
1392 
1393 	switch (type) {
1394 	case DMA_TLB_GLOBAL_FLUSH:
1395 		/* global flush doesn't need to set IVA_REG */
1396 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1397 		break;
1398 	case DMA_TLB_DSI_FLUSH:
1399 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1400 		break;
1401 	case DMA_TLB_PSI_FLUSH:
1402 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1403 		/* IH bit is passed in as part of address */
1404 		val_iva = size_order | addr;
1405 		break;
1406 	default:
1407 		BUG();
1408 	}
1409 	/* Note: set drain read/write */
1410 #if 0
1411 	/*
1412 	 * This is probably meant to be extra safe. It looks like we can
1413 	 * ignore it without any impact.
1414 	 */
1415 	if (cap_read_drain(iommu->cap))
1416 		val |= DMA_TLB_READ_DRAIN;
1417 #endif
1418 	if (cap_write_drain(iommu->cap))
1419 		val |= DMA_TLB_WRITE_DRAIN;
1420 
1421 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1422 	/* Note: Only uses first TLB reg currently */
1423 	if (val_iva)
1424 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1425 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1426 
1427 	/* Make sure hardware completes it */
1428 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1429 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1430 
1431 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1432 
1433 	/* check IOTLB invalidation granularity */
1434 	if (DMA_TLB_IAIG(val) == 0)
1435 		pr_err("Flush IOTLB failed\n");
1436 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1437 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1438 			(unsigned long long)DMA_TLB_IIRG(type),
1439 			(unsigned long long)DMA_TLB_IAIG(val));
1440 }
1441 
1442 static struct device_domain_info *
1443 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1444 			 u8 bus, u8 devfn)
1445 {
1446 	struct device_domain_info *info;
1447 
1448 	assert_spin_locked(&device_domain_lock);
1449 
1450 	if (!iommu->qi)
1451 		return NULL;
1452 
1453 	list_for_each_entry(info, &domain->devices, link)
1454 		if (info->iommu == iommu && info->bus == bus &&
1455 		    info->devfn == devfn) {
1456 			if (info->ats_supported && info->dev)
1457 				return info;
1458 			break;
1459 		}
1460 
1461 	return NULL;
1462 }
1463 
1464 static void domain_update_iotlb(struct dmar_domain *domain)
1465 {
1466 	struct device_domain_info *info;
1467 	bool has_iotlb_device = false;
1468 
1469 	assert_spin_locked(&device_domain_lock);
1470 
1471 	list_for_each_entry(info, &domain->devices, link)
1472 		if (info->ats_enabled) {
1473 			has_iotlb_device = true;
1474 			break;
1475 		}
1476 
1477 	if (!has_iotlb_device) {
1478 		struct subdev_domain_info *sinfo;
1479 
1480 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1481 			info = get_domain_info(sinfo->pdev);
1482 			if (info && info->ats_enabled) {
1483 				has_iotlb_device = true;
1484 				break;
1485 			}
1486 		}
1487 	}
1488 
1489 	domain->has_iotlb_device = has_iotlb_device;
1490 }
1491 
1492 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1493 {
1494 	struct pci_dev *pdev;
1495 
1496 	assert_spin_locked(&device_domain_lock);
1497 
1498 	if (!info || !dev_is_pci(info->dev))
1499 		return;
1500 
1501 	pdev = to_pci_dev(info->dev);
1502 	/* For IOMMUs that support device IOTLB throttling (DIT), we assign
1503 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1504 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1505 	 * reserved, which should be set to 0.
1506 	 */
1507 	if (!ecap_dit(info->iommu->ecap))
1508 		info->pfsid = 0;
1509 	else {
1510 		struct pci_dev *pf_pdev;
1511 
1512 		/* pdev will be returned if device is not a VF */
1513 		pf_pdev = pci_physfn(pdev);
1514 		info->pfsid = pci_dev_id(pf_pdev);
1515 	}
1516 
1517 #ifdef CONFIG_INTEL_IOMMU_SVM
1518 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1519 	   the device if you enable PASID support after ATS support is
1520 	   undefined. So always enable PASID support on devices which
1521 	   have it, even if we can't yet know if we're ever going to
1522 	   use it. */
1523 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1524 		info->pasid_enabled = 1;
1525 
1526 	if (info->pri_supported &&
1527 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1528 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1529 		info->pri_enabled = 1;
1530 #endif
1531 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1532 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1533 		info->ats_enabled = 1;
1534 		domain_update_iotlb(info->domain);
1535 		info->ats_qdep = pci_ats_queue_depth(pdev);
1536 	}
1537 }
1538 
1539 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1540 {
1541 	struct pci_dev *pdev;
1542 
1543 	assert_spin_locked(&device_domain_lock);
1544 
1545 	if (!dev_is_pci(info->dev))
1546 		return;
1547 
1548 	pdev = to_pci_dev(info->dev);
1549 
1550 	if (info->ats_enabled) {
1551 		pci_disable_ats(pdev);
1552 		info->ats_enabled = 0;
1553 		domain_update_iotlb(info->domain);
1554 	}
1555 #ifdef CONFIG_INTEL_IOMMU_SVM
1556 	if (info->pri_enabled) {
1557 		pci_disable_pri(pdev);
1558 		info->pri_enabled = 0;
1559 	}
1560 	if (info->pasid_enabled) {
1561 		pci_disable_pasid(pdev);
1562 		info->pasid_enabled = 0;
1563 	}
1564 #endif
1565 }
1566 
1567 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1568 				    u64 addr, unsigned int mask)
1569 {
1570 	u16 sid, qdep;
1571 
1572 	if (!info || !info->ats_enabled)
1573 		return;
1574 
1575 	sid = info->bus << 8 | info->devfn;
1576 	qdep = info->ats_qdep;
1577 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578 			   qdep, addr, mask);
1579 }
1580 
1581 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1582 				  u64 addr, unsigned mask)
1583 {
1584 	unsigned long flags;
1585 	struct device_domain_info *info;
1586 	struct subdev_domain_info *sinfo;
1587 
1588 	if (!domain->has_iotlb_device)
1589 		return;
1590 
1591 	spin_lock_irqsave(&device_domain_lock, flags);
1592 	list_for_each_entry(info, &domain->devices, link)
1593 		__iommu_flush_dev_iotlb(info, addr, mask);
1594 
1595 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1596 		info = get_domain_info(sinfo->pdev);
1597 		__iommu_flush_dev_iotlb(info, addr, mask);
1598 	}
1599 	spin_unlock_irqrestore(&device_domain_lock, flags);
1600 }
1601 
1602 static void domain_flush_piotlb(struct intel_iommu *iommu,
1603 				struct dmar_domain *domain,
1604 				u64 addr, unsigned long npages, bool ih)
1605 {
1606 	u16 did = domain->iommu_did[iommu->seq_id];
1607 
1608 	if (domain->default_pasid)
1609 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1610 				addr, npages, ih);
1611 
1612 	if (!list_empty(&domain->devices))
1613 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1614 }
1615 
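/*
 * Informational note for iommu_flush_iotlb_psi() below: the address mask
 * is the log2 of the page count rounded up to a power of two, so e.g. a
 * 9-page invalidation uses mask 4, i.e. a naturally aligned 16-page
 * flush region.
 */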
1616 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1617 				  struct dmar_domain *domain,
1618 				  unsigned long pfn, unsigned int pages,
1619 				  int ih, int map)
1620 {
1621 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1622 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1623 	u16 did = domain->iommu_did[iommu->seq_id];
1624 
1625 	BUG_ON(pages == 0);
1626 
1627 	if (ih)
1628 		ih = 1 << 6;
1629 
1630 	if (domain_use_first_level(domain)) {
1631 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1632 	} else {
1633 		/*
1634 		 * Fall back to domain-selective flush if there is no PSI support
1635 		 * or the size is too big. PSI requires the page size to be 2 ^ x
1636 		 * and the base address to be naturally aligned to that size.
1637 		 */
1638 		if (!cap_pgsel_inv(iommu->cap) ||
1639 		    mask > cap_max_amask_val(iommu->cap))
1640 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1641 							DMA_TLB_DSI_FLUSH);
1642 		else
1643 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1644 							DMA_TLB_PSI_FLUSH);
1645 	}
1646 
1647 	/*
1648 	 * In caching mode, changes of pages from non-present to present require
1649 	 * a flush. However, the device IOTLB doesn't need to be flushed here.
1650 	 */
1651 	if (!cap_caching_mode(iommu->cap) || !map)
1652 		iommu_flush_dev_iotlb(domain, addr, mask);
1653 }
1654 
1655 /* Notification for newly created mappings */
1656 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1657 					struct dmar_domain *domain,
1658 					unsigned long pfn, unsigned int pages)
1659 {
1660 	/*
1661 	 * It's a non-present to present mapping. Only flush if caching mode
1662 	 * is in use and this is second-level translation.
1663 	 */
1664 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1665 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1666 	else
1667 		iommu_flush_write_buffer(iommu);
1668 }
1669 
1670 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1671 {
1672 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1673 	int idx;
1674 
1675 	for_each_domain_iommu(idx, dmar_domain) {
1676 		struct intel_iommu *iommu = g_iommus[idx];
1677 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1678 
1679 		if (domain_use_first_level(dmar_domain))
1680 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1681 		else
1682 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1683 						 DMA_TLB_DSI_FLUSH);
1684 
1685 		if (!cap_caching_mode(iommu->cap))
1686 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1687 					      0, MAX_AGAW_PFN_WIDTH);
1688 	}
1689 }
1690 
1691 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1692 {
1693 	u32 pmen;
1694 	unsigned long flags;
1695 
1696 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1697 		return;
1698 
1699 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1701 	pmen &= ~DMA_PMEN_EPM;
1702 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1703 
1704 	/* wait for the protected region status bit to clear */
1705 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1706 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1707 
1708 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1709 }
1710 
1711 static void iommu_enable_translation(struct intel_iommu *iommu)
1712 {
1713 	u32 sts;
1714 	unsigned long flags;
1715 
1716 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1717 	iommu->gcmd |= DMA_GCMD_TE;
1718 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1719 
1720 	/* Make sure hardware completes it */
1721 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1722 		      readl, (sts & DMA_GSTS_TES), sts);
1723 
1724 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1725 }
1726 
1727 static void iommu_disable_translation(struct intel_iommu *iommu)
1728 {
1729 	u32 sts;
1730 	unsigned long flag;
1731 
1732 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1733 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1734 		return;
1735 
1736 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1737 	iommu->gcmd &= ~DMA_GCMD_TE;
1738 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1739 
1740 	/* Make sure hardware completes it */
1741 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1742 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1743 
1744 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1745 }
1746 
1747 static int iommu_init_domains(struct intel_iommu *iommu)
1748 {
1749 	u32 ndomains, nlongs;
1750 	size_t size;
1751 
1752 	ndomains = cap_ndoms(iommu->cap);
1753 	pr_debug("%s: Number of Domains supported <%d>\n",
1754 		 iommu->name, ndomains);
1755 	nlongs = BITS_TO_LONGS(ndomains);
1756 
1757 	spin_lock_init(&iommu->lock);
1758 
1759 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1760 	if (!iommu->domain_ids) {
1761 		pr_err("%s: Allocating domain id array failed\n",
1762 		       iommu->name);
1763 		return -ENOMEM;
1764 	}
1765 
1766 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1767 	iommu->domains = kzalloc(size, GFP_KERNEL);
1768 
1769 	if (iommu->domains) {
1770 		size = 256 * sizeof(struct dmar_domain *);
1771 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1772 	}
1773 
1774 	if (!iommu->domains || !iommu->domains[0]) {
1775 		pr_err("%s: Allocating domain array failed\n",
1776 		       iommu->name);
1777 		kfree(iommu->domain_ids);
1778 		kfree(iommu->domains);
1779 		iommu->domain_ids = NULL;
1780 		iommu->domains    = NULL;
1781 		return -ENOMEM;
1782 	}
1783 
1784 	/*
1785 	 * If Caching mode is set, then invalid translations are tagged
1786 	 * with domain-id 0, hence we need to pre-allocate it. We also
1787 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1788 	 * make sure it is not used for a real domain.
1789 	 */
1790 	set_bit(0, iommu->domain_ids);
1791 
1792 	/*
1793 	 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1794 	 * entry for first-level or pass-through translation modes should
1795 	 * be programmed with a domain id different from those used for
1796 	 * second-level or nested translation. We reserve a domain id for
1797 	 * this purpose.
1798 	 */
1799 	if (sm_supported(iommu))
1800 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1801 
1802 	return 0;
1803 }
1804 
1805 static void disable_dmar_iommu(struct intel_iommu *iommu)
1806 {
1807 	struct device_domain_info *info, *tmp;
1808 	unsigned long flags;
1809 
1810 	if (!iommu->domains || !iommu->domain_ids)
1811 		return;
1812 
1813 	spin_lock_irqsave(&device_domain_lock, flags);
1814 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1815 		if (info->iommu != iommu)
1816 			continue;
1817 
1818 		if (!info->dev || !info->domain)
1819 			continue;
1820 
1821 		__dmar_remove_one_dev_info(info);
1822 	}
1823 	spin_unlock_irqrestore(&device_domain_lock, flags);
1824 
1825 	if (iommu->gcmd & DMA_GCMD_TE)
1826 		iommu_disable_translation(iommu);
1827 }
1828 
1829 static void free_dmar_iommu(struct intel_iommu *iommu)
1830 {
1831 	if ((iommu->domains) && (iommu->domain_ids)) {
1832 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1833 		int i;
1834 
1835 		for (i = 0; i < elems; i++)
1836 			kfree(iommu->domains[i]);
1837 		kfree(iommu->domains);
1838 		kfree(iommu->domain_ids);
1839 		iommu->domains = NULL;
1840 		iommu->domain_ids = NULL;
1841 	}
1842 
1843 	g_iommus[iommu->seq_id] = NULL;
1844 
1845 	/* free context mapping */
1846 	free_context_table(iommu);
1847 
1848 #ifdef CONFIG_INTEL_IOMMU_SVM
1849 	if (pasid_supported(iommu)) {
1850 		if (ecap_prs(iommu->ecap))
1851 			intel_svm_finish_prq(iommu);
1852 	}
1853 	if (vccap_pasid(iommu->vccap))
1854 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1855 
1856 #endif
1857 }
1858 
1859 /*
1860  * Check and return whether first level is used by default for
1861  * DMA translation.
1862  */
1863 static bool first_level_by_default(void)
1864 {
1865 	struct dmar_drhd_unit *drhd;
1866 	struct intel_iommu *iommu;
1867 	static int first_level_support = -1;
1868 
1869 	if (likely(first_level_support != -1))
1870 		return first_level_support;
1871 
1872 	first_level_support = 1;
1873 
1874 	rcu_read_lock();
1875 	for_each_active_iommu(iommu, drhd) {
1876 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1877 			first_level_support = 0;
1878 			break;
1879 		}
1880 	}
1881 	rcu_read_unlock();
1882 
1883 	return first_level_support;
1884 }
1885 
1886 static struct dmar_domain *alloc_domain(int flags)
1887 {
1888 	struct dmar_domain *domain;
1889 
1890 	domain = alloc_domain_mem();
1891 	if (!domain)
1892 		return NULL;
1893 
1894 	memset(domain, 0, sizeof(*domain));
1895 	domain->nid = NUMA_NO_NODE;
1896 	domain->flags = flags;
1897 	if (first_level_by_default())
1898 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1899 	domain->has_iotlb_device = false;
1900 	INIT_LIST_HEAD(&domain->devices);
1901 	INIT_LIST_HEAD(&domain->subdevices);
1902 
1903 	return domain;
1904 }
1905 
1906 /* Must be called with device_domain_lock and iommu->lock held */
1907 static int domain_attach_iommu(struct dmar_domain *domain,
1908 			       struct intel_iommu *iommu)
1909 {
1910 	unsigned long ndomains;
1911 	int num;
1912 
1913 	assert_spin_locked(&device_domain_lock);
1914 	assert_spin_locked(&iommu->lock);
1915 
1916 	domain->iommu_refcnt[iommu->seq_id] += 1;
1917 	domain->iommu_count += 1;
1918 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1919 		ndomains = cap_ndoms(iommu->cap);
1920 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1921 
1922 		if (num >= ndomains) {
1923 			pr_err("%s: No free domain ids\n", iommu->name);
1924 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1925 			domain->iommu_count -= 1;
1926 			return -ENOSPC;
1927 		}
1928 
1929 		set_bit(num, iommu->domain_ids);
1930 		set_iommu_domain(iommu, num, domain);
1931 
1932 		domain->iommu_did[iommu->seq_id] = num;
1933 		domain->nid			 = iommu->node;
1934 
1935 		domain_update_iommu_cap(domain);
1936 	}
1937 
1938 	return 0;
1939 }
1940 
1941 static int domain_detach_iommu(struct dmar_domain *domain,
1942 			       struct intel_iommu *iommu)
1943 {
1944 	int num, count;
1945 
1946 	assert_spin_locked(&device_domain_lock);
1947 	assert_spin_locked(&iommu->lock);
1948 
1949 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1950 	count = --domain->iommu_count;
1951 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1952 		num = domain->iommu_did[iommu->seq_id];
1953 		clear_bit(num, iommu->domain_ids);
1954 		set_iommu_domain(iommu, num, NULL);
1955 
1956 		domain_update_iommu_cap(domain);
1957 		domain->iommu_did[iommu->seq_id] = 0;
1958 	}
1959 
1960 	return count;
1961 }
1962 
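/*
 * Round the guest address width up to the next adjusted width supported
 * by the page-table layout: (agaw - 12) must be a multiple of the 9-bit
 * stride. For example, gaw 48 stays 48, while gaw 50 becomes 57; anything
 * above 64 is clamped to 64.
 */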
1963 static inline int guestwidth_to_adjustwidth(int gaw)
1964 {
1965 	int agaw;
1966 	int r = (gaw - 12) % 9;
1967 
1968 	if (r == 0)
1969 		agaw = gaw;
1970 	else
1971 		agaw = gaw + 9 - r;
1972 	if (agaw > 64)
1973 		agaw = 64;
1974 	return agaw;
1975 }
1976 
1977 static void domain_exit(struct dmar_domain *domain)
1978 {
1979 
1980 	/* Remove associated devices and clear attached or cached domains */
1981 	domain_remove_dev_info(domain);
1982 
1983 	/* destroy iovas */
1984 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1985 		iommu_put_dma_cookie(&domain->domain);
1986 
1987 	if (domain->pgd) {
1988 		struct page *freelist;
1989 
1990 		freelist = domain_unmap(domain, 0,
1991 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1992 		dma_free_pagelist(freelist);
1993 	}
1994 
1995 	free_domain_mem(domain);
1996 }
1997 
1998 /*
1999  * Get the PASID directory size for scalable mode context entry.
2000  * Value of X in the PDTS field of a scalable mode context entry
2001  * indicates a PASID directory with 2^(X + 7) entries.
2002  */
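/*
 * Worked example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per
 * directory entry): max_pasid = 1 << 20 gives max_pde = 1 << 14, so
 * pds is 14 and the function returns 7. A PDTS value of 7 encodes a
 * directory with 2^(7 + 7) = 16384 entries, covering the full 20-bit
 * PASID space.
 */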
2003 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2004 {
2005 	int pds, max_pde;
2006 
2007 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2008 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2009 	if (pds < 7)
2010 		return 0;
2011 
2012 	return pds - 7;
2013 }
2014 
2015 /*
2016  * Set the RID_PASID field of a scalable mode context entry. The
2017  * IOMMU hardware will use the PASID value set in this field for
2018  * translation of DMA requests without PASID.
2019  */
2020 static inline void
2021 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2022 {
2023 	context->hi |= pasid & ((1 << 20) - 1);
2024 }
2025 
2026 /*
2027  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2028  * entry.
2029  */
2030 static inline void context_set_sm_dte(struct context_entry *context)
2031 {
2032 	context->lo |= (1 << 2);
2033 }
2034 
2035 /*
2036  * Set the PRE(Page Request Enable) field of a scalable mode context
2037  * entry.
2038  */
2039 static inline void context_set_sm_pre(struct context_entry *context)
2040 {
2041 	context->lo |= (1 << 4);
2042 }
2043 
2044 /* Convert value to context PASID directory size field coding. */
2045 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2046 
2047 static int domain_context_mapping_one(struct dmar_domain *domain,
2048 				      struct intel_iommu *iommu,
2049 				      struct pasid_table *table,
2050 				      u8 bus, u8 devfn)
2051 {
2052 	u16 did = domain->iommu_did[iommu->seq_id];
2053 	int translation = CONTEXT_TT_MULTI_LEVEL;
2054 	struct device_domain_info *info = NULL;
2055 	struct context_entry *context;
2056 	unsigned long flags;
2057 	int ret;
2058 
2059 	WARN_ON(did == 0);
2060 
2061 	if (hw_pass_through && domain_type_is_si(domain))
2062 		translation = CONTEXT_TT_PASS_THROUGH;
2063 
2064 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2065 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2066 
2067 	BUG_ON(!domain->pgd);
2068 
2069 	spin_lock_irqsave(&device_domain_lock, flags);
2070 	spin_lock(&iommu->lock);
2071 
2072 	ret = -ENOMEM;
2073 	context = iommu_context_addr(iommu, bus, devfn, 1);
2074 	if (!context)
2075 		goto out_unlock;
2076 
2077 	ret = 0;
2078 	if (context_present(context))
2079 		goto out_unlock;
2080 
2081 	/*
2082 	 * For kdump cases, old valid entries may be cached due to the
2083 	 * in-flight DMA and copied pgtable, but there is no unmapping
2084 	 * behaviour for them, thus we need an explicit cache flush for
2085 	 * the newly-mapped device. For kdump, at this point, the device
2086 	 * is supposed to finish reset at its driver probe stage, so no
2087 	 * in-flight DMA will exist, and we don't need to worry about it
2088 	 * hereafter.
2089 	 */
2090 	if (context_copied(context)) {
2091 		u16 did_old = context_domain_id(context);
2092 
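		/*
		 * The source-id used for the invalidation below is the bus
		 * number in bits 15:8 and devfn in bits 7:0.
		 */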
2093 		if (did_old < cap_ndoms(iommu->cap)) {
2094 			iommu->flush.flush_context(iommu, did_old,
2095 						   (((u16)bus) << 8) | devfn,
2096 						   DMA_CCMD_MASK_NOBIT,
2097 						   DMA_CCMD_DEVICE_INVL);
2098 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2099 						 DMA_TLB_DSI_FLUSH);
2100 		}
2101 	}
2102 
2103 	context_clear_entry(context);
2104 
2105 	if (sm_supported(iommu)) {
2106 		unsigned long pds;
2107 
2108 		WARN_ON(!table);
2109 
2110 		/* Setup the PASID DIR pointer: */
2111 		pds = context_get_sm_pds(table);
2112 		context->lo = (u64)virt_to_phys(table->table) |
2113 				context_pdts(pds);
2114 
2115 		/* Setup the RID_PASID field: */
2116 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2117 
2118 		/*
2119 		 * Setup the Device-TLB enable bit and Page request
2120 		 * Enable bit:
2121 		 */
2122 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2123 		if (info && info->ats_supported)
2124 			context_set_sm_dte(context);
2125 		if (info && info->pri_supported)
2126 			context_set_sm_pre(context);
2127 	} else {
2128 		struct dma_pte *pgd = domain->pgd;
2129 		int agaw;
2130 
2131 		context_set_domain_id(context, did);
2132 
2133 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2134 			/*
2135 			 * Skip top levels of page tables for an IOMMU whose
2136 			 * agaw is smaller than the default. Unnecessary for PT mode.
2137 			 */
2138 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2139 				ret = -ENOMEM;
2140 				pgd = phys_to_virt(dma_pte_addr(pgd));
2141 				if (!dma_pte_present(pgd))
2142 					goto out_unlock;
2143 			}
2144 
2145 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 			if (info && info->ats_supported)
2147 				translation = CONTEXT_TT_DEV_IOTLB;
2148 			else
2149 				translation = CONTEXT_TT_MULTI_LEVEL;
2150 
2151 			context_set_address_root(context, virt_to_phys(pgd));
2152 			context_set_address_width(context, agaw);
2153 		} else {
2154 			/*
2155 			 * In pass through mode, AW must be programmed to
2156 			 * indicate the largest AGAW value supported by
2157 			 * hardware. And ASR is ignored by hardware.
2158 			 */
2159 			context_set_address_width(context, iommu->msagaw);
2160 		}
2161 
2162 		context_set_translation_type(context, translation);
2163 	}
2164 
2165 	context_set_fault_enable(context);
2166 	context_set_present(context);
2167 	if (!ecap_coherent(iommu->ecap))
2168 		clflush_cache_range(context, sizeof(*context));
2169 
2170 	/*
2171 	 * It's a non-present to present mapping. If hardware doesn't cache
2172 	 * non-present entries we only need to flush the write-buffer. If it
2173 	 * _does_ cache non-present entries, then it does so in the special
2174 	 * domain #0, which we have to flush:
2175 	 */
2176 	if (cap_caching_mode(iommu->cap)) {
2177 		iommu->flush.flush_context(iommu, 0,
2178 					   (((u16)bus) << 8) | devfn,
2179 					   DMA_CCMD_MASK_NOBIT,
2180 					   DMA_CCMD_DEVICE_INVL);
2181 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2182 	} else {
2183 		iommu_flush_write_buffer(iommu);
2184 	}
2185 	iommu_enable_dev_iotlb(info);
2186 
2187 	ret = 0;
2188 
2189 out_unlock:
2190 	spin_unlock(&iommu->lock);
2191 	spin_unlock_irqrestore(&device_domain_lock, flags);
2192 
2193 	return ret;
2194 }
2195 
2196 struct domain_context_mapping_data {
2197 	struct dmar_domain *domain;
2198 	struct intel_iommu *iommu;
2199 	struct pasid_table *table;
2200 };
2201 
2202 static int domain_context_mapping_cb(struct pci_dev *pdev,
2203 				     u16 alias, void *opaque)
2204 {
2205 	struct domain_context_mapping_data *data = opaque;
2206 
2207 	return domain_context_mapping_one(data->domain, data->iommu,
2208 					  data->table, PCI_BUS_NUM(alias),
2209 					  alias & 0xff);
2210 }
2211 
2212 static int
2213 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2214 {
2215 	struct domain_context_mapping_data data;
2216 	struct pasid_table *table;
2217 	struct intel_iommu *iommu;
2218 	u8 bus, devfn;
2219 
2220 	iommu = device_to_iommu(dev, &bus, &devfn);
2221 	if (!iommu)
2222 		return -ENODEV;
2223 
2224 	table = intel_pasid_get_table(dev);
2225 
2226 	if (!dev_is_pci(dev))
2227 		return domain_context_mapping_one(domain, iommu, table,
2228 						  bus, devfn);
2229 
2230 	data.domain = domain;
2231 	data.iommu = iommu;
2232 	data.table = table;
2233 
2234 	return pci_for_each_dma_alias(to_pci_dev(dev),
2235 				      &domain_context_mapping_cb, &data);
2236 }
2237 
2238 static int domain_context_mapped_cb(struct pci_dev *pdev,
2239 				    u16 alias, void *opaque)
2240 {
2241 	struct intel_iommu *iommu = opaque;
2242 
2243 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2244 }
2245 
2246 static int domain_context_mapped(struct device *dev)
2247 {
2248 	struct intel_iommu *iommu;
2249 	u8 bus, devfn;
2250 
2251 	iommu = device_to_iommu(dev, &bus, &devfn);
2252 	if (!iommu)
2253 		return -ENODEV;
2254 
2255 	if (!dev_is_pci(dev))
2256 		return device_context_mapped(iommu, bus, devfn);
2257 
2258 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2259 				       domain_context_mapped_cb, iommu);
2260 }
2261 
2262 /* Returns a number of VTD pages, but aligned to MM page size */
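/*
 * Example (with 4KiB MM and VT-d pages): host_addr = 0x1234 and
 * size = 0x2000 leave an in-page offset of 0x234, so PAGE_ALIGN()
 * rounds 0x2234 up to 0x3000 and three VT-d pages are returned.
 */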
2263 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2264 					    size_t size)
2265 {
2266 	host_addr &= ~PAGE_MASK;
2267 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2268 }
2269 
2270 /* Return largest possible superpage level for a given mapping */
2271 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2272 					  unsigned long iov_pfn,
2273 					  unsigned long phy_pfn,
2274 					  unsigned long pages)
2275 {
2276 	int support, level = 1;
2277 	unsigned long pfnmerge;
2278 
2279 	support = domain->iommu_superpage;
2280 
2281 	/* To use a large page, the virtual *and* physical addresses
2282 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2283 	   of them will mean we have to use smaller pages. So just
2284 	   merge them and check both at once. */
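	/*
	 * For example, with a 9-bit stride: if iov_pfn and phy_pfn are both
	 * 2MiB-aligned (low 9 bits clear) and at least 512 pages remain,
	 * level 2 (a 2MiB superpage) may be used, provided the domain
	 * supports at least one superpage level.
	 */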
2285 	pfnmerge = iov_pfn | phy_pfn;
2286 
2287 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2288 		pages >>= VTD_STRIDE_SHIFT;
2289 		if (!pages)
2290 			break;
2291 		pfnmerge >>= VTD_STRIDE_SHIFT;
2292 		level++;
2293 		support--;
2294 	}
2295 	return level;
2296 }
2297 
2298 static int
2299 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2300 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2301 {
2302 	struct dma_pte *first_pte = NULL, *pte = NULL;
2303 	unsigned int largepage_lvl = 0;
2304 	unsigned long lvl_pages = 0;
2305 	phys_addr_t pteval;
2306 	u64 attr;
2307 
2308 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2309 
2310 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2311 		return -EINVAL;
2312 
2313 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2314 	if (domain_use_first_level(domain))
2315 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2316 
2317 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2318 
2319 	while (nr_pages > 0) {
2320 		uint64_t tmp;
2321 
2322 		if (!pte) {
2323 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2324 					phys_pfn, nr_pages);
2325 
2326 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2327 			if (!pte)
2328 				return -ENOMEM;
2329 			/* It is a large page */
2330 			if (largepage_lvl > 1) {
2331 				unsigned long nr_superpages, end_pfn;
2332 
2333 				pteval |= DMA_PTE_LARGE_PAGE;
2334 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2335 
2336 				nr_superpages = nr_pages / lvl_pages;
2337 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2338 
2339 				/*
2340 				 * Ensure that old small page tables are
2341 				 * removed to make room for superpage(s).
2342 				 * We're adding new large pages, so make sure
2343 				 * we don't remove their parent tables.
2344 				 */
2345 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2346 						       largepage_lvl + 1);
2347 			} else {
2348 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2349 			}
2350 
2351 		}
2352 		/* We don't need a lock here; nobody else
2353 		 * touches this IOVA range.
2354 		 */
2355 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2356 		if (tmp) {
2357 			static int dumps = 5;
2358 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2359 				iov_pfn, tmp, (unsigned long long)pteval);
2360 			if (dumps) {
2361 				dumps--;
2362 				debug_dma_dump_mappings(NULL);
2363 			}
2364 			WARN_ON(1);
2365 		}
2366 
2367 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2368 
2369 		BUG_ON(nr_pages < lvl_pages);
2370 
2371 		nr_pages -= lvl_pages;
2372 		iov_pfn += lvl_pages;
2373 		phys_pfn += lvl_pages;
2374 		pteval += lvl_pages * VTD_PAGE_SIZE;
2375 
2376 		/* If the next PTE would be the first in a new page, then we
2377 		 * need to flush the cache on the entries we've just written.
2378 		 * And then we'll need to recalculate 'pte', so clear it and
2379 		 * let it get set again in the if (!pte) block above.
2380 		 *
2381 		 * If we're done (!nr_pages) we need to flush the cache too.
2382 		 *
2383 		 * Also if we've been setting superpages, we may need to
2384 		 * recalculate 'pte' and switch back to smaller pages for the
2385 		 * end of the mapping, if the trailing size is not enough to
2386 		 * use another superpage (i.e. nr_pages < lvl_pages).
2387 		 */
2388 		pte++;
2389 		if (!nr_pages || first_pte_in_page(pte) ||
2390 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2391 			domain_flush_cache(domain, first_pte,
2392 					   (void *)pte - (void *)first_pte);
2393 			pte = NULL;
2394 		}
2395 	}
2396 
2397 	return 0;
2398 }
2399 
2400 static int
2401 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2402 	       unsigned long phys_pfn, unsigned long nr_pages, int prot)
2403 {
2404 	int iommu_id, ret;
2405 	struct intel_iommu *iommu;
2406 
2407 	/* Do the real mapping first */
2408 	ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2409 	if (ret)
2410 		return ret;
2411 
2412 	for_each_domain_iommu(iommu_id, domain) {
2413 		iommu = g_iommus[iommu_id];
2414 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2415 	}
2416 
2417 	return 0;
2418 }
2419 
2420 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2421 {
2422 	unsigned long flags;
2423 	struct context_entry *context;
2424 	u16 did_old;
2425 
2426 	if (!iommu)
2427 		return;
2428 
2429 	spin_lock_irqsave(&iommu->lock, flags);
2430 	context = iommu_context_addr(iommu, bus, devfn, 0);
2431 	if (!context) {
2432 		spin_unlock_irqrestore(&iommu->lock, flags);
2433 		return;
2434 	}
2435 	did_old = context_domain_id(context);
2436 	context_clear_entry(context);
2437 	__iommu_flush_cache(iommu, context, sizeof(*context));
2438 	spin_unlock_irqrestore(&iommu->lock, flags);
2439 	iommu->flush.flush_context(iommu,
2440 				   did_old,
2441 				   (((u16)bus) << 8) | devfn,
2442 				   DMA_CCMD_MASK_NOBIT,
2443 				   DMA_CCMD_DEVICE_INVL);
2444 	iommu->flush.flush_iotlb(iommu,
2445 				 did_old,
2446 				 0,
2447 				 0,
2448 				 DMA_TLB_DSI_FLUSH);
2449 }
2450 
2451 static inline void unlink_domain_info(struct device_domain_info *info)
2452 {
2453 	assert_spin_locked(&device_domain_lock);
2454 	list_del(&info->link);
2455 	list_del(&info->global);
2456 	if (info->dev)
2457 		dev_iommu_priv_set(info->dev, NULL);
2458 }
2459 
2460 static void domain_remove_dev_info(struct dmar_domain *domain)
2461 {
2462 	struct device_domain_info *info, *tmp;
2463 	unsigned long flags;
2464 
2465 	spin_lock_irqsave(&device_domain_lock, flags);
2466 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2467 		__dmar_remove_one_dev_info(info);
2468 	spin_unlock_irqrestore(&device_domain_lock, flags);
2469 }
2470 
2471 struct dmar_domain *find_domain(struct device *dev)
2472 {
2473 	struct device_domain_info *info;
2474 
2475 	if (unlikely(!dev || !dev->iommu))
2476 		return NULL;
2477 
2478 	if (unlikely(attach_deferred(dev)))
2479 		return NULL;
2480 
2481 	/* No lock here, assumes no domain exit in normal case */
2482 	info = get_domain_info(dev);
2483 	if (likely(info))
2484 		return info->domain;
2485 
2486 	return NULL;
2487 }
2488 
2489 static inline struct device_domain_info *
2490 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2491 {
2492 	struct device_domain_info *info;
2493 
2494 	list_for_each_entry(info, &device_domain_list, global)
2495 		if (info->segment == segment && info->bus == bus &&
2496 		    info->devfn == devfn)
2497 			return info;
2498 
2499 	return NULL;
2500 }
2501 
2502 static int domain_setup_first_level(struct intel_iommu *iommu,
2503 				    struct dmar_domain *domain,
2504 				    struct device *dev,
2505 				    u32 pasid)
2506 {
2507 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2508 	struct dma_pte *pgd = domain->pgd;
2509 	int agaw, level;
2510 
2511 	/*
2512 	 * Skip top levels of page tables for an IOMMU whose
2513 	 * agaw is smaller than the default. Unnecessary for PT mode.
2514 	 */
2515 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2516 		pgd = phys_to_virt(dma_pte_addr(pgd));
2517 		if (!dma_pte_present(pgd))
2518 			return -ENOMEM;
2519 	}
2520 
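	/*
	 * First-level translation uses the same paging structures as the
	 * CPU: agaw 48 maps to 4-level (48-bit) and agaw 57 to 5-level
	 * (57-bit) page tables; anything else is rejected here.
	 */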
2521 	level = agaw_to_level(agaw);
2522 	if (level != 4 && level != 5)
2523 		return -EINVAL;
2524 
2525 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2526 
2527 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2528 					     domain->iommu_did[iommu->seq_id],
2529 					     flags);
2530 }
2531 
2532 static bool dev_is_real_dma_subdevice(struct device *dev)
2533 {
2534 	return dev && dev_is_pci(dev) &&
2535 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2536 }
2537 
2538 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2539 						    int bus, int devfn,
2540 						    struct device *dev,
2541 						    struct dmar_domain *domain)
2542 {
2543 	struct dmar_domain *found = NULL;
2544 	struct device_domain_info *info;
2545 	unsigned long flags;
2546 	int ret;
2547 
2548 	info = alloc_devinfo_mem();
2549 	if (!info)
2550 		return NULL;
2551 
2552 	if (!dev_is_real_dma_subdevice(dev)) {
2553 		info->bus = bus;
2554 		info->devfn = devfn;
2555 		info->segment = iommu->segment;
2556 	} else {
2557 		struct pci_dev *pdev = to_pci_dev(dev);
2558 
2559 		info->bus = pdev->bus->number;
2560 		info->devfn = pdev->devfn;
2561 		info->segment = pci_domain_nr(pdev->bus);
2562 	}
2563 
2564 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2565 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2566 	info->ats_qdep = 0;
2567 	info->dev = dev;
2568 	info->domain = domain;
2569 	info->iommu = iommu;
2570 	info->pasid_table = NULL;
2571 	info->auxd_enabled = 0;
2572 	INIT_LIST_HEAD(&info->subdevices);
2573 
2574 	if (dev && dev_is_pci(dev)) {
2575 		struct pci_dev *pdev = to_pci_dev(info->dev);
2576 
2577 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2578 		    pci_ats_supported(pdev) &&
2579 		    dmar_find_matched_atsr_unit(pdev))
2580 			info->ats_supported = 1;
2581 
2582 		if (sm_supported(iommu)) {
2583 			if (pasid_supported(iommu)) {
2584 				int features = pci_pasid_features(pdev);
2585 				if (features >= 0)
2586 					info->pasid_supported = features | 1;
2587 			}
2588 
2589 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2590 			    pci_pri_supported(pdev))
2591 				info->pri_supported = 1;
2592 		}
2593 	}
2594 
2595 	spin_lock_irqsave(&device_domain_lock, flags);
2596 	if (dev)
2597 		found = find_domain(dev);
2598 
2599 	if (!found) {
2600 		struct device_domain_info *info2;
2601 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2602 						       info->devfn);
2603 		if (info2) {
2604 			found      = info2->domain;
2605 			info2->dev = dev;
2606 		}
2607 	}
2608 
2609 	if (found) {
2610 		spin_unlock_irqrestore(&device_domain_lock, flags);
2611 		free_devinfo_mem(info);
2612 		/* Caller must free the original domain */
2613 		return found;
2614 	}
2615 
2616 	spin_lock(&iommu->lock);
2617 	ret = domain_attach_iommu(domain, iommu);
2618 	spin_unlock(&iommu->lock);
2619 
2620 	if (ret) {
2621 		spin_unlock_irqrestore(&device_domain_lock, flags);
2622 		free_devinfo_mem(info);
2623 		return NULL;
2624 	}
2625 
2626 	list_add(&info->link, &domain->devices);
2627 	list_add(&info->global, &device_domain_list);
2628 	if (dev)
2629 		dev_iommu_priv_set(dev, info);
2630 	spin_unlock_irqrestore(&device_domain_lock, flags);
2631 
2632 	/* PASID table is mandatory for a PCI device in scalable mode. */
2633 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2634 		ret = intel_pasid_alloc_table(dev);
2635 		if (ret) {
2636 			dev_err(dev, "PASID table allocation failed\n");
2637 			dmar_remove_one_dev_info(dev);
2638 			return NULL;
2639 		}
2640 
2641 		/* Setup the PASID entry for requests without PASID: */
2642 		spin_lock_irqsave(&iommu->lock, flags);
2643 		if (hw_pass_through && domain_type_is_si(domain))
2644 			ret = intel_pasid_setup_pass_through(iommu, domain,
2645 					dev, PASID_RID2PASID);
2646 		else if (domain_use_first_level(domain))
2647 			ret = domain_setup_first_level(iommu, domain, dev,
2648 					PASID_RID2PASID);
2649 		else
2650 			ret = intel_pasid_setup_second_level(iommu, domain,
2651 					dev, PASID_RID2PASID);
2652 		spin_unlock_irqrestore(&iommu->lock, flags);
2653 		if (ret) {
2654 			dev_err(dev, "Setup RID2PASID failed\n");
2655 			dmar_remove_one_dev_info(dev);
2656 			return NULL;
2657 		}
2658 	}
2659 
2660 	if (dev && domain_context_mapping(domain, dev)) {
2661 		dev_err(dev, "Domain context map failed\n");
2662 		dmar_remove_one_dev_info(dev);
2663 		return NULL;
2664 	}
2665 
2666 	return domain;
2667 }
2668 
2669 static int iommu_domain_identity_map(struct dmar_domain *domain,
2670 				     unsigned long first_vpfn,
2671 				     unsigned long last_vpfn)
2672 {
2673 	/*
2674 	 * The RMRR range might overlap with the physical memory range,
2675 	 * so clear it first.
2676 	 */
2677 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2678 
2679 	return __domain_mapping(domain, first_vpfn,
2680 				first_vpfn, last_vpfn - first_vpfn + 1,
2681 				DMA_PTE_READ|DMA_PTE_WRITE);
2682 }
2683 
2684 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2685 
2686 static int __init si_domain_init(int hw)
2687 {
2688 	struct dmar_rmrr_unit *rmrr;
2689 	struct device *dev;
2690 	int i, nid, ret;
2691 
2692 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2693 	if (!si_domain)
2694 		return -EFAULT;
2695 
2696 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2697 		domain_exit(si_domain);
2698 		return -EFAULT;
2699 	}
2700 
2701 	if (hw)
2702 		return 0;
2703 
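	/*
	 * mm_to_dma_pfn() converts an MM pfn into a VT-d pfn; with 4KiB MM
	 * pages the two are identical, so this maps every usable RAM range
	 * 1:1 into the si_domain.
	 */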
2704 	for_each_online_node(nid) {
2705 		unsigned long start_pfn, end_pfn;
2706 		int i;
2707 
2708 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2709 			ret = iommu_domain_identity_map(si_domain,
2710 					mm_to_dma_pfn(start_pfn),
2711 					mm_to_dma_pfn(end_pfn));
2712 			if (ret)
2713 				return ret;
2714 		}
2715 	}
2716 
2717 	/*
2718 	 * Identity map the RMRRs so that devices with RMRRs can also use
2719 	 * the si_domain.
2720 	 */
2721 	for_each_rmrr_units(rmrr) {
2722 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2723 					  i, dev) {
2724 			unsigned long long start = rmrr->base_address;
2725 			unsigned long long end = rmrr->end_address;
2726 
2727 			if (WARN_ON(end < start ||
2728 				    end >> agaw_to_width(si_domain->agaw)))
2729 				continue;
2730 
2731 			ret = iommu_domain_identity_map(si_domain,
2732 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2733 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2734 			if (ret)
2735 				return ret;
2736 		}
2737 	}
2738 
2739 	return 0;
2740 }
2741 
2742 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2743 {
2744 	struct dmar_domain *ndomain;
2745 	struct intel_iommu *iommu;
2746 	u8 bus, devfn;
2747 
2748 	iommu = device_to_iommu(dev, &bus, &devfn);
2749 	if (!iommu)
2750 		return -ENODEV;
2751 
2752 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2753 	if (ndomain != domain)
2754 		return -EBUSY;
2755 
2756 	return 0;
2757 }
2758 
2759 static bool device_has_rmrr(struct device *dev)
2760 {
2761 	struct dmar_rmrr_unit *rmrr;
2762 	struct device *tmp;
2763 	int i;
2764 
2765 	rcu_read_lock();
2766 	for_each_rmrr_units(rmrr) {
2767 		/*
2768 		 * Return TRUE if this RMRR contains the device that
2769 		 * is passed in.
2770 		 */
2771 		for_each_active_dev_scope(rmrr->devices,
2772 					  rmrr->devices_cnt, i, tmp)
2773 			if (tmp == dev ||
2774 			    is_downstream_to_pci_bridge(dev, tmp)) {
2775 				rcu_read_unlock();
2776 				return true;
2777 			}
2778 	}
2779 	rcu_read_unlock();
2780 	return false;
2781 }
2782 
2783 /**
2784  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2785  * is relaxable (i.e. is allowed to be not enforced under some conditions)
2786  * @dev: device handle
2787  *
2788  * We assume that PCI USB devices with RMRRs have them largely
2789  * for historical reasons and that the RMRR space is not actively used post
2790  * boot.  This exclusion may change if vendors begin to abuse it.
2791  *
2792  * The same exception is made for graphics devices, with the requirement that
2793  * any use of the RMRR regions will be torn down before assigning the device
2794  * to a guest.
2795  *
2796  * Return: true if the RMRR is relaxable, false otherwise
2797  */
2798 static bool device_rmrr_is_relaxable(struct device *dev)
2799 {
2800 	struct pci_dev *pdev;
2801 
2802 	if (!dev_is_pci(dev))
2803 		return false;
2804 
2805 	pdev = to_pci_dev(dev);
2806 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2807 		return true;
2808 	else
2809 		return false;
2810 }
2811 
2812 /*
2813  * There are a couple cases where we need to restrict the functionality of
2814  * devices associated with RMRRs.  The first is when evaluating a device for
2815  * identity mapping because problems exist when devices are moved in and out
2816  * of domains and their respective RMRR information is lost.  This means that
2817  * a device with associated RMRRs will never be in a "passthrough" domain.
2818  * The second is use of the device through the IOMMU API.  This interface
2819  * expects to have full control of the IOVA space for the device.  We cannot
2820  * satisfy both the requirement that RMRR access is maintained and have an
2821  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2822  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2823  * We therefore prevent devices associated with an RMRR from participating in
2824  * the IOMMU API, which eliminates them from device assignment.
2825  *
2826  * In both cases, devices which have relaxable RMRRs are not concerned by this
2827  * restriction. See device_rmrr_is_relaxable comment.
2828  */
2829 static bool device_is_rmrr_locked(struct device *dev)
2830 {
2831 	if (!device_has_rmrr(dev))
2832 		return false;
2833 
2834 	if (device_rmrr_is_relaxable(dev))
2835 		return false;
2836 
2837 	return true;
2838 }
2839 
2840 /*
2841  * Return the required default domain type for a specific device.
2842  *
2843  * @dev: the device in query
2845  *
2846  * Returns:
2847  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2848  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2849  *  - 0: both identity and dynamic domains work for this device
2850  */
2851 static int device_def_domain_type(struct device *dev)
2852 {
2853 	if (dev_is_pci(dev)) {
2854 		struct pci_dev *pdev = to_pci_dev(dev);
2855 
2856 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2857 			return IOMMU_DOMAIN_IDENTITY;
2858 
2859 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2860 			return IOMMU_DOMAIN_IDENTITY;
2861 	}
2862 
2863 	return 0;
2864 }
2865 
2866 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2867 {
2868 	/*
2869 	 * Start from a sane IOMMU hardware state.
2870 	 * If queued invalidation was already initialized by us
2871 	 * (for example, while enabling interrupt remapping) then
2872 	 * things are already rolling from a sane state.
2873 	 */
2874 	if (!iommu->qi) {
2875 		/*
2876 		 * Clear any previous faults.
2877 		 */
2878 		dmar_fault(-1, iommu);
2879 		/*
2880 		 * Disable queued invalidation if supported and already enabled
2881 		 * before OS handover.
2882 		 */
2883 		dmar_disable_qi(iommu);
2884 	}
2885 
2886 	if (dmar_enable_qi(iommu)) {
2887 		/*
2888 		 * Queued invalidation is not enabled; use register-based invalidation.
2889 		 */
2890 		iommu->flush.flush_context = __iommu_flush_context;
2891 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2892 		pr_info("%s: Using Register based invalidation\n",
2893 			iommu->name);
2894 	} else {
2895 		iommu->flush.flush_context = qi_flush_context;
2896 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2897 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2898 	}
2899 }
2900 
2901 static int copy_context_table(struct intel_iommu *iommu,
2902 			      struct root_entry *old_re,
2903 			      struct context_entry **tbl,
2904 			      int bus, bool ext)
2905 {
2906 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2907 	struct context_entry *new_ce = NULL, ce;
2908 	struct context_entry *old_ce = NULL;
2909 	struct root_entry re;
2910 	phys_addr_t old_ce_phys;
2911 
2912 	tbl_idx = ext ? bus * 2 : bus;
2913 	memcpy(&re, old_re, sizeof(re));
2914 
2915 	for (devfn = 0; devfn < 256; devfn++) {
2916 		/* First calculate the correct index */
2917 		idx = (ext ? devfn * 2 : devfn) % 256;
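		/*
		 * Extended context entries are twice the size of legacy
		 * ones, so a 4KiB table holds only 128 of them: idx wraps
		 * back to 0 at devfn 0x80 and a second table per bus is
		 * started below.
		 */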
2918 
2919 		if (idx == 0) {
2920 			/* First save what we may have and clean up */
2921 			if (new_ce) {
2922 				tbl[tbl_idx] = new_ce;
2923 				__iommu_flush_cache(iommu, new_ce,
2924 						    VTD_PAGE_SIZE);
2925 				pos = 1;
2926 			}
2927 
2928 			if (old_ce)
2929 				memunmap(old_ce);
2930 
2931 			ret = 0;
2932 			if (devfn < 0x80)
2933 				old_ce_phys = root_entry_lctp(&re);
2934 			else
2935 				old_ce_phys = root_entry_uctp(&re);
2936 
2937 			if (!old_ce_phys) {
2938 				if (ext && devfn == 0) {
2939 					/* No LCTP, try UCTP */
2940 					devfn = 0x7f;
2941 					continue;
2942 				} else {
2943 					goto out;
2944 				}
2945 			}
2946 
2947 			ret = -ENOMEM;
2948 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2949 					MEMREMAP_WB);
2950 			if (!old_ce)
2951 				goto out;
2952 
2953 			new_ce = alloc_pgtable_page(iommu->node);
2954 			if (!new_ce)
2955 				goto out_unmap;
2956 
2957 			ret = 0;
2958 		}
2959 
2960 		/* Now copy the context entry */
2961 		memcpy(&ce, old_ce + idx, sizeof(ce));
2962 
2963 		if (!__context_present(&ce))
2964 			continue;
2965 
2966 		did = context_domain_id(&ce);
2967 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2968 			set_bit(did, iommu->domain_ids);
2969 
2970 		/*
2971 		 * We need a marker for copied context entries. This
2972 		 * marker needs to work for the old format as well as
2973 		 * for extended context entries.
2974 		 *
2975 		 * Bit 67 of the context entry is used. In the old
2976 		 * format this bit is available to software, in the
2977 		 * extended format it is the PGE bit, but PGE is ignored
2978 		 * by HW if PASIDs are disabled (and thus still
2979 		 * available).
2980 		 *
2981 		 * So disable PASIDs first and then mark the entry
2982 		 * copied. This means that we don't copy PASID
2983 		 * translations from the old kernel, but this is fine as
2984 		 * faults there are not fatal.
2985 		 */
2986 		context_clear_pasid_enable(&ce);
2987 		context_set_copied(&ce);
2988 
2989 		new_ce[idx] = ce;
2990 	}
2991 
2992 	tbl[tbl_idx + pos] = new_ce;
2993 
2994 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2995 
2996 out_unmap:
2997 	memunmap(old_ce);
2998 
2999 out:
3000 	return ret;
3001 }
3002 
3003 static int copy_translation_tables(struct intel_iommu *iommu)
3004 {
3005 	struct context_entry **ctxt_tbls;
3006 	struct root_entry *old_rt;
3007 	phys_addr_t old_rt_phys;
3008 	int ctxt_table_entries;
3009 	unsigned long flags;
3010 	u64 rtaddr_reg;
3011 	int bus, ret;
3012 	bool new_ext, ext;
3013 
3014 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3015 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3016 	new_ext    = !!ecap_ecs(iommu->ecap);
3017 
3018 	/*
3019 	 * The RTT bit can only be changed when translation is disabled,
3020 	 * but disabling translation means opening a window for data
3021 	 * corruption. So bail out and don't copy anything if we would
3022 	 * have to change the bit.
3023 	 */
3024 	if (new_ext != ext)
3025 		return -EINVAL;
3026 
3027 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3028 	if (!old_rt_phys)
3029 		return -EINVAL;
3030 
3031 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3032 	if (!old_rt)
3033 		return -ENOMEM;
3034 
3035 	/* This is too big for the stack - allocate it from slab */
3036 	ctxt_table_entries = ext ? 512 : 256;
3037 	ret = -ENOMEM;
3038 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3039 	if (!ctxt_tbls)
3040 		goto out_unmap;
3041 
3042 	for (bus = 0; bus < 256; bus++) {
3043 		ret = copy_context_table(iommu, &old_rt[bus],
3044 					 ctxt_tbls, bus, ext);
3045 		if (ret) {
3046 			pr_err("%s: Failed to copy context table for bus %d\n",
3047 				iommu->name, bus);
3048 			continue;
3049 		}
3050 	}
3051 
3052 	spin_lock_irqsave(&iommu->lock, flags);
3053 
3054 	/* Context tables are copied, now write them to the root_entry table */
3055 	for (bus = 0; bus < 256; bus++) {
3056 		int idx = ext ? bus * 2 : bus;
3057 		u64 val;
3058 
3059 		if (ctxt_tbls[idx]) {
3060 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3061 			iommu->root_entry[bus].lo = val;
3062 		}
3063 
3064 		if (!ext || !ctxt_tbls[idx + 1])
3065 			continue;
3066 
3067 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3068 		iommu->root_entry[bus].hi = val;
3069 	}
3070 
3071 	spin_unlock_irqrestore(&iommu->lock, flags);
3072 
3073 	kfree(ctxt_tbls);
3074 
3075 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3076 
3077 	ret = 0;
3078 
3079 out_unmap:
3080 	memunmap(old_rt);
3081 
3082 	return ret;
3083 }
3084 
3085 #ifdef CONFIG_INTEL_IOMMU_SVM
3086 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3087 {
3088 	struct intel_iommu *iommu = data;
3089 	ioasid_t ioasid;
3090 
3091 	if (!iommu)
3092 		return INVALID_IOASID;
3093 	/*
3094 	 * VT-d virtual command interface always uses the full 20 bit
3095 	 * PASID range. The host can partition the guest PASID range based
3096 	 * on policies, but that is out of the guest's control.
3097 	 */
3098 	if (min < PASID_MIN || max > intel_pasid_max_id)
3099 		return INVALID_IOASID;
3100 
3101 	if (vcmd_alloc_pasid(iommu, &ioasid))
3102 		return INVALID_IOASID;
3103 
3104 	return ioasid;
3105 }
3106 
3107 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3108 {
3109 	struct intel_iommu *iommu = data;
3110 
3111 	if (!iommu)
3112 		return;
3113 	/*
3114 	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3115 	 * We can only free the PASID when all the devices are unbound.
3116 	 */
3117 	if (ioasid_find(NULL, ioasid, NULL)) {
3118 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3119 		return;
3120 	}
3121 	vcmd_free_pasid(iommu, ioasid);
3122 }
3123 
3124 static void register_pasid_allocator(struct intel_iommu *iommu)
3125 {
3126 	/*
3127 	 * If we are running in the host, there is no need for a custom
3128 	 * allocator, since PASIDs are allocated from the host system-wide.
3129 	 */
3130 	if (!cap_caching_mode(iommu->cap))
3131 		return;
3132 
3133 	if (!sm_supported(iommu)) {
3134 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3135 		return;
3136 	}
3137 
3138 	/*
3139 	 * Register a custom PASID allocator if we are running in a guest;
3140 	 * guest PASIDs must be obtained via the virtual command interface.
3141 	 * There can be multiple vIOMMUs in each guest but only one allocator
3142 	 * is active. All vIOMMU allocators will eventually call the same
3143 	 * host allocator.
3144 	 */
3145 	if (!vccap_pasid(iommu->vccap))
3146 		return;
3147 
3148 	pr_info("Register custom PASID allocator\n");
3149 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3150 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3151 	iommu->pasid_allocator.pdata = (void *)iommu;
3152 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3153 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3154 		/*
3155 		 * Disable scalable mode on this IOMMU if there
3156 		 * is no custom allocator. Mixing SM-capable and
3157 		 * non-SM vIOMMUs is not supported.
3158 		 */
3159 		intel_iommu_sm = 0;
3160 	}
3161 }
3162 #endif
3163 
3164 static int __init init_dmars(void)
3165 {
3166 	struct dmar_drhd_unit *drhd;
3167 	struct intel_iommu *iommu;
3168 	int ret;
3169 
3170 	/*
3171 	 * for each drhd
3172 	 *    allocate root
3173 	 *    initialize and program root entry to not present
3174 	 * endfor
3175 	 */
3176 	for_each_drhd_unit(drhd) {
3177 		/*
3178 		 * Lock not needed as this is only incremented in the
3179 		 * single-threaded kernel __init code path; all other
3180 		 * accesses are read-only.
3181 		 */
3182 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3183 			g_num_of_iommus++;
3184 			continue;
3185 		}
3186 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3187 	}
3188 
3189 	/* Preallocate enough resources for IOMMU hot-addition */
3190 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3191 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3192 
3193 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3194 			GFP_KERNEL);
3195 	if (!g_iommus) {
3196 		pr_err("Allocating global iommu array failed\n");
3197 		ret = -ENOMEM;
3198 		goto error;
3199 	}
3200 
3201 	for_each_iommu(iommu, drhd) {
3202 		if (drhd->ignored) {
3203 			iommu_disable_translation(iommu);
3204 			continue;
3205 		}
3206 
3207 		/*
3208 		 * Find the max PASID size of all IOMMUs in the system.
3209 		 * We need to ensure the system PASID table is no bigger
3210 		 * than the smallest supported size.
3211 		 */
3212 		if (pasid_supported(iommu)) {
3213 			u32 temp = 2 << ecap_pss(iommu->ecap);
3214 
3215 			intel_pasid_max_id = min_t(u32, temp,
3216 						   intel_pasid_max_id);
3217 		}
3218 
3219 		g_iommus[iommu->seq_id] = iommu;
3220 
3221 		intel_iommu_init_qi(iommu);
3222 
3223 		ret = iommu_init_domains(iommu);
3224 		if (ret)
3225 			goto free_iommu;
3226 
3227 		init_translation_status(iommu);
3228 
3229 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3230 			iommu_disable_translation(iommu);
3231 			clear_translation_pre_enabled(iommu);
3232 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3233 				iommu->name);
3234 		}
3235 
3236 		/*
3237 		 * TBD:
3238 		 * We could share the same root & context tables
3239 		 * among all IOMMUs. Needs to be split later.
3240 		 */
3241 		ret = iommu_alloc_root_entry(iommu);
3242 		if (ret)
3243 			goto free_iommu;
3244 
3245 		if (translation_pre_enabled(iommu)) {
3246 			pr_info("Translation already enabled - trying to copy translation structures\n");
3247 
3248 			ret = copy_translation_tables(iommu);
3249 			if (ret) {
3250 				/*
3251 				 * We found the IOMMU with translation
3252 				 * enabled - but failed to copy over the
3253 				 * old root-entry table. Try to proceed
3254 				 * by disabling translation now and
3255 				 * allocating a clean root-entry table.
3256 				 * This might cause DMAR faults, but
3257 				 * probably the dump will still succeed.
3258 				 */
3259 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3260 				       iommu->name);
3261 				iommu_disable_translation(iommu);
3262 				clear_translation_pre_enabled(iommu);
3263 			} else {
3264 				pr_info("Copied translation tables from previous kernel for %s\n",
3265 					iommu->name);
3266 			}
3267 		}
3268 
3269 		if (!ecap_pass_through(iommu->ecap))
3270 			hw_pass_through = 0;
3271 		intel_svm_check(iommu);
3272 	}
3273 
3274 	/*
3275 	 * Now that qi is enabled on all iommus, set the root entry and flush
3276 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3277 	 * flush_context function will loop forever and the boot hangs.
3278 	 */
3279 	for_each_active_iommu(iommu, drhd) {
3280 		iommu_flush_write_buffer(iommu);
3281 #ifdef CONFIG_INTEL_IOMMU_SVM
3282 		register_pasid_allocator(iommu);
3283 #endif
3284 		iommu_set_root_entry(iommu);
3285 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3286 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3287 	}
3288 
3289 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3290 	dmar_map_gfx = 0;
3291 #endif
3292 
3293 	if (!dmar_map_gfx)
3294 		iommu_identity_mapping |= IDENTMAP_GFX;
3295 
3296 	check_tylersburg_isoch();
3297 
3298 	ret = si_domain_init(hw_pass_through);
3299 	if (ret)
3300 		goto free_iommu;
3301 
3302 	/*
3303 	 * for each drhd
3304 	 *   enable fault log
3305 	 *   global invalidate context cache
3306 	 *   global invalidate iotlb
3307 	 *   enable translation
3308 	 */
3309 	for_each_iommu(iommu, drhd) {
3310 		if (drhd->ignored) {
3311 			/*
3312 			 * we always have to disable PMRs or DMA may fail on
3313 			 * this device
3314 			 */
3315 			if (force_on)
3316 				iommu_disable_protect_mem_regions(iommu);
3317 			continue;
3318 		}
3319 
3320 		iommu_flush_write_buffer(iommu);
3321 
3322 #ifdef CONFIG_INTEL_IOMMU_SVM
3323 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3324 			/*
3325 			 * Calling dmar_alloc_hwirq() with dmar_global_lock
3326 			 * held could cause a lock race condition.
3327 			 */
3328 			up_write(&dmar_global_lock);
3329 			ret = intel_svm_enable_prq(iommu);
3330 			down_write(&dmar_global_lock);
3331 			if (ret)
3332 				goto free_iommu;
3333 		}
3334 #endif
3335 		ret = dmar_set_interrupt(iommu);
3336 		if (ret)
3337 			goto free_iommu;
3338 	}
3339 
3340 	return 0;
3341 
3342 free_iommu:
3343 	for_each_active_iommu(iommu, drhd) {
3344 		disable_dmar_iommu(iommu);
3345 		free_dmar_iommu(iommu);
3346 	}
3347 
3348 	kfree(g_iommus);
3349 
3350 error:
3351 	return ret;
3352 }
3353 
3354 static inline int iommu_domain_cache_init(void)
3355 {
3356 	int ret = 0;
3357 
3358 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3359 					 sizeof(struct dmar_domain),
3360 					 0,
3361 					 SLAB_HWCACHE_ALIGN,
3362 
3363 					 NULL);
3364 	if (!iommu_domain_cache) {
3365 		pr_err("Couldn't create iommu_domain cache\n");
3366 		ret = -ENOMEM;
3367 	}
3368 
3369 	return ret;
3370 }
3371 
3372 static inline int iommu_devinfo_cache_init(void)
3373 {
3374 	int ret = 0;
3375 
3376 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3377 					 sizeof(struct device_domain_info),
3378 					 0,
3379 					 SLAB_HWCACHE_ALIGN,
3380 					 NULL);
3381 	if (!iommu_devinfo_cache) {
3382 		pr_err("Couldn't create devinfo cache\n");
3383 		ret = -ENOMEM;
3384 	}
3385 
3386 	return ret;
3387 }
3388 
3389 static int __init iommu_init_mempool(void)
3390 {
3391 	int ret;
3392 	ret = iova_cache_get();
3393 	if (ret)
3394 		return ret;
3395 
3396 	ret = iommu_domain_cache_init();
3397 	if (ret)
3398 		goto domain_error;
3399 
3400 	ret = iommu_devinfo_cache_init();
3401 	if (!ret)
3402 		return ret;
3403 
3404 	kmem_cache_destroy(iommu_domain_cache);
3405 domain_error:
3406 	iova_cache_put();
3407 
3408 	return -ENOMEM;
3409 }
3410 
3411 static void __init iommu_exit_mempool(void)
3412 {
3413 	kmem_cache_destroy(iommu_devinfo_cache);
3414 	kmem_cache_destroy(iommu_domain_cache);
3415 	iova_cache_put();
3416 }
3417 
3418 static void __init init_no_remapping_devices(void)
3419 {
3420 	struct dmar_drhd_unit *drhd;
3421 	struct device *dev;
3422 	int i;
3423 
3424 	for_each_drhd_unit(drhd) {
3425 		if (!drhd->include_all) {
3426 			for_each_active_dev_scope(drhd->devices,
3427 						  drhd->devices_cnt, i, dev)
3428 				break;
3429 			/* ignore DMAR unit if no devices exist */
3430 			if (i == drhd->devices_cnt)
3431 				drhd->ignored = 1;
3432 		}
3433 	}
3434 
3435 	for_each_active_drhd_unit(drhd) {
3436 		if (drhd->include_all)
3437 			continue;
3438 
3439 		for_each_active_dev_scope(drhd->devices,
3440 					  drhd->devices_cnt, i, dev)
3441 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3442 				break;
3443 		if (i < drhd->devices_cnt)
3444 			continue;
3445 
3446 		/* This IOMMU has *only* gfx devices. Either bypass it or
3447 		   set the gfx_dedicated flag, as appropriate */
3448 		drhd->gfx_dedicated = 1;
3449 		if (!dmar_map_gfx)
3450 			drhd->ignored = 1;
3451 	}
3452 }
3453 
3454 #ifdef CONFIG_SUSPEND
3455 static int init_iommu_hw(void)
3456 {
3457 	struct dmar_drhd_unit *drhd;
3458 	struct intel_iommu *iommu = NULL;
3459 
3460 	for_each_active_iommu(iommu, drhd)
3461 		if (iommu->qi)
3462 			dmar_reenable_qi(iommu);
3463 
3464 	for_each_iommu(iommu, drhd) {
3465 		if (drhd->ignored) {
3466 			/*
3467 			 * we always have to disable PMRs or DMA may fail on
3468 			 * this device
3469 			 */
3470 			if (force_on)
3471 				iommu_disable_protect_mem_regions(iommu);
3472 			continue;
3473 		}
3474 
3475 		iommu_flush_write_buffer(iommu);
3476 
3477 		iommu_set_root_entry(iommu);
3478 
3479 		iommu->flush.flush_context(iommu, 0, 0, 0,
3480 					   DMA_CCMD_GLOBAL_INVL);
3481 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3482 		iommu_enable_translation(iommu);
3483 		iommu_disable_protect_mem_regions(iommu);
3484 	}
3485 
3486 	return 0;
3487 }
3488 
3489 static void iommu_flush_all(void)
3490 {
3491 	struct dmar_drhd_unit *drhd;
3492 	struct intel_iommu *iommu;
3493 
3494 	for_each_active_iommu(iommu, drhd) {
3495 		iommu->flush.flush_context(iommu, 0, 0, 0,
3496 					   DMA_CCMD_GLOBAL_INVL);
3497 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3498 					 DMA_TLB_GLOBAL_FLUSH);
3499 	}
3500 }
3501 
3502 static int iommu_suspend(void)
3503 {
3504 	struct dmar_drhd_unit *drhd;
3505 	struct intel_iommu *iommu = NULL;
3506 	unsigned long flag;
3507 
3508 	for_each_active_iommu(iommu, drhd) {
3509 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3510 					     GFP_KERNEL);
3511 		if (!iommu->iommu_state)
3512 			goto nomem;
3513 	}
3514 
3515 	iommu_flush_all();
3516 
3517 	for_each_active_iommu(iommu, drhd) {
3518 		iommu_disable_translation(iommu);
3519 
3520 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3521 
3522 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3523 			readl(iommu->reg + DMAR_FECTL_REG);
3524 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3525 			readl(iommu->reg + DMAR_FEDATA_REG);
3526 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3527 			readl(iommu->reg + DMAR_FEADDR_REG);
3528 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3529 			readl(iommu->reg + DMAR_FEUADDR_REG);
3530 
3531 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3532 	}
3533 	return 0;
3534 
3535 nomem:
3536 	for_each_active_iommu(iommu, drhd)
3537 		kfree(iommu->iommu_state);
3538 
3539 	return -ENOMEM;
3540 }
3541 
3542 static void iommu_resume(void)
3543 {
3544 	struct dmar_drhd_unit *drhd;
3545 	struct intel_iommu *iommu = NULL;
3546 	unsigned long flag;
3547 
3548 	if (init_iommu_hw()) {
3549 		if (force_on)
3550 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3551 		else
3552 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3553 		return;
3554 	}
3555 
3556 	for_each_active_iommu(iommu, drhd) {
3557 
3558 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3559 
3560 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3561 			iommu->reg + DMAR_FECTL_REG);
3562 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3563 			iommu->reg + DMAR_FEDATA_REG);
3564 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3565 			iommu->reg + DMAR_FEADDR_REG);
3566 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3567 			iommu->reg + DMAR_FEUADDR_REG);
3568 
3569 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3570 	}
3571 
3572 	for_each_active_iommu(iommu, drhd)
3573 		kfree(iommu->iommu_state);
3574 }
3575 
3576 static struct syscore_ops iommu_syscore_ops = {
3577 	.resume		= iommu_resume,
3578 	.suspend	= iommu_suspend,
3579 };
3580 
3581 static void __init init_iommu_pm_ops(void)
3582 {
3583 	register_syscore_ops(&iommu_syscore_ops);
3584 }
3585 
3586 #else
3587 static inline void init_iommu_pm_ops(void) {}
3588 #endif	/* CONFIG_PM */
3589 
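/*
 * An RMRR describes an inclusive [base, end] address range, so base must
 * be page aligned and (end + 1) must be page aligned as well. For example,
 * [0xd8000, 0xdffff] passes the checks below.
 */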
3590 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3591 {
3592 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3593 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3594 	    rmrr->end_address <= rmrr->base_address ||
3595 	    arch_rmrr_sanity_check(rmrr))
3596 		return -EINVAL;
3597 
3598 	return 0;
3599 }
3600 
3601 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3602 {
3603 	struct acpi_dmar_reserved_memory *rmrr;
3604 	struct dmar_rmrr_unit *rmrru;
3605 
3606 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3607 	if (rmrr_sanity_check(rmrr)) {
3608 		pr_warn(FW_BUG
3609 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3610 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3611 			   rmrr->base_address, rmrr->end_address,
3612 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3613 			   dmi_get_system_info(DMI_BIOS_VERSION),
3614 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3615 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3616 	}
3617 
3618 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3619 	if (!rmrru)
3620 		goto out;
3621 
3622 	rmrru->hdr = header;
3623 
3624 	rmrru->base_address = rmrr->base_address;
3625 	rmrru->end_address = rmrr->end_address;
3626 
3627 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3628 				((void *)rmrr) + rmrr->header.length,
3629 				&rmrru->devices_cnt);
3630 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3631 		goto free_rmrru;
3632 
3633 	list_add(&rmrru->list, &dmar_rmrr_units);
3634 
3635 	return 0;
3636 free_rmrru:
3637 	kfree(rmrru);
3638 out:
3639 	return -ENOMEM;
3640 }
3641 
3642 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3643 {
3644 	struct dmar_atsr_unit *atsru;
3645 	struct acpi_dmar_atsr *tmp;
3646 
3647 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3648 				dmar_rcu_check()) {
3649 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3650 		if (atsr->segment != tmp->segment)
3651 			continue;
3652 		if (atsr->header.length != tmp->header.length)
3653 			continue;
3654 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3655 			return atsru;
3656 	}
3657 
3658 	return NULL;
3659 }
3660 
3661 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3662 {
3663 	struct acpi_dmar_atsr *atsr;
3664 	struct dmar_atsr_unit *atsru;
3665 
3666 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3667 		return 0;
3668 
3669 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3670 	atsru = dmar_find_atsr(atsr);
3671 	if (atsru)
3672 		return 0;
3673 
3674 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3675 	if (!atsru)
3676 		return -ENOMEM;
3677 
3678 	/*
3679 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3680 	 * copy the memory content because the memory buffer will be freed
3681 	 * on return.
3682 	 */
3683 	atsru->hdr = (void *)(atsru + 1);
3684 	memcpy(atsru->hdr, hdr, hdr->length);
3685 	atsru->include_all = atsr->flags & 0x1;
3686 	if (!atsru->include_all) {
3687 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3688 				(void *)atsr + atsr->header.length,
3689 				&atsru->devices_cnt);
3690 		if (atsru->devices_cnt && atsru->devices == NULL) {
3691 			kfree(atsru);
3692 			return -ENOMEM;
3693 		}
3694 	}
3695 
3696 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3697 
3698 	return 0;
3699 }
3700 
3701 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3702 {
3703 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3704 	kfree(atsru);
3705 }
3706 
3707 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3708 {
3709 	struct acpi_dmar_atsr *atsr;
3710 	struct dmar_atsr_unit *atsru;
3711 
3712 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3713 	atsru = dmar_find_atsr(atsr);
3714 	if (atsru) {
3715 		list_del_rcu(&atsru->list);
3716 		synchronize_rcu();
3717 		intel_iommu_free_atsr(atsru);
3718 	}
3719 
3720 	return 0;
3721 }
3722 
3723 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3724 {
3725 	int i;
3726 	struct device *dev;
3727 	struct acpi_dmar_atsr *atsr;
3728 	struct dmar_atsr_unit *atsru;
3729 
3730 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3731 	atsru = dmar_find_atsr(atsr);
3732 	if (!atsru)
3733 		return 0;
3734 
3735 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3736 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3737 					  i, dev)
3738 			return -EBUSY;
3739 	}
3740 
3741 	return 0;
3742 }
3743 
3744 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3745 {
3746 	int sp, ret;
3747 	struct intel_iommu *iommu = dmaru->iommu;
3748 
3749 	if (g_iommus[iommu->seq_id])
3750 		return 0;
3751 
3752 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3753 		pr_warn("%s: Doesn't support hardware pass through.\n",
3754 			iommu->name);
3755 		return -ENXIO;
3756 	}
3757 	if (!ecap_sc_support(iommu->ecap) &&
3758 	    domain_update_iommu_snooping(iommu)) {
3759 		pr_warn("%s: Doesn't support snooping.\n",
3760 			iommu->name);
3761 		return -ENXIO;
3762 	}
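	/*
	 * sp == 0 corresponds to 2MiB and sp == 1 to 1GiB superpages; the
	 * hot-added IOMMU must support whatever level is already in use,
	 * as advertised by the SLLPS bits of its capability register.
	 */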
3763 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3764 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3765 		pr_warn("%s: Doesn't support large page.\n",
3766 			iommu->name);
3767 		return -ENXIO;
3768 	}
3769 
3770 	/*
3771 	 * Disable translation if already enabled prior to OS handover.
3772 	 */
3773 	if (iommu->gcmd & DMA_GCMD_TE)
3774 		iommu_disable_translation(iommu);
3775 
3776 	g_iommus[iommu->seq_id] = iommu;
3777 	ret = iommu_init_domains(iommu);
3778 	if (ret == 0)
3779 		ret = iommu_alloc_root_entry(iommu);
3780 	if (ret)
3781 		goto out;
3782 
3783 	intel_svm_check(iommu);
3784 
3785 	if (dmaru->ignored) {
3786 		/*
3787 		 * we always have to disable PMRs or DMA may fail on this device
3788 		 */
3789 		if (force_on)
3790 			iommu_disable_protect_mem_regions(iommu);
3791 		return 0;
3792 	}
3793 
3794 	intel_iommu_init_qi(iommu);
3795 	iommu_flush_write_buffer(iommu);
3796 
3797 #ifdef CONFIG_INTEL_IOMMU_SVM
3798 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3799 		ret = intel_svm_enable_prq(iommu);
3800 		if (ret)
3801 			goto disable_iommu;
3802 	}
3803 #endif
3804 	ret = dmar_set_interrupt(iommu);
3805 	if (ret)
3806 		goto disable_iommu;
3807 
3808 	iommu_set_root_entry(iommu);
3809 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3810 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3811 	iommu_enable_translation(iommu);
3812 
3813 	iommu_disable_protect_mem_regions(iommu);
3814 	return 0;
3815 
3816 disable_iommu:
3817 	disable_dmar_iommu(iommu);
3818 out:
3819 	free_dmar_iommu(iommu);
3820 	return ret;
3821 }
3822 
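/*
 * DMAR unit hotplug handler: on insertion bring the new IOMMU online via
 * intel_iommu_add(); on removal disable it and free its resources.
 */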
3823 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3824 {
3825 	int ret = 0;
3826 	struct intel_iommu *iommu = dmaru->iommu;
3827 
3828 	if (!intel_iommu_enabled)
3829 		return 0;
3830 	if (iommu == NULL)
3831 		return -EINVAL;
3832 
3833 	if (insert) {
3834 		ret = intel_iommu_add(dmaru);
3835 	} else {
3836 		disable_dmar_iommu(iommu);
3837 		free_dmar_iommu(iommu);
3838 	}
3839 
3840 	return ret;
3841 }
3842 
3843 static void intel_iommu_free_dmars(void)
3844 {
3845 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3846 	struct dmar_atsr_unit *atsru, *atsr_n;
3847 
3848 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3849 		list_del(&rmrru->list);
3850 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3851 		kfree(rmrru);
3852 	}
3853 
3854 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3855 		list_del(&atsru->list);
3856 		intel_iommu_free_atsr(atsru);
3857 	}
3858 }
3859 
3860 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3861 {
3862 	int i, ret = 1;
3863 	struct pci_bus *bus;
3864 	struct pci_dev *bridge = NULL;
3865 	struct device *tmp;
3866 	struct acpi_dmar_atsr *atsr;
3867 	struct dmar_atsr_unit *atsru;
3868 
3869 	dev = pci_physfn(dev);
3870 	for (bus = dev->bus; bus; bus = bus->parent) {
3871 		bridge = bus->self;
3872 		/* If it's an integrated device, allow ATS */
3873 		if (!bridge)
3874 			return 1;
3875 		/* Connected via non-PCIe: no ATS */
3876 		if (!pci_is_pcie(bridge) ||
3877 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3878 			return 0;
3879 		/* If we found the root port, look it up in the ATSR */
3880 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3881 			break;
3882 	}
3883 
3884 	rcu_read_lock();
3885 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3886 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3887 		if (atsr->segment != pci_domain_nr(dev->bus))
3888 			continue;
3889 
3890 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3891 			if (tmp == &bridge->dev)
3892 				goto out;
3893 
3894 		if (atsru->include_all)
3895 			goto out;
3896 	}
3897 	ret = 0;
3898 out:
3899 	rcu_read_unlock();
3900 
3901 	return ret;
3902 }
3903 
3904 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3905 {
3906 	int ret;
3907 	struct dmar_rmrr_unit *rmrru;
3908 	struct dmar_atsr_unit *atsru;
3909 	struct acpi_dmar_atsr *atsr;
3910 	struct acpi_dmar_reserved_memory *rmrr;
3911 
3912 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3913 		return 0;
3914 
3915 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3916 		rmrr = container_of(rmrru->hdr,
3917 				    struct acpi_dmar_reserved_memory, header);
3918 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3919 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3920 				((void *)rmrr) + rmrr->header.length,
3921 				rmrr->segment, rmrru->devices,
3922 				rmrru->devices_cnt);
3923 			if (ret < 0)
3924 				return ret;
3925 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3926 			dmar_remove_dev_scope(info, rmrr->segment,
3927 				rmrru->devices, rmrru->devices_cnt);
3928 		}
3929 	}
3930 
3931 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3932 		if (atsru->include_all)
3933 			continue;
3934 
3935 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3936 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3937 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3938 					(void *)atsr + atsr->header.length,
3939 					atsr->segment, atsru->devices,
3940 					atsru->devices_cnt);
3941 			if (ret > 0)
3942 				break;
3943 			else if (ret < 0)
3944 				return ret;
3945 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3946 			if (dmar_remove_dev_scope(info, atsr->segment,
3947 					atsru->devices, atsru->devices_cnt))
3948 				break;
3949 		}
3950 	}
3951 
3952 	return 0;
3953 }
3954 
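/*
 * Memory hotplug notifier: extend the static identity (si_domain) mapping
 * when a memory range is going online, and unmap it again (flushing the
 * IOTLBs) when the range goes offline or onlining is cancelled.
 */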
3955 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3956 				       unsigned long val, void *v)
3957 {
3958 	struct memory_notify *mhp = v;
3959 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3960 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3961 			mhp->nr_pages - 1);
3962 
3963 	switch (val) {
3964 	case MEM_GOING_ONLINE:
3965 		if (iommu_domain_identity_map(si_domain,
3966 					      start_vpfn, last_vpfn)) {
3967 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3968 				start_vpfn, last_vpfn);
3969 			return NOTIFY_BAD;
3970 		}
3971 		break;
3972 
3973 	case MEM_OFFLINE:
3974 	case MEM_CANCEL_ONLINE:
3975 		{
3976 			struct dmar_drhd_unit *drhd;
3977 			struct intel_iommu *iommu;
3978 			struct page *freelist;
3979 
3980 			freelist = domain_unmap(si_domain,
3981 						start_vpfn, last_vpfn,
3982 						NULL);
3983 
3984 			rcu_read_lock();
3985 			for_each_active_iommu(iommu, drhd)
3986 				iommu_flush_iotlb_psi(iommu, si_domain,
3987 					start_vpfn, mhp->nr_pages,
3988 					!freelist, 0);
3989 			rcu_read_unlock();
3990 			dma_free_pagelist(freelist);
3991 		}
3992 		break;
3993 	}
3994 
3995 	return NOTIFY_OK;
3996 }
3997 
3998 static struct notifier_block intel_iommu_memory_nb = {
3999 	.notifier_call = intel_iommu_memory_notifier,
4000 	.priority = 0
4001 };
4002 
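/*
 * Release the per-CPU cached IOVAs held by @cpu for every DMA-API domain;
 * called from the CPU hotplug "dead" callback below.
 */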
4003 static void free_all_cpu_cached_iovas(unsigned int cpu)
4004 {
4005 	int i;
4006 
4007 	for (i = 0; i < g_num_of_iommus; i++) {
4008 		struct intel_iommu *iommu = g_iommus[i];
4009 		struct dmar_domain *domain;
4010 		int did;
4011 
4012 		if (!iommu)
4013 			continue;
4014 
4015 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4016 			domain = get_iommu_domain(iommu, (u16)did);
4017 
4018 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4019 				continue;
4020 
4021 			iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4022 		}
4023 	}
4024 }
4025 
4026 static int intel_iommu_cpu_dead(unsigned int cpu)
4027 {
4028 	free_all_cpu_cached_iovas(cpu);
4029 	return 0;
4030 }
4031 
4032 static void intel_disable_iommus(void)
4033 {
4034 	struct intel_iommu *iommu = NULL;
4035 	struct dmar_drhd_unit *drhd;
4036 
4037 	for_each_iommu(iommu, drhd)
4038 		iommu_disable_translation(iommu);
4039 }
4040 
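/*
 * Quiesce the DMA remapping hardware at shutdown: explicitly disable the
 * protected memory regions and then turn off translation on every IOMMU.
 */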
4041 void intel_iommu_shutdown(void)
4042 {
4043 	struct dmar_drhd_unit *drhd;
4044 	struct intel_iommu *iommu = NULL;
4045 
4046 	if (no_iommu || dmar_disabled)
4047 		return;
4048 
4049 	down_write(&dmar_global_lock);
4050 
4051 	/* Disable PMRs explicitly here. */
4052 	for_each_iommu(iommu, drhd)
4053 		iommu_disable_protect_mem_regions(iommu);
4054 
4055 	/* Make sure the IOMMUs are switched off */
4056 	intel_disable_iommus();
4057 
4058 	up_write(&dmar_global_lock);
4059 }
4060 
4061 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4062 {
4063 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4064 
4065 	return container_of(iommu_dev, struct intel_iommu, iommu);
4066 }
4067 
4068 static ssize_t intel_iommu_show_version(struct device *dev,
4069 					struct device_attribute *attr,
4070 					char *buf)
4071 {
4072 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4073 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4074 	return sprintf(buf, "%d:%d\n",
4075 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4076 }
4077 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4078 
4079 static ssize_t intel_iommu_show_address(struct device *dev,
4080 					struct device_attribute *attr,
4081 					char *buf)
4082 {
4083 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4084 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4085 }
4086 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4087 
4088 static ssize_t intel_iommu_show_cap(struct device *dev,
4089 				    struct device_attribute *attr,
4090 				    char *buf)
4091 {
4092 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4093 	return sprintf(buf, "%llx\n", iommu->cap);
4094 }
4095 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4096 
4097 static ssize_t intel_iommu_show_ecap(struct device *dev,
4098 				    struct device_attribute *attr,
4099 				    char *buf)
4100 {
4101 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4102 	return sprintf(buf, "%llx\n", iommu->ecap);
4103 }
4104 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4105 
4106 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4107 				      struct device_attribute *attr,
4108 				      char *buf)
4109 {
4110 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4111 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4112 }
4113 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4114 
4115 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4116 					   struct device_attribute *attr,
4117 					   char *buf)
4118 {
4119 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4120 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4121 						  cap_ndoms(iommu->cap)));
4122 }
4123 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4124 
4125 static struct attribute *intel_iommu_attrs[] = {
4126 	&dev_attr_version.attr,
4127 	&dev_attr_address.attr,
4128 	&dev_attr_cap.attr,
4129 	&dev_attr_ecap.attr,
4130 	&dev_attr_domains_supported.attr,
4131 	&dev_attr_domains_used.attr,
4132 	NULL,
4133 };
4134 
4135 static struct attribute_group intel_iommu_group = {
4136 	.name = "intel-iommu",
4137 	.attrs = intel_iommu_attrs,
4138 };
4139 
4140 const struct attribute_group *intel_iommu_groups[] = {
4141 	&intel_iommu_group,
4142 	NULL,
4143 };
4144 
4145 static inline bool has_external_pci(void)
4146 {
4147 	struct pci_dev *pdev = NULL;
4148 
4149 	for_each_pci_dev(pdev)
4150 		if (pdev->external_facing)
4151 			return true;
4152 
4153 	return false;
4154 }
4155 
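/*
 * If the platform has opted in to DMA protection and an external-facing
 * PCI device is present, force the IOMMU on even when it was disabled on
 * the command line; when it had been disabled, default to passthrough mode.
 */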
4156 static int __init platform_optin_force_iommu(void)
4157 {
4158 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4159 		return 0;
4160 
4161 	if (no_iommu || dmar_disabled)
4162 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4163 
4164 	/*
4165 	 * If Intel-IOMMU is disabled by default, we will apply identity
4166 	 * map for all devices except those marked as being untrusted.
4167 	 */
4168 	if (dmar_disabled)
4169 		iommu_set_default_passthrough(false);
4170 
4171 	dmar_disabled = 0;
4172 	no_iommu = 0;
4173 
4174 	return 1;
4175 }
4176 
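/*
 * Walk the ACPI namespace devices listed in each DRHD device scope and
 * probe their physical companion devices that are not yet part of an
 * IOMMU group.
 */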
4177 static int __init probe_acpi_namespace_devices(void)
4178 {
4179 	struct dmar_drhd_unit *drhd;
4180 	/* To avoid a -Wunused-but-set-variable warning. */
4181 	struct intel_iommu *iommu __maybe_unused;
4182 	struct device *dev;
4183 	int i, ret = 0;
4184 
4185 	for_each_active_iommu(iommu, drhd) {
4186 		for_each_active_dev_scope(drhd->devices,
4187 					  drhd->devices_cnt, i, dev) {
4188 			struct acpi_device_physical_node *pn;
4189 			struct iommu_group *group;
4190 			struct acpi_device *adev;
4191 
4192 			if (dev->bus != &acpi_bus_type)
4193 				continue;
4194 
4195 			adev = to_acpi_device(dev);
4196 			mutex_lock(&adev->physical_node_lock);
4197 			list_for_each_entry(pn,
4198 					    &adev->physical_node_list, node) {
4199 				group = iommu_group_get(pn->dev);
4200 				if (group) {
4201 					iommu_group_put(group);
4202 					continue;
4203 				}
4204 
4205 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4206 				ret = iommu_probe_device(pn->dev);
4207 				if (ret)
4208 					break;
4209 			}
4210 			mutex_unlock(&adev->physical_node_lock);
4211 
4212 			if (ret)
4213 				return ret;
4214 		}
4215 	}
4216 
4217 	return 0;
4218 }
4219 
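/*
 * Main initialization entry point: parse the DMAR table and device scopes,
 * set up the DMA remapping hardware (unless remapping is disabled), register
 * the IOMMUs with sysfs and the IOMMU core, hook up memory and CPU hotplug
 * notifiers, and finally enable translation on every unit.
 */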
4220 int __init intel_iommu_init(void)
4221 {
4222 	int ret = -ENODEV;
4223 	struct dmar_drhd_unit *drhd;
4224 	struct intel_iommu *iommu;
4225 
4226 	/*
4227 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4228 	 * opt in, so enforce that.
4229 	 */
4230 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4231 		    platform_optin_force_iommu();
4232 
4233 	if (iommu_init_mempool()) {
4234 		if (force_on)
4235 			panic("tboot: Failed to initialize iommu memory\n");
4236 		return -ENOMEM;
4237 	}
4238 
4239 	down_write(&dmar_global_lock);
4240 	if (dmar_table_init()) {
4241 		if (force_on)
4242 			panic("tboot: Failed to initialize DMAR table\n");
4243 		goto out_free_dmar;
4244 	}
4245 
4246 	if (dmar_dev_scope_init() < 0) {
4247 		if (force_on)
4248 			panic("tboot: Failed to initialize DMAR device scope\n");
4249 		goto out_free_dmar;
4250 	}
4251 
4252 	up_write(&dmar_global_lock);
4253 
4254 	/*
4255 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4256 	 * complain later when we register it under the lock.
4257 	 */
4258 	dmar_register_bus_notifier();
4259 
4260 	down_write(&dmar_global_lock);
4261 
4262 	if (!no_iommu)
4263 		intel_iommu_debugfs_init();
4264 
4265 	if (no_iommu || dmar_disabled) {
4266 		/*
4267 		 * We exit the function here to ensure the IOMMU's remapping and
4268 		 * mempool aren't set up, which means that the IOMMU's PMRs
4269 		 * won't be disabled via the call to init_dmars(). So disable
4270 		 * them explicitly here. The PMRs were set up by tboot prior to
4271 		 * calling SENTER, but the kernel is expected to reset/tear
4272 		 * down the PMRs.
4273 		 */
4274 		if (intel_iommu_tboot_noforce) {
4275 			for_each_iommu(iommu, drhd)
4276 				iommu_disable_protect_mem_regions(iommu);
4277 		}
4278 
4279 		/*
4280 		 * Make sure the IOMMUs are switched off, even when we
4281 		 * boot into a kexec kernel and the previous kernel left
4282 		 * them enabled
4283 		 */
4284 		intel_disable_iommus();
4285 		goto out_free_dmar;
4286 	}
4287 
4288 	if (list_empty(&dmar_rmrr_units))
4289 		pr_info("No RMRR found\n");
4290 
4291 	if (list_empty(&dmar_atsr_units))
4292 		pr_info("No ATSR found\n");
4293 
4294 	if (dmar_map_gfx)
4295 		intel_iommu_gfx_mapped = 1;
4296 
4297 	init_no_remapping_devices();
4298 
4299 	ret = init_dmars();
4300 	if (ret) {
4301 		if (force_on)
4302 			panic("tboot: Failed to initialize DMARs\n");
4303 		pr_err("Initialization failed\n");
4304 		goto out_free_dmar;
4305 	}
4306 	up_write(&dmar_global_lock);
4307 
4308 	init_iommu_pm_ops();
4309 
4310 	down_read(&dmar_global_lock);
4311 	for_each_active_iommu(iommu, drhd) {
4312 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4313 				       intel_iommu_groups,
4314 				       "%s", iommu->name);
4315 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4316 		iommu_device_register(&iommu->iommu);
4317 	}
4318 	up_read(&dmar_global_lock);
4319 
4320 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4321 	if (si_domain && !hw_pass_through)
4322 		register_memory_notifier(&intel_iommu_memory_nb);
4323 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4324 			  intel_iommu_cpu_dead);
4325 
4326 	down_read(&dmar_global_lock);
4327 	if (probe_acpi_namespace_devices())
4328 		pr_warn("ACPI name space devices didn't probe correctly\n");
4329 
4330 	/* Finally, we enable the DMA remapping hardware. */
4331 	for_each_iommu(iommu, drhd) {
4332 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4333 			iommu_enable_translation(iommu);
4334 
4335 		iommu_disable_protect_mem_regions(iommu);
4336 	}
4337 	up_read(&dmar_global_lock);
4338 
4339 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4340 
4341 	intel_iommu_enabled = 1;
4342 
4343 	return 0;
4344 
4345 out_free_dmar:
4346 	intel_iommu_free_dmars();
4347 	up_write(&dmar_global_lock);
4348 	iommu_exit_mempool();
4349 	return ret;
4350 }
4351 
4352 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4353 {
4354 	struct intel_iommu *iommu = opaque;
4355 
4356 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4357 	return 0;
4358 }
4359 
4360 /*
4361  * NB - intel-iommu lacks any sort of reference counting for the users of
4362  * dependent devices.  If multiple endpoints have intersecting dependent
4363  * devices, unbinding the driver from any one of them will possibly leave
4364  * the others unable to operate.
4365  */
4366 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4367 {
4368 	if (!iommu || !dev || !dev_is_pci(dev))
4369 		return;
4370 
4371 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4372 }
4373 
4374 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4375 {
4376 	struct dmar_domain *domain;
4377 	struct intel_iommu *iommu;
4378 	unsigned long flags;
4379 
4380 	assert_spin_locked(&device_domain_lock);
4381 
4382 	if (WARN_ON(!info))
4383 		return;
4384 
4385 	iommu = info->iommu;
4386 	domain = info->domain;
4387 
4388 	if (info->dev) {
4389 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4390 			intel_pasid_tear_down_entry(iommu, info->dev,
4391 					PASID_RID2PASID, false);
4392 
4393 		iommu_disable_dev_iotlb(info);
4394 		if (!dev_is_real_dma_subdevice(info->dev))
4395 			domain_context_clear(iommu, info->dev);
4396 		intel_pasid_free_table(info->dev);
4397 	}
4398 
4399 	unlink_domain_info(info);
4400 
4401 	spin_lock_irqsave(&iommu->lock, flags);
4402 	domain_detach_iommu(domain, iommu);
4403 	spin_unlock_irqrestore(&iommu->lock, flags);
4404 
4405 	free_devinfo_mem(info);
4406 }
4407 
4408 static void dmar_remove_one_dev_info(struct device *dev)
4409 {
4410 	struct device_domain_info *info;
4411 	unsigned long flags;
4412 
4413 	spin_lock_irqsave(&device_domain_lock, flags);
4414 	info = get_domain_info(dev);
4415 	if (info)
4416 		__dmar_remove_one_dev_info(info);
4417 	spin_unlock_irqrestore(&device_domain_lock, flags);
4418 }
4419 
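/*
 * Initialize a domain allocated through the IOMMU API: derive the AGAW from
 * the requested guest address width and allocate the top-level page directory.
 */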
4420 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4421 {
4422 	int adjust_width;
4423 
4424 	/* calculate AGAW */
4425 	domain->gaw = guest_width;
4426 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4427 	domain->agaw = width_to_agaw(adjust_width);
4428 
4429 	domain->iommu_coherency = 0;
4430 	domain->iommu_snooping = 0;
4431 	domain->iommu_superpage = 0;
4432 	domain->max_addr = 0;
4433 
4434 	/* always allocate the top pgd */
4435 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4436 	if (!domain->pgd)
4437 		return -ENOMEM;
4438 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4439 	return 0;
4440 }
4441 
4442 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4443 {
4444 	struct dmar_domain *dmar_domain;
4445 	struct iommu_domain *domain;
4446 
4447 	switch (type) {
4448 	case IOMMU_DOMAIN_DMA:
4449 	case IOMMU_DOMAIN_UNMANAGED:
4450 		dmar_domain = alloc_domain(0);
4451 		if (!dmar_domain) {
4452 			pr_err("Can't allocate dmar_domain\n");
4453 			return NULL;
4454 		}
4455 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4456 			pr_err("Domain initialization failed\n");
4457 			domain_exit(dmar_domain);
4458 			return NULL;
4459 		}
4460 
4461 		if (type == IOMMU_DOMAIN_DMA &&
4462 		    iommu_get_dma_cookie(&dmar_domain->domain)) {
			/* Don't leak the freshly initialized domain on cookie failure */
			domain_exit(dmar_domain);
4463 			return NULL;
		}
4464 
4465 		domain = &dmar_domain->domain;
4466 		domain->geometry.aperture_start = 0;
4467 		domain->geometry.aperture_end   =
4468 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4469 		domain->geometry.force_aperture = true;
4470 
4471 		return domain;
4472 	case IOMMU_DOMAIN_IDENTITY:
4473 		return &si_domain->domain;
4474 	default:
4475 		return NULL;
4476 	}
4477 
4478 	return NULL;
4479 }
4480 
4481 static void intel_iommu_domain_free(struct iommu_domain *domain)
4482 {
4483 	if (domain != &si_domain->domain)
4484 		domain_exit(to_dmar_domain(domain));
4485 }
4486 
4487 /*
4488  * Check whether a @domain could be attached to the @dev through the
4489  * aux-domain attach/detach APIs.
4490  */
4491 static inline bool
4492 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4493 {
4494 	struct device_domain_info *info = get_domain_info(dev);
4495 
4496 	return info && info->auxd_enabled &&
4497 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4498 }
4499 
4500 static inline struct subdev_domain_info *
4501 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4502 {
4503 	struct subdev_domain_info *sinfo;
4504 
4505 	if (!list_empty(&domain->subdevices)) {
4506 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4507 			if (sinfo->pdev == dev)
4508 				return sinfo;
4509 		}
4510 	}
4511 
4512 	return NULL;
4513 }
4514 
4515 static int auxiliary_link_device(struct dmar_domain *domain,
4516 				 struct device *dev)
4517 {
4518 	struct device_domain_info *info = get_domain_info(dev);
4519 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4520 
4521 	assert_spin_locked(&device_domain_lock);
4522 	if (WARN_ON(!info))
4523 		return -EINVAL;
4524 
4525 	if (!sinfo) {
4526 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
		if (!sinfo)
			return -ENOMEM;
4527 		sinfo->domain = domain;
4528 		sinfo->pdev = dev;
4529 		list_add(&sinfo->link_phys, &info->subdevices);
4530 		list_add(&sinfo->link_domain, &domain->subdevices);
4531 	}
4532 
4533 	return ++sinfo->users;
4534 }
4535 
4536 static int auxiliary_unlink_device(struct dmar_domain *domain,
4537 				   struct device *dev)
4538 {
4539 	struct device_domain_info *info = get_domain_info(dev);
4540 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4541 	int ret;
4542 
4543 	assert_spin_locked(&device_domain_lock);
4544 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4545 		return -EINVAL;
4546 
4547 	ret = --sinfo->users;
4548 	if (!ret) {
4549 		list_del(&sinfo->link_phys);
4550 		list_del(&sinfo->link_domain);
4551 		kfree(sinfo);
4552 	}
4553 
4554 	return ret;
4555 }
4556 
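/*
 * Attach @dev to @domain as an auxiliary domain: allocate the domain's
 * default PASID on first use, link the subdevice into the domain, and set
 * up a first- or second-level PASID table entry for it.
 */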
4557 static int aux_domain_add_dev(struct dmar_domain *domain,
4558 			      struct device *dev)
4559 {
4560 	int ret;
4561 	unsigned long flags;
4562 	struct intel_iommu *iommu;
4563 
4564 	iommu = device_to_iommu(dev, NULL, NULL);
4565 	if (!iommu)
4566 		return -ENODEV;
4567 
4568 	if (domain->default_pasid <= 0) {
4569 		u32 pasid;
4570 
4571 		/* No private data needed for the default pasid */
4572 		pasid = ioasid_alloc(NULL, PASID_MIN,
4573 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4574 				     NULL);
4575 		if (pasid == INVALID_IOASID) {
4576 			pr_err("Can't allocate default pasid\n");
4577 			return -ENODEV;
4578 		}
4579 		domain->default_pasid = pasid;
4580 	}
4581 
4582 	spin_lock_irqsave(&device_domain_lock, flags);
4583 	ret = auxiliary_link_device(domain, dev);
4584 	if (ret <= 0)
4585 		goto link_failed;
4586 
4587 	/*
4588 	 * Subdevices from the same physical device can be attached to the
4589 	 * same domain. For such cases, only the first subdevice attachment
4590 	 * needs to go through the full steps in this function. So if ret >
4591 	 * 1, just goto out.
4592 	 */
4593 	if (ret > 1)
4594 		goto out;
4595 
4596 	/*
4597 	 * iommu->lock must be held to attach domain to iommu and setup the
4598 	 * pasid entry for second level translation.
4599 	 */
4600 	spin_lock(&iommu->lock);
4601 	ret = domain_attach_iommu(domain, iommu);
4602 	if (ret)
4603 		goto attach_failed;
4604 
4605 	/* Setup the PASID entry for mediated devices: */
4606 	if (domain_use_first_level(domain))
4607 		ret = domain_setup_first_level(iommu, domain, dev,
4608 					       domain->default_pasid);
4609 	else
4610 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4611 						     domain->default_pasid);
4612 	if (ret)
4613 		goto table_failed;
4614 
4615 	spin_unlock(&iommu->lock);
4616 out:
4617 	spin_unlock_irqrestore(&device_domain_lock, flags);
4618 
4619 	return 0;
4620 
4621 table_failed:
4622 	domain_detach_iommu(domain, iommu);
4623 attach_failed:
4624 	spin_unlock(&iommu->lock);
4625 	auxiliary_unlink_device(domain, dev);
4626 link_failed:
4627 	spin_unlock_irqrestore(&device_domain_lock, flags);
4628 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4629 		ioasid_put(domain->default_pasid);
4630 
4631 	return ret;
4632 }
4633 
4634 static void aux_domain_remove_dev(struct dmar_domain *domain,
4635 				  struct device *dev)
4636 {
4637 	struct device_domain_info *info;
4638 	struct intel_iommu *iommu;
4639 	unsigned long flags;
4640 
4641 	if (!is_aux_domain(dev, &domain->domain))
4642 		return;
4643 
4644 	spin_lock_irqsave(&device_domain_lock, flags);
4645 	info = get_domain_info(dev);
4646 	iommu = info->iommu;
4647 
4648 	if (!auxiliary_unlink_device(domain, dev)) {
4649 		spin_lock(&iommu->lock);
4650 		intel_pasid_tear_down_entry(iommu, dev,
4651 					    domain->default_pasid, false);
4652 		domain_detach_iommu(domain, iommu);
4653 		spin_unlock(&iommu->lock);
4654 	}
4655 
4656 	spin_unlock_irqrestore(&device_domain_lock, flags);
4657 
4658 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4659 		ioasid_put(domain->default_pasid);
4660 }
4661 
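/*
 * Before attaching @dev, check that the IOMMU behind it can address the
 * domain's already-mapped range and trim extra page-table levels so the
 * domain's AGAW does not exceed what the IOMMU supports.
 */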
4662 static int prepare_domain_attach_device(struct iommu_domain *domain,
4663 					struct device *dev)
4664 {
4665 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4666 	struct intel_iommu *iommu;
4667 	int addr_width;
4668 
4669 	iommu = device_to_iommu(dev, NULL, NULL);
4670 	if (!iommu)
4671 		return -ENODEV;
4672 
4673 	/* check if this iommu agaw is sufficient for max mapped address */
4674 	addr_width = agaw_to_width(iommu->agaw);
4675 	if (addr_width > cap_mgaw(iommu->cap))
4676 		addr_width = cap_mgaw(iommu->cap);
4677 
4678 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4679 		dev_err(dev, "%s: iommu width (%d) is not "
4680 		        "sufficient for the mapped address (%llx)\n",
4681 		        __func__, addr_width, dmar_domain->max_addr);
4682 		return -EFAULT;
4683 	}
4684 	dmar_domain->gaw = addr_width;
4685 
4686 	/*
4687 	 * Knock out extra levels of page tables if necessary
4688 	 */
4689 	while (iommu->agaw < dmar_domain->agaw) {
4690 		struct dma_pte *pte;
4691 
4692 		pte = dmar_domain->pgd;
4693 		if (dma_pte_present(pte)) {
4694 			dmar_domain->pgd = (struct dma_pte *)
4695 				phys_to_virt(dma_pte_addr(pte));
4696 			free_pgtable_page(pte);
4697 		}
4698 		dmar_domain->agaw--;
4699 	}
4700 
4701 	return 0;
4702 }
4703 
4704 static int intel_iommu_attach_device(struct iommu_domain *domain,
4705 				     struct device *dev)
4706 {
4707 	int ret;
4708 
4709 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4710 	    device_is_rmrr_locked(dev)) {
4711 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4712 		return -EPERM;
4713 	}
4714 
4715 	if (is_aux_domain(dev, domain))
4716 		return -EPERM;
4717 
4718 	/* normally dev is not mapped */
4719 	if (unlikely(domain_context_mapped(dev))) {
4720 		struct dmar_domain *old_domain;
4721 
4722 		old_domain = find_domain(dev);
4723 		if (old_domain)
4724 			dmar_remove_one_dev_info(dev);
4725 	}
4726 
4727 	ret = prepare_domain_attach_device(domain, dev);
4728 	if (ret)
4729 		return ret;
4730 
4731 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4732 }
4733 
4734 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4735 					 struct device *dev)
4736 {
4737 	int ret;
4738 
4739 	if (!is_aux_domain(dev, domain))
4740 		return -EPERM;
4741 
4742 	ret = prepare_domain_attach_device(domain, dev);
4743 	if (ret)
4744 		return ret;
4745 
4746 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4747 }
4748 
4749 static void intel_iommu_detach_device(struct iommu_domain *domain,
4750 				      struct device *dev)
4751 {
4752 	dmar_remove_one_dev_info(dev);
4753 }
4754 
4755 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4756 					  struct device *dev)
4757 {
4758 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4759 }
4760 
4761 #ifdef CONFIG_INTEL_IOMMU_SVM
4762 /*
4763  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4764  * VT-d granularity. Invalidation is typically included in the unmap operation
4765  * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
4766  * owns the first-level page tables. Invalidations of translation caches in the
4767  * guest are trapped and passed down to the host.
4768  *
4769  * The vIOMMU in the guest will only expose first-level page tables, therefore
4770  * we do not support IOTLB granularity for requests without PASID (second level).
4771  *
4772  * For example, to find the VT-d granularity encoding for IOTLB
4773  * type and page selective granularity within PASID:
4774  * X: indexed by iommu cache type
4775  * Y: indexed by enum iommu_inv_granularity
4776  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4777  */
4778 
4779 static const int
4780 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4781 	/*
4782 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4783 	 * page selective (address granularity)
4784 	 */
4785 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4786 	/* PASID based dev TLBs */
4787 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4788 	/* PASID cache */
4789 	{-EINVAL, -EINVAL, -EINVAL}
4790 };
4791 
4792 static inline int to_vtd_granularity(int type, int granu)
4793 {
4794 	return inv_type_granu_table[type][granu];
4795 }
4796 
4797 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4798 {
4799 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4800 
4801 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4802 	 * The IOMMU cache invalidate API passes granu_size in bytes and the number
4803 	 * of granules of that size that are contiguous in memory.
4804 	 */
4805 	return order_base_2(nr_pages);
4806 }
4807 
4808 static int
4809 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4810 			   struct iommu_cache_invalidate_info *inv_info)
4811 {
4812 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4813 	struct device_domain_info *info;
4814 	struct intel_iommu *iommu;
4815 	unsigned long flags;
4816 	int cache_type;
4817 	u8 bus, devfn;
4818 	u16 did, sid;
4819 	int ret = 0;
4820 	u64 size = 0;
4821 
4822 	if (!inv_info || !dmar_domain)
4823 		return -EINVAL;
4824 
4825 	if (!dev || !dev_is_pci(dev))
4826 		return -ENODEV;
4827 
4828 	iommu = device_to_iommu(dev, &bus, &devfn);
4829 	if (!iommu)
4830 		return -ENODEV;
4831 
4832 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4833 		return -EINVAL;
4834 
4835 	spin_lock_irqsave(&device_domain_lock, flags);
4836 	spin_lock(&iommu->lock);
4837 	info = get_domain_info(dev);
4838 	if (!info) {
4839 		ret = -EINVAL;
4840 		goto out_unlock;
4841 	}
4842 	did = dmar_domain->iommu_did[iommu->seq_id];
4843 	sid = PCI_DEVID(bus, devfn);
4844 
4845 	/* Size is only valid in address selective invalidation */
4846 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4847 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4848 				   inv_info->granu.addr_info.nb_granules);
4849 
4850 	for_each_set_bit(cache_type,
4851 			 (unsigned long *)&inv_info->cache,
4852 			 IOMMU_CACHE_INV_TYPE_NR) {
4853 		int granu = 0;
4854 		u64 pasid = 0;
4855 		u64 addr = 0;
4856 
4857 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4858 		if (granu == -EINVAL) {
4859 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4860 					   cache_type, inv_info->granularity);
4861 			break;
4862 		}
4863 
4864 		/*
4865 		 * PASID is stored in different locations based on the
4866 		 * granularity.
4867 		 */
4868 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4869 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4870 			pasid = inv_info->granu.pasid_info.pasid;
4871 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4872 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4873 			pasid = inv_info->granu.addr_info.pasid;
4874 
4875 		switch (BIT(cache_type)) {
4876 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4877 			/* HW will ignore LSB bits based on address mask */
4878 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4879 			    size &&
4880 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4881 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4882 						   inv_info->granu.addr_info.addr, size);
4883 			}
4884 
4885 			/*
4886 			 * If granu is PASID-selective, address is ignored.
4887 			 * We use npages = -1 to indicate that.
4888 			 */
4889 			qi_flush_piotlb(iommu, did, pasid,
4890 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4891 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4892 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4893 
4894 			if (!info->ats_enabled)
4895 				break;
4896 			/*
4897 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4898 			 * in the guest may assume IOTLB flush is inclusive,
4899 			 * which is more efficient.
4900 			 */
4901 			fallthrough;
4902 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4903 			/*
4904 			 * PASID based device TLB invalidation does not support
4905 			 * IOMMU_INV_GRANU_PASID granularity but only supports
4906 			 * IOMMU_INV_GRANU_ADDR.
4907 			 * The equivalent of that is to set the size to cover the
4908 			 * entire 64-bit address range. The user only provides PASID
4909 			 * info without address info, so we set addr to 0.
4910 			 */
4911 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4912 				size = 64 - VTD_PAGE_SHIFT;
4913 				addr = 0;
4914 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4915 				addr = inv_info->granu.addr_info.addr;
4916 			}
4917 
4918 			if (info->ats_enabled)
4919 				qi_flush_dev_iotlb_pasid(iommu, sid,
4920 						info->pfsid, pasid,
4921 						info->ats_qdep, addr,
4922 						size);
4923 			else
4924 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4925 			break;
4926 		default:
4927 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4928 					    cache_type);
4929 			ret = -EINVAL;
4930 		}
4931 	}
4932 out_unlock:
4933 	spin_unlock(&iommu->lock);
4934 	spin_unlock_irqrestore(&device_domain_lock, flags);
4935 
4936 	return ret;
4937 }
4938 #endif
4939 
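/*
 * iommu_ops->map callback: translate IOMMU_READ/WRITE/CACHE into DMA_PTE_*
 * bits, verify that the mapping fits within the domain's address width, and
 * install the page-table entries.
 */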
4940 static int intel_iommu_map(struct iommu_domain *domain,
4941 			   unsigned long iova, phys_addr_t hpa,
4942 			   size_t size, int iommu_prot, gfp_t gfp)
4943 {
4944 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4945 	u64 max_addr;
4946 	int prot = 0;
4947 	int ret;
4948 
4949 	if (iommu_prot & IOMMU_READ)
4950 		prot |= DMA_PTE_READ;
4951 	if (iommu_prot & IOMMU_WRITE)
4952 		prot |= DMA_PTE_WRITE;
4953 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4954 		prot |= DMA_PTE_SNP;
4955 
4956 	max_addr = iova + size;
4957 	if (dmar_domain->max_addr < max_addr) {
4958 		u64 end;
4959 
4960 		/* check if minimum agaw is sufficient for mapped address */
4961 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4962 		if (end < max_addr) {
4963 			pr_err("%s: iommu width (%d) is not "
4964 			       "sufficient for the mapped address (%llx)\n",
4965 			       __func__, dmar_domain->gaw, max_addr);
4966 			return -EFAULT;
4967 		}
4968 		dmar_domain->max_addr = max_addr;
4969 	}
4970 	/* Round up size to next multiple of PAGE_SIZE, if it and
4971 	   the low bits of hpa would take us onto the next page */
4972 	size = aligned_nrpages(hpa, size);
4973 	ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4974 			     hpa >> VTD_PAGE_SHIFT, size, prot);
4975 	return ret;
4976 }
4977 
4978 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4979 				unsigned long iova, size_t size,
4980 				struct iommu_iotlb_gather *gather)
4981 {
4982 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4983 	unsigned long start_pfn, last_pfn;
4984 	int level = 0;
4985 
4986 	/* Cope with horrid API which requires us to unmap more than the
4987 	   size argument if it happens to be a large-page mapping. */
4988 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4989 
4990 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4991 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4992 
4993 	start_pfn = iova >> VTD_PAGE_SHIFT;
4994 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4995 
4996 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
4997 					last_pfn, gather->freelist);
4998 
4999 	if (dmar_domain->max_addr == iova + size)
5000 		dmar_domain->max_addr = iova;
5001 
5002 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5003 
5004 	return size;
5005 }
5006 
5007 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5008 				 struct iommu_iotlb_gather *gather)
5009 {
5010 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5011 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5012 	size_t size = gather->end - gather->start;
5013 	unsigned long start_pfn;
5014 	unsigned long nrpages;
5015 	int iommu_id;
5016 
5017 	nrpages = aligned_nrpages(gather->start, size);
5018 	start_pfn = mm_to_dma_pfn(iova_pfn);
5019 
5020 	for_each_domain_iommu(iommu_id, dmar_domain)
5021 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5022 				      start_pfn, nrpages, !gather->freelist, 0);
5023 
5024 	dma_free_pagelist(gather->freelist);
5025 }
5026 
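/*
 * Translate an IOVA to a physical address by walking the domain's page
 * table; returns 0 if no mapping is present at @iova.
 */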
5027 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5028 					    dma_addr_t iova)
5029 {
5030 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5031 	struct dma_pte *pte;
5032 	int level = 0;
5033 	u64 phys = 0;
5034 
5035 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5036 	if (pte && dma_pte_present(pte))
5037 		phys = dma_pte_addr(pte) +
5038 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5039 						VTD_PAGE_SHIFT) - 1));
5040 
5041 	return phys;
5042 }
5043 
5044 static inline bool scalable_mode_support(void)
5045 {
5046 	struct dmar_drhd_unit *drhd;
5047 	struct intel_iommu *iommu;
5048 	bool ret = true;
5049 
5050 	rcu_read_lock();
5051 	for_each_active_iommu(iommu, drhd) {
5052 		if (!sm_supported(iommu)) {
5053 			ret = false;
5054 			break;
5055 		}
5056 	}
5057 	rcu_read_unlock();
5058 
5059 	return ret;
5060 }
5061 
5062 static inline bool iommu_pasid_support(void)
5063 {
5064 	struct dmar_drhd_unit *drhd;
5065 	struct intel_iommu *iommu;
5066 	bool ret = true;
5067 
5068 	rcu_read_lock();
5069 	for_each_active_iommu(iommu, drhd) {
5070 		if (!pasid_supported(iommu)) {
5071 			ret = false;
5072 			break;
5073 		}
5074 	}
5075 	rcu_read_unlock();
5076 
5077 	return ret;
5078 }
5079 
5080 static inline bool nested_mode_support(void)
5081 {
5082 	struct dmar_drhd_unit *drhd;
5083 	struct intel_iommu *iommu;
5084 	bool ret = true;
5085 
5086 	rcu_read_lock();
5087 	for_each_active_iommu(iommu, drhd) {
5088 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5089 			ret = false;
5090 			break;
5091 		}
5092 	}
5093 	rcu_read_unlock();
5094 
5095 	return ret;
5096 }
5097 
5098 static bool intel_iommu_capable(enum iommu_cap cap)
5099 {
5100 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5101 		return domain_update_iommu_snooping(NULL) == 1;
5102 	if (cap == IOMMU_CAP_INTR_REMAP)
5103 		return irq_remapping_enabled == 1;
5104 
5105 	return false;
5106 }
5107 
5108 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5109 {
5110 	struct intel_iommu *iommu;
5111 
5112 	iommu = device_to_iommu(dev, NULL, NULL);
5113 	if (!iommu)
5114 		return ERR_PTR(-ENODEV);
5115 
5116 	if (translation_pre_enabled(iommu))
5117 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5118 
5119 	return &iommu->iommu;
5120 }
5121 
5122 static void intel_iommu_release_device(struct device *dev)
5123 {
5124 	struct intel_iommu *iommu;
5125 
5126 	iommu = device_to_iommu(dev, NULL, NULL);
5127 	if (!iommu)
5128 		return;
5129 
5130 	dmar_remove_one_dev_info(dev);
5131 
5132 	set_dma_ops(dev, NULL);
5133 }
5134 
5135 static void intel_iommu_probe_finalize(struct device *dev)
5136 {
5137 	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5138 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5139 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5140 
5141 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5142 		iommu_setup_dma_ops(dev, base,
5143 				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5144 	else
5145 		set_dma_ops(dev, NULL);
5146 }
5147 
5148 static void intel_iommu_get_resv_regions(struct device *device,
5149 					 struct list_head *head)
5150 {
5151 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5152 	struct iommu_resv_region *reg;
5153 	struct dmar_rmrr_unit *rmrr;
5154 	struct device *i_dev;
5155 	int i;
5156 
5157 	down_read(&dmar_global_lock);
5158 	for_each_rmrr_units(rmrr) {
5159 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5160 					  i, i_dev) {
5161 			struct iommu_resv_region *resv;
5162 			enum iommu_resv_type type;
5163 			size_t length;
5164 
5165 			if (i_dev != device &&
5166 			    !is_downstream_to_pci_bridge(device, i_dev))
5167 				continue;
5168 
5169 			length = rmrr->end_address - rmrr->base_address + 1;
5170 
5171 			type = device_rmrr_is_relaxable(device) ?
5172 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5173 
5174 			resv = iommu_alloc_resv_region(rmrr->base_address,
5175 						       length, prot, type);
5176 			if (!resv)
5177 				break;
5178 
5179 			list_add_tail(&resv->list, head);
5180 		}
5181 	}
5182 	up_read(&dmar_global_lock);
5183 
5184 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5185 	if (dev_is_pci(device)) {
5186 		struct pci_dev *pdev = to_pci_dev(device);
5187 
5188 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5189 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5190 						   IOMMU_RESV_DIRECT_RELAXABLE);
5191 			if (reg)
5192 				list_add_tail(&reg->list, head);
5193 		}
5194 	}
5195 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5196 
5197 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5198 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5199 				      0, IOMMU_RESV_MSI);
5200 	if (!reg)
5201 		return;
5202 	list_add_tail(&reg->list, head);
5203 }
5204 
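/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry if needed, flush the context cache, and enable the device-side
 * PASID support if it was not already on.
 */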
5205 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5206 {
5207 	struct device_domain_info *info;
5208 	struct context_entry *context;
5209 	struct dmar_domain *domain;
5210 	unsigned long flags;
5211 	u64 ctx_lo;
5212 	int ret;
5213 
5214 	domain = find_domain(dev);
5215 	if (!domain)
5216 		return -EINVAL;
5217 
5218 	spin_lock_irqsave(&device_domain_lock, flags);
5219 	spin_lock(&iommu->lock);
5220 
5221 	ret = -EINVAL;
5222 	info = get_domain_info(dev);
5223 	if (!info || !info->pasid_supported)
5224 		goto out;
5225 
5226 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5227 	if (WARN_ON(!context))
5228 		goto out;
5229 
5230 	ctx_lo = context[0].lo;
5231 
5232 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5233 		ctx_lo |= CONTEXT_PASIDE;
5234 		context[0].lo = ctx_lo;
5235 		wmb();
5236 		iommu->flush.flush_context(iommu,
5237 					   domain->iommu_did[iommu->seq_id],
5238 					   PCI_DEVID(info->bus, info->devfn),
5239 					   DMA_CCMD_MASK_NOBIT,
5240 					   DMA_CCMD_DEVICE_INVL);
5241 	}
5242 
5243 	/* Enable PASID support in the device, if it wasn't already */
5244 	if (!info->pasid_enabled)
5245 		iommu_enable_dev_iotlb(info);
5246 
5247 	ret = 0;
5248 
5249  out:
5250 	spin_unlock(&iommu->lock);
5251 	spin_unlock_irqrestore(&device_domain_lock, flags);
5252 
5253 	return ret;
5254 }
5255 
5256 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5257 {
5258 	if (dev_is_pci(dev))
5259 		return pci_device_group(dev);
5260 	return generic_device_group(dev);
5261 }
5262 
5263 static int intel_iommu_enable_auxd(struct device *dev)
5264 {
5265 	struct device_domain_info *info;
5266 	struct intel_iommu *iommu;
5267 	unsigned long flags;
5268 	int ret;
5269 
5270 	iommu = device_to_iommu(dev, NULL, NULL);
5271 	if (!iommu || dmar_disabled)
5272 		return -EINVAL;
5273 
5274 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5275 		return -EINVAL;
5276 
5277 	ret = intel_iommu_enable_pasid(iommu, dev);
5278 	if (ret)
5279 		return -ENODEV;
5280 
5281 	spin_lock_irqsave(&device_domain_lock, flags);
5282 	info = get_domain_info(dev);
5283 	info->auxd_enabled = 1;
5284 	spin_unlock_irqrestore(&device_domain_lock, flags);
5285 
5286 	return 0;
5287 }
5288 
5289 static int intel_iommu_disable_auxd(struct device *dev)
5290 {
5291 	struct device_domain_info *info;
5292 	unsigned long flags;
5293 
5294 	spin_lock_irqsave(&device_domain_lock, flags);
5295 	info = get_domain_info(dev);
5296 	if (!WARN_ON(!info))
5297 		info->auxd_enabled = 0;
5298 	spin_unlock_irqrestore(&device_domain_lock, flags);
5299 
5300 	return 0;
5301 }
5302 
5303 /*
5304  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5305  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5306  * spec so that system software and tools can detect endpoint devices
5307  * supporting Intel Scalable I/O Virtualization without a host driver dependency.
5308  *
5309  * Returns the address of the matching extended capability structure within
5310  * the device's PCI configuration space or 0 if the device does not support
5311  * it.
5312  */
5313 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5314 {
5315 	int pos;
5316 	u16 vendor, id;
5317 
5318 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5319 	while (pos) {
5320 		pci_read_config_word(pdev, pos + 4, &vendor);
5321 		pci_read_config_word(pdev, pos + 8, &id);
5322 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5323 			return pos;
5324 
5325 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5326 	}
5327 
5328 	return 0;
5329 }
5330 
5331 static bool
5332 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5333 {
5334 	if (feat == IOMMU_DEV_FEAT_AUX) {
5335 		int ret;
5336 
5337 		if (!dev_is_pci(dev) || dmar_disabled ||
5338 		    !scalable_mode_support() || !iommu_pasid_support())
5339 			return false;
5340 
5341 		ret = pci_pasid_features(to_pci_dev(dev));
5342 		if (ret < 0)
5343 			return false;
5344 
5345 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5346 	}
5347 
5348 	if (feat == IOMMU_DEV_FEAT_SVA) {
5349 		struct device_domain_info *info = get_domain_info(dev);
5350 
5351 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5352 			info->pasid_supported && info->pri_supported &&
5353 			info->ats_supported;
5354 	}
5355 
5356 	return false;
5357 }
5358 
5359 static int
5360 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5361 {
5362 	if (feat == IOMMU_DEV_FEAT_AUX)
5363 		return intel_iommu_enable_auxd(dev);
5364 
5365 	if (feat == IOMMU_DEV_FEAT_SVA) {
5366 		struct device_domain_info *info = get_domain_info(dev);
5367 
5368 		if (!info)
5369 			return -EINVAL;
5370 
5371 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5372 			return 0;
5373 	}
5374 
5375 	return -ENODEV;
5376 }
5377 
5378 static int
5379 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5380 {
5381 	if (feat == IOMMU_DEV_FEAT_AUX)
5382 		return intel_iommu_disable_auxd(dev);
5383 
5384 	return -ENODEV;
5385 }
5386 
5387 static bool
5388 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5389 {
5390 	struct device_domain_info *info = get_domain_info(dev);
5391 
5392 	if (feat == IOMMU_DEV_FEAT_AUX)
5393 		return scalable_mode_support() && info && info->auxd_enabled;
5394 
5395 	return false;
5396 }
5397 
5398 static int
5399 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5400 {
5401 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402 
5403 	return dmar_domain->default_pasid > 0 ?
5404 			dmar_domain->default_pasid : -EINVAL;
5405 }
5406 
5407 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5408 					   struct device *dev)
5409 {
5410 	return attach_deferred(dev);
5411 }
5412 
5413 static int
5414 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5415 			    enum iommu_attr attr, void *data)
5416 {
5417 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5418 	unsigned long flags;
5419 	int ret = 0;
5420 
5421 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5422 		return -EINVAL;
5423 
5424 	switch (attr) {
5425 	case DOMAIN_ATTR_NESTING:
5426 		spin_lock_irqsave(&device_domain_lock, flags);
5427 		if (nested_mode_support() &&
5428 		    list_empty(&dmar_domain->devices)) {
5429 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5430 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5431 		} else {
5432 			ret = -ENODEV;
5433 		}
5434 		spin_unlock_irqrestore(&device_domain_lock, flags);
5435 		break;
5436 	default:
5437 		ret = -EINVAL;
5438 		break;
5439 	}
5440 
5441 	return ret;
5442 }
5443 
5444 static int
5445 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5446 			    enum iommu_attr attr, void *data)
5447 {
5448 	switch (domain->type) {
5449 	case IOMMU_DOMAIN_UNMANAGED:
5450 		return -ENODEV;
5451 	case IOMMU_DOMAIN_DMA:
5452 		switch (attr) {
5453 		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5454 			*(int *)data = !intel_iommu_strict;
5455 			return 0;
5456 		default:
5457 			return -ENODEV;
5458 		}
5459 		break;
5460 	default:
5461 		return -EINVAL;
5462 	}
5463 }
5464 
5465 /*
5466  * Check that the device does not live on an external-facing PCI port that is
5467  * marked as untrusted. Such devices should not be allowed to apply quirks and
5468  * thereby bypass the IOMMU restrictions.
5469  */
5470 static bool risky_device(struct pci_dev *pdev)
5471 {
5472 	if (pdev->untrusted) {
5473 		pci_info(pdev,
5474 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5475 			 pdev->vendor, pdev->device);
5476 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5477 		return true;
5478 	}
5479 	return false;
5480 }
5481 
5482 const struct iommu_ops intel_iommu_ops = {
5483 	.capable		= intel_iommu_capable,
5484 	.domain_alloc		= intel_iommu_domain_alloc,
5485 	.domain_free		= intel_iommu_domain_free,
5486 	.domain_get_attr        = intel_iommu_domain_get_attr,
5487 	.domain_set_attr	= intel_iommu_domain_set_attr,
5488 	.attach_dev		= intel_iommu_attach_device,
5489 	.detach_dev		= intel_iommu_detach_device,
5490 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5491 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5492 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5493 	.map			= intel_iommu_map,
5494 	.unmap			= intel_iommu_unmap,
5495 	.flush_iotlb_all        = intel_flush_iotlb_all,
5496 	.iotlb_sync		= intel_iommu_tlb_sync,
5497 	.iova_to_phys		= intel_iommu_iova_to_phys,
5498 	.probe_device		= intel_iommu_probe_device,
5499 	.probe_finalize		= intel_iommu_probe_finalize,
5500 	.release_device		= intel_iommu_release_device,
5501 	.get_resv_regions	= intel_iommu_get_resv_regions,
5502 	.put_resv_regions	= generic_iommu_put_resv_regions,
5503 	.device_group		= intel_iommu_device_group,
5504 	.dev_has_feat		= intel_iommu_dev_has_feat,
5505 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5506 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5507 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5508 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5509 	.def_domain_type	= device_def_domain_type,
5510 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5511 #ifdef CONFIG_INTEL_IOMMU_SVM
5512 	.cache_invalidate	= intel_iommu_sva_invalidate,
5513 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5514 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5515 	.sva_bind		= intel_svm_bind,
5516 	.sva_unbind		= intel_svm_unbind,
5517 	.sva_get_pasid		= intel_svm_get_pasid,
5518 	.page_response		= intel_svm_page_response,
5519 #endif
5520 };
5521 
5522 static void quirk_iommu_igfx(struct pci_dev *dev)
5523 {
5524 	if (risky_device(dev))
5525 		return;
5526 
5527 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5528 	dmar_map_gfx = 0;
5529 }
5530 
5531 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5532 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5533 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5534 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5538 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5539 
5540 /* Broadwell igfx malfunctions with dmar */
5541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5542 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5543 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5545 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5546 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5547 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5552 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5553 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5555 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5556 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5557 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5558 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5559 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5560 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5561 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5562 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5563 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5564 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5565 
5566 static void quirk_iommu_rwbf(struct pci_dev *dev)
5567 {
5568 	if (risky_device(dev))
5569 		return;
5570 
5571 	/*
5572 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5573 	 * but needs it. Same seems to hold for the desktop versions.
5574 	 */
5575 	pci_info(dev, "Forcing write-buffer flush capability\n");
5576 	rwbf_quirk = 1;
5577 }
5578 
5579 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5580 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5581 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5582 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5583 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5584 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5585 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5586 
5587 #define GGC 0x52
5588 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5589 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5590 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5591 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5592 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5593 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5594 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5595 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5596 
5597 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5598 {
5599 	unsigned short ggc;
5600 
5601 	if (risky_device(dev))
5602 		return;
5603 
5604 	if (pci_read_config_word(dev, GGC, &ggc))
5605 		return;
5606 
5607 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5608 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5609 		dmar_map_gfx = 0;
5610 	} else if (dmar_map_gfx) {
5611 		/* we have to ensure the gfx device is idle before we flush */
5612 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5613 		intel_iommu_strict = 1;
5614 	}
5615 }
5616 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5617 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5618 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5619 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5620 
5621 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5622 {
5623 	unsigned short ver;
5624 
5625 	if (!IS_GFX_DEVICE(dev))
5626 		return;
5627 
5628 	ver = (dev->device >> 8) & 0xff;
5629 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5630 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5631 	    ver != 0x9a)
5632 		return;
5633 
5634 	if (risky_device(dev))
5635 		return;
5636 
5637 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5638 	iommu_skip_te_disable = 1;
5639 }
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5641 
5642 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5643    ISOCH DMAR unit for the Azalia sound device, but not give it any
5644    TLB entries, which causes it to deadlock. Check for that.  We do
5645    this in a function called from init_dmars(), instead of in a PCI
5646    quirk, because we don't want to print the obnoxious "BIOS broken"
5647    message if VT-d is actually disabled.
5648 */
5649 static void __init check_tylersburg_isoch(void)
5650 {
5651 	struct pci_dev *pdev;
5652 	uint32_t vtisochctrl;
5653 
5654 	/* If there's no Azalia in the system anyway, forget it. */
5655 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5656 	if (!pdev)
5657 		return;
5658 
5659 	if (risky_device(pdev)) {
5660 		pci_dev_put(pdev);
5661 		return;
5662 	}
5663 
5664 	pci_dev_put(pdev);
5665 
5666 	/* System Management Registers. Might be hidden, in which case
5667 	   we can't do the sanity check. But that's OK, because the
5668 	   known-broken BIOSes _don't_ actually hide it, so far. */
5669 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5670 	if (!pdev)
5671 		return;
5672 
5673 	if (risky_device(pdev)) {
5674 		pci_dev_put(pdev);
5675 		return;
5676 	}
5677 
5678 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5679 		pci_dev_put(pdev);
5680 		return;
5681 	}
5682 
5683 	pci_dev_put(pdev);
5684 
5685 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5686 	if (vtisochctrl & 1)
5687 		return;
5688 
5689 	/* Drop all bits other than the number of TLB entries */
5690 	vtisochctrl &= 0x1c;
5691 
5692 	/* If we have the recommended number of TLB entries (16), fine. */
5693 	if (vtisochctrl == 0x10)
5694 		return;
5695 
5696 	/* Zero TLB entries? You get to ride the short bus to school. */
5697 	if (!vtisochctrl) {
5698 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5699 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5700 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5701 		     dmi_get_system_info(DMI_BIOS_VERSION),
5702 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5703 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5704 		return;
5705 	}
5706 
5707 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5708 	       vtisochctrl);
5709 }
5710