xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 06701297)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 #include <trace/events/intel_iommu.h>
48 
49 #include "../irq_remapping.h"
50 #include "pasid.h"
51 
52 #define ROOT_SIZE		VTD_PAGE_SIZE
53 #define CONTEXT_SIZE		VTD_PAGE_SIZE
54 
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
57 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
58 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 
60 #define IOAPIC_RANGE_START	(0xfee00000)
61 #define IOAPIC_RANGE_END	(0xfeefffff)
62 #define IOVA_START_ADDR		(0x1000)
63 
64 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 
66 #define MAX_AGAW_WIDTH 64
67 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 
69 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
70 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
71 
72 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
73    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
74 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
75 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
76 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77 
78 /* IO virtual address start page frame number */
79 #define IOVA_START_PFN		(1)
80 
81 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
82 
83 /* page table handling */
84 #define LEVEL_STRIDE		(9)
85 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
86 
87 /*
88  * This bitmap is used to advertise the page sizes our hardware supports
89  * to the IOMMU core, which will then use this information to split
90  * physically contiguous memory regions it is mapping into page sizes
91  * that we support.
92  *
93  * Traditionally the IOMMU core just handed us the mappings directly,
94  * after making sure the size is a power-of-two multiple of 4KiB and that the
95  * mapping has natural alignment.
96  *
97  * To retain this behavior, we currently advertise that we support
98  * all page sizes that are a power-of-two multiple of 4KiB.
99  *
100  * If at some point we'd like to utilize the IOMMU core's new behavior,
101  * we could change this to advertise the real page sizes we support.
102  */
103 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
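/*
 * For illustration: ~0xFFFUL has every bit from 12 upward set, so 4KiB
 * (1 << 12), 2MiB (1 << 21) and 1GiB (1 << 30) mappings are all among
 * the sizes advertised to the IOMMU core by this bitmap.
 */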
104 
105 static inline int agaw_to_level(int agaw)
106 {
107 	return agaw + 2;
108 }
109 
110 static inline int agaw_to_width(int agaw)
111 {
112 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 }
114 
115 static inline int width_to_agaw(int width)
116 {
117 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
118 }
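/*
 * Worked example for the helpers above: a 48-bit address width gives
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2 and agaw_to_level(2) = 4,
 * i.e. a 4-level page table; a 57-bit width maps to agaw 3 and 5 levels.
 */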
119 
120 static inline unsigned int level_to_offset_bits(int level)
121 {
122 	return (level - 1) * LEVEL_STRIDE;
123 }
124 
125 static inline int pfn_level_offset(u64 pfn, int level)
126 {
127 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
128 }
129 
130 static inline u64 level_mask(int level)
131 {
132 	return -1ULL << level_to_offset_bits(level);
133 }
134 
135 static inline u64 level_size(int level)
136 {
137 	return 1ULL << level_to_offset_bits(level);
138 }
139 
140 static inline u64 align_to_level(u64 pfn, int level)
141 {
142 	return (pfn + level_size(level) - 1) & level_mask(level);
143 }
144 
145 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 {
147 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
148 }
149 
150 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
151    are never going to work. */
152 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 {
154 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 
157 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 {
159 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 {
163 	return mm_to_dma_pfn(page_to_pfn(pg));
164 }
165 static inline unsigned long virt_to_dma_pfn(void *p)
166 {
167 	return page_to_dma_pfn(virt_to_page(p));
168 }
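/*
 * Example: with 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the two
 * PFN spaces coincide and the conversions above are no-ops; with 64KiB MM
 * pages each mm pfn covers sixteen VT-d pfns, hence the shifts.
 */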
169 
170 /* global iommu list, set NULL for ignored DMAR units */
171 static struct intel_iommu **g_iommus;
172 
173 static void __init check_tylersburg_isoch(void);
174 static int rwbf_quirk;
175 
176 /*
177  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
178  * (used when kernel is launched w/ TXT)
179  */
180 static int force_on = 0;
181 static int intel_iommu_tboot_noforce;
182 static int no_platform_optin;
183 
184 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
185 
186 /*
187  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
188  * if marked present.
189  */
190 static phys_addr_t root_entry_lctp(struct root_entry *re)
191 {
192 	if (!(re->lo & 1))
193 		return 0;
194 
195 	return re->lo & VTD_PAGE_MASK;
196 }
197 
198 /*
199  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_uctp(struct root_entry *re)
203 {
204 	if (!(re->hi & 1))
205 		return 0;
206 
207 	return re->hi & VTD_PAGE_MASK;
208 }
209 
210 static inline void context_clear_pasid_enable(struct context_entry *context)
211 {
212 	context->lo &= ~(1ULL << 11);
213 }
214 
215 static inline bool context_pasid_enabled(struct context_entry *context)
216 {
217 	return !!(context->lo & (1ULL << 11));
218 }
219 
220 static inline void context_set_copied(struct context_entry *context)
221 {
222 	context->hi |= (1ull << 3);
223 }
224 
225 static inline bool context_copied(struct context_entry *context)
226 {
227 	return !!(context->hi & (1ULL << 3));
228 }
229 
230 static inline bool __context_present(struct context_entry *context)
231 {
232 	return (context->lo & 1);
233 }
234 
235 bool context_present(struct context_entry *context)
236 {
237 	return context_pasid_enabled(context) ?
238 	     __context_present(context) :
239 	     __context_present(context) && !context_copied(context);
240 }
241 
242 static inline void context_set_present(struct context_entry *context)
243 {
244 	context->lo |= 1;
245 }
246 
247 static inline void context_set_fault_enable(struct context_entry *context)
248 {
249 	context->lo &= (((u64)-1) << 2) | 1;
250 }
251 
252 static inline void context_set_translation_type(struct context_entry *context,
253 						unsigned long value)
254 {
255 	context->lo &= (((u64)-1) << 4) | 3;
256 	context->lo |= (value & 3) << 2;
257 }
258 
259 static inline void context_set_address_root(struct context_entry *context,
260 					    unsigned long value)
261 {
262 	context->lo &= ~VTD_PAGE_MASK;
263 	context->lo |= value & VTD_PAGE_MASK;
264 }
265 
266 static inline void context_set_address_width(struct context_entry *context,
267 					     unsigned long value)
268 {
269 	context->hi |= value & 7;
270 }
271 
272 static inline void context_set_domain_id(struct context_entry *context,
273 					 unsigned long value)
274 {
275 	context->hi |= (value & ((1 << 16) - 1)) << 8;
276 }
277 
278 static inline int context_domain_id(struct context_entry *c)
279 {
280 	return((c->hi >> 8) & 0xffff);
281 }
282 
283 static inline void context_clear_entry(struct context_entry *context)
284 {
285 	context->lo = 0;
286 	context->hi = 0;
287 }
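/*
 * Minimal sketch (illustrative only, not the driver's actual context
 * mapping path) of how the accessors above compose a legacy-mode context
 * entry: clear it, tag it with a domain id and address width, point it at
 * a second-level page table, and finally mark it present. All values are
 * hypothetical.
 */
#if 0
static void example_fill_context(struct context_entry *ce,
				 phys_addr_t pgd_phys, int agaw, u16 did)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);
	context_set_address_width(ce, agaw);
	context_set_address_root(ce, pgd_phys);
	context_set_translation_type(ce, 0);	/* multi-level translation */
	context_set_fault_enable(ce);
	context_set_present(ce);
}
#endif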
288 
289 /*
290  * This domain is a statically identity mapping domain.
291  *	1. This domain creates a static 1:1 mapping to all usable memory.
292  *	2. It maps to each iommu if successful.
293  *	3. Each iommu maps to this domain if successful.
294  */
295 static struct dmar_domain *si_domain;
296 static int hw_pass_through = 1;
297 
298 #define for_each_domain_iommu(idx, domain)			\
299 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
300 		if (domain->iommu_refcnt[idx])
301 
302 struct dmar_rmrr_unit {
303 	struct list_head list;		/* list of rmrr units	*/
304 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
305 	u64	base_address;		/* reserved base address*/
306 	u64	end_address;		/* reserved end address */
307 	struct dmar_dev_scope *devices;	/* target devices */
308 	int	devices_cnt;		/* target device count */
309 };
310 
311 struct dmar_atsr_unit {
312 	struct list_head list;		/* list of ATSR units */
313 	struct acpi_dmar_header *hdr;	/* ACPI header */
314 	struct dmar_dev_scope *devices;	/* target devices */
315 	int devices_cnt;		/* target device count */
316 	u8 include_all:1;		/* include all ports */
317 };
318 
319 static LIST_HEAD(dmar_atsr_units);
320 static LIST_HEAD(dmar_rmrr_units);
321 
322 #define for_each_rmrr_units(rmrr) \
323 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
324 
325 /* number of IOMMUs in the system, used to size per-IOMMU indexed arrays */
326 static int g_num_of_iommus;
327 
328 static void domain_exit(struct dmar_domain *domain);
329 static void domain_remove_dev_info(struct dmar_domain *domain);
330 static void dmar_remove_one_dev_info(struct device *dev);
331 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
332 static int intel_iommu_attach_device(struct iommu_domain *domain,
333 				     struct device *dev);
334 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
335 					    dma_addr_t iova);
336 
337 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
342 
343 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
344 int intel_iommu_sm = 1;
345 #else
346 int intel_iommu_sm;
347 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
348 
349 int intel_iommu_enabled = 0;
350 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
351 
352 static int dmar_map_gfx = 1;
353 static int dmar_forcedac;
354 static int intel_iommu_strict;
355 static int intel_iommu_superpage = 1;
356 static int iommu_identity_mapping;
357 static int iommu_skip_te_disable;
358 
359 #define IDENTMAP_GFX		2
360 #define IDENTMAP_AZALIA		4
361 
362 int intel_iommu_gfx_mapped;
363 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
364 
365 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
366 struct device_domain_info *get_domain_info(struct device *dev)
367 {
368 	struct device_domain_info *info;
369 
370 	if (!dev)
371 		return NULL;
372 
373 	info = dev_iommu_priv_get(dev);
374 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
375 		return NULL;
376 
377 	return info;
378 }
379 
380 DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
382 
383 /*
384  * Iterate over elements in device_domain_list and call the specified
385  * callback @fn against each element.
386  */
387 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
388 				     void *data), void *data)
389 {
390 	int ret = 0;
391 	unsigned long flags;
392 	struct device_domain_info *info;
393 
394 	spin_lock_irqsave(&device_domain_lock, flags);
395 	list_for_each_entry(info, &device_domain_list, global) {
396 		ret = fn(info, data);
397 		if (ret) {
398 			spin_unlock_irqrestore(&device_domain_lock, flags);
399 			return ret;
400 		}
401 	}
402 	spin_unlock_irqrestore(&device_domain_lock, flags);
403 
404 	return 0;
405 }
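/*
 * Usage sketch for the iterator above (hypothetical callback, for
 * illustration only): count the entries currently on device_domain_list.
 * The walk stops early as soon as the callback returns non-zero.
 */
#if 0
static int count_one(struct device_domain_info *info, void *data)
{
	(*(int *)data)++;
	return 0;
}

static int count_tracked_devices(void)
{
	int count = 0;

	for_each_device_domain(count_one, &count);
	return count;
}
#endif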
406 
407 const struct iommu_ops intel_iommu_ops;
408 
409 static bool translation_pre_enabled(struct intel_iommu *iommu)
410 {
411 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
412 }
413 
414 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
417 }
418 
419 static void init_translation_status(struct intel_iommu *iommu)
420 {
421 	u32 gsts;
422 
423 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
424 	if (gsts & DMA_GSTS_TES)
425 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
426 }
427 
428 static int __init intel_iommu_setup(char *str)
429 {
430 	if (!str)
431 		return -EINVAL;
432 	while (*str) {
433 		if (!strncmp(str, "on", 2)) {
434 			dmar_disabled = 0;
435 			pr_info("IOMMU enabled\n");
436 		} else if (!strncmp(str, "off", 3)) {
437 			dmar_disabled = 1;
438 			no_platform_optin = 1;
439 			pr_info("IOMMU disabled\n");
440 		} else if (!strncmp(str, "igfx_off", 8)) {
441 			dmar_map_gfx = 0;
442 			pr_info("Disable GFX device mapping\n");
443 		} else if (!strncmp(str, "forcedac", 8)) {
444 			pr_info("Forcing DAC for PCI devices\n");
445 			dmar_forcedac = 1;
446 		} else if (!strncmp(str, "strict", 6)) {
447 			pr_info("Disable batched IOTLB flush\n");
448 			intel_iommu_strict = 1;
449 		} else if (!strncmp(str, "sp_off", 6)) {
450 			pr_info("Disable supported super page\n");
451 			intel_iommu_superpage = 0;
452 		} else if (!strncmp(str, "sm_on", 5)) {
453 			pr_info("Intel-IOMMU: scalable mode supported\n");
454 			intel_iommu_sm = 1;
455 		} else if (!strncmp(str, "tboot_noforce", 13)) {
456 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457 			intel_iommu_tboot_noforce = 1;
458 		}
459 
460 		str += strcspn(str, ",");
461 		while (*str == ',')
462 			str++;
463 	}
464 	return 0;
465 }
466 __setup("intel_iommu=", intel_iommu_setup);
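/*
 * Example kernel command line consumed by the parser above:
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables the IOMMU, turns on scalable mode and disables batched IOTLB
 * flushing; options are comma separated and matched by prefix.
 */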
467 
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
470 
471 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
472 {
473 	struct dmar_domain **domains;
474 	int idx = did >> 8;
475 
476 	domains = iommu->domains[idx];
477 	if (!domains)
478 		return NULL;
479 
480 	return domains[did & 0xff];
481 }
482 
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484 			     struct dmar_domain *domain)
485 {
486 	struct dmar_domain **domains;
487 	int idx = did >> 8;
488 
489 	if (!iommu->domains[idx]) {
490 		size_t size = 256 * sizeof(struct dmar_domain *);
491 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
492 	}
493 
494 	domains = iommu->domains[idx];
495 	if (WARN_ON(!domains))
496 		return;
497 	else
498 		domains[did & 0xff] = domain;
499 }
500 
501 void *alloc_pgtable_page(int node)
502 {
503 	struct page *page;
504 	void *vaddr = NULL;
505 
506 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
507 	if (page)
508 		vaddr = page_address(page);
509 	return vaddr;
510 }
511 
512 void free_pgtable_page(void *vaddr)
513 {
514 	free_page((unsigned long)vaddr);
515 }
516 
517 static inline void *alloc_domain_mem(void)
518 {
519 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
520 }
521 
522 static void free_domain_mem(void *vaddr)
523 {
524 	kmem_cache_free(iommu_domain_cache, vaddr);
525 }
526 
527 static inline void *alloc_devinfo_mem(void)
528 {
529 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
530 }
531 
532 static inline void free_devinfo_mem(void *vaddr)
533 {
534 	kmem_cache_free(iommu_devinfo_cache, vaddr);
535 }
536 
537 static inline int domain_type_is_si(struct dmar_domain *domain)
538 {
539 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
540 }
541 
542 static inline bool domain_use_first_level(struct dmar_domain *domain)
543 {
544 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
545 }
546 
547 static inline int domain_pfn_supported(struct dmar_domain *domain,
548 				       unsigned long pfn)
549 {
550 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
551 
552 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
553 }
554 
555 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
556 {
557 	unsigned long sagaw;
558 	int agaw = -1;
559 
560 	sagaw = cap_sagaw(iommu->cap);
561 	for (agaw = width_to_agaw(max_gaw);
562 	     agaw >= 0; agaw--) {
563 		if (test_bit(agaw, &sagaw))
564 			break;
565 	}
566 
567 	return agaw;
568 }
569 
570 /*
571  * Calculate max SAGAW for each iommu.
572  */
573 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
574 {
575 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
576 }
577 
578 /*
579  * calculate agaw for each iommu.
580  * "SAGAW" may be different across iommus, use a default agaw, and
581  * get a supported less agaw for iommus that don't support the default agaw.
582  */
583 int iommu_calculate_agaw(struct intel_iommu *iommu)
584 {
585 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
586 }
587 
588 /* This function only returns a single iommu in a domain */
589 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
590 {
591 	int iommu_id;
592 
593 	/* si_domain and vm domain should not get here. */
594 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
595 		return NULL;
596 
597 	for_each_domain_iommu(iommu_id, domain)
598 		break;
599 
600 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
601 		return NULL;
602 
603 	return g_iommus[iommu_id];
604 }
605 
606 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
607 {
608 	return sm_supported(iommu) ?
609 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
610 }
611 
612 static void domain_update_iommu_coherency(struct dmar_domain *domain)
613 {
614 	struct dmar_drhd_unit *drhd;
615 	struct intel_iommu *iommu;
616 	bool found = false;
617 	int i;
618 
619 	domain->iommu_coherency = 1;
620 
621 	for_each_domain_iommu(i, domain) {
622 		found = true;
623 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
624 			domain->iommu_coherency = 0;
625 			break;
626 		}
627 	}
628 	if (found)
629 		return;
630 
631 	/* No hardware attached; use lowest common denominator */
632 	rcu_read_lock();
633 	for_each_active_iommu(iommu, drhd) {
634 		if (!iommu_paging_structure_coherency(iommu)) {
635 			domain->iommu_coherency = 0;
636 			break;
637 		}
638 	}
639 	rcu_read_unlock();
640 }
641 
642 static int domain_update_iommu_snooping(struct intel_iommu *skip)
643 {
644 	struct dmar_drhd_unit *drhd;
645 	struct intel_iommu *iommu;
646 	int ret = 1;
647 
648 	rcu_read_lock();
649 	for_each_active_iommu(iommu, drhd) {
650 		if (iommu != skip) {
651 			if (!ecap_sc_support(iommu->ecap)) {
652 				ret = 0;
653 				break;
654 			}
655 		}
656 	}
657 	rcu_read_unlock();
658 
659 	return ret;
660 }
661 
662 static int domain_update_iommu_superpage(struct dmar_domain *domain,
663 					 struct intel_iommu *skip)
664 {
665 	struct dmar_drhd_unit *drhd;
666 	struct intel_iommu *iommu;
667 	int mask = 0x3;
668 
669 	if (!intel_iommu_superpage) {
670 		return 0;
671 	}
672 
673 	/* set iommu_superpage to the smallest common denominator */
674 	rcu_read_lock();
675 	for_each_active_iommu(iommu, drhd) {
676 		if (iommu != skip) {
677 			if (domain && domain_use_first_level(domain)) {
678 				if (!cap_fl1gp_support(iommu->cap))
679 					mask = 0x1;
680 			} else {
681 				mask &= cap_super_page_val(iommu->cap);
682 			}
683 
684 			if (!mask)
685 				break;
686 		}
687 	}
688 	rcu_read_unlock();
689 
690 	return fls(mask);
691 }
692 
693 static int domain_update_device_node(struct dmar_domain *domain)
694 {
695 	struct device_domain_info *info;
696 	int nid = NUMA_NO_NODE;
697 
698 	assert_spin_locked(&device_domain_lock);
699 
700 	if (list_empty(&domain->devices))
701 		return NUMA_NO_NODE;
702 
703 	list_for_each_entry(info, &domain->devices, link) {
704 		if (!info->dev)
705 			continue;
706 
707 		/*
708 		 * There could possibly be multiple device numa nodes as devices
709 		 * within the same domain may sit behind different IOMMUs. There
710 		 * isn't a perfect answer in such a situation, so we use a
711 		 * first come, first served policy.
712 		 */
713 		nid = dev_to_node(info->dev);
714 		if (nid != NUMA_NO_NODE)
715 			break;
716 	}
717 
718 	return nid;
719 }
720 
721 static void domain_update_iotlb(struct dmar_domain *domain);
722 
723 /* Some capabilities may be different across iommus */
724 static void domain_update_iommu_cap(struct dmar_domain *domain)
725 {
726 	domain_update_iommu_coherency(domain);
727 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
728 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
729 
730 	/*
731 	 * If RHSA is missing, we should default to the device numa domain
732 	 * as a fallback.
733 	 */
734 	if (domain->nid == NUMA_NO_NODE)
735 		domain->nid = domain_update_device_node(domain);
736 
737 	/*
738 	 * First-level translation restricts the input-address to a
739 	 * canonical address (i.e., address bits 63:N have the same
740 	 * value as address bit [N-1], where N is 48-bits with 4-level
741 	 * paging and 57-bits with 5-level paging). Hence, skip bit
742 	 * [N-1].
743 	 */
744 	if (domain_use_first_level(domain))
745 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
746 	else
747 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
748 
749 	domain_update_iotlb(domain);
750 }
751 
752 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
753 					 u8 devfn, int alloc)
754 {
755 	struct root_entry *root = &iommu->root_entry[bus];
756 	struct context_entry *context;
757 	u64 *entry;
758 
759 	entry = &root->lo;
760 	if (sm_supported(iommu)) {
761 		if (devfn >= 0x80) {
762 			devfn -= 0x80;
763 			entry = &root->hi;
764 		}
765 		devfn *= 2;
766 	}
767 	if (*entry & 1)
768 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
769 	else {
770 		unsigned long phy_addr;
771 		if (!alloc)
772 			return NULL;
773 
774 		context = alloc_pgtable_page(iommu->node);
775 		if (!context)
776 			return NULL;
777 
778 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
779 		phy_addr = virt_to_phys((void *)context);
780 		*entry = phy_addr | 1;
781 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
782 	}
783 	return &context[devfn];
784 }
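/*
 * Worked example for the scalable-mode indexing above: devfn 0x85 selects
 * the upper half of the root entry (root->hi) and is rescaled to context
 * index (0x85 - 0x80) * 2 = 0x0a, because scalable-mode context entries
 * are twice the size of the 128-bit legacy entries.
 */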
785 
786 static bool attach_deferred(struct device *dev)
787 {
788 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
789 }
790 
791 /**
792  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
793  *				 sub-hierarchy of a candidate PCI-PCI bridge
794  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
795  * @bridge: the candidate PCI-PCI bridge
796  *
797  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
798  */
799 static bool
800 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
801 {
802 	struct pci_dev *pdev, *pbridge;
803 
804 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
805 		return false;
806 
807 	pdev = to_pci_dev(dev);
808 	pbridge = to_pci_dev(bridge);
809 
810 	if (pbridge->subordinate &&
811 	    pbridge->subordinate->number <= pdev->bus->number &&
812 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
813 		return true;
814 
815 	return false;
816 }
817 
818 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
819 {
820 	struct dmar_drhd_unit *drhd;
821 	u32 vtbar;
822 	int rc;
823 
824 	/* We know that this device on this chipset has its own IOMMU.
825 	 * If we find it under a different IOMMU, then the BIOS is lying
826 	 * to us. Hope that the IOMMU for this device is actually
827 	 * disabled, and it needs no translation...
828 	 */
829 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
830 	if (rc) {
831 		/* "can't" happen */
832 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
833 		return false;
834 	}
835 	vtbar &= 0xffff0000;
836 
837 	/* we know that this iommu should be at offset 0xa000 from vtbar */
838 	drhd = dmar_find_matched_drhd_unit(pdev);
839 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
840 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
841 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
842 		return true;
843 	}
844 
845 	return false;
846 }
847 
848 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
849 {
850 	if (!iommu || iommu->drhd->ignored)
851 		return true;
852 
853 	if (dev_is_pci(dev)) {
854 		struct pci_dev *pdev = to_pci_dev(dev);
855 
856 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
857 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
858 		    quirk_ioat_snb_local_iommu(pdev))
859 			return true;
860 	}
861 
862 	return false;
863 }
864 
865 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
866 {
867 	struct dmar_drhd_unit *drhd = NULL;
868 	struct pci_dev *pdev = NULL;
869 	struct intel_iommu *iommu;
870 	struct device *tmp;
871 	u16 segment = 0;
872 	int i;
873 
874 	if (!dev)
875 		return NULL;
876 
877 	if (dev_is_pci(dev)) {
878 		struct pci_dev *pf_pdev;
879 
880 		pdev = pci_real_dma_dev(to_pci_dev(dev));
881 
882 		/* VFs aren't listed in scope tables; we need to look up
883 		 * the PF instead to find the IOMMU. */
884 		pf_pdev = pci_physfn(pdev);
885 		dev = &pf_pdev->dev;
886 		segment = pci_domain_nr(pdev->bus);
887 	} else if (has_acpi_companion(dev))
888 		dev = &ACPI_COMPANION(dev)->dev;
889 
890 	rcu_read_lock();
891 	for_each_iommu(iommu, drhd) {
892 		if (pdev && segment != drhd->segment)
893 			continue;
894 
895 		for_each_active_dev_scope(drhd->devices,
896 					  drhd->devices_cnt, i, tmp) {
897 			if (tmp == dev) {
898 				/* For a VF use its original BDF# not that of the PF
899 				 * which we used for the IOMMU lookup. Strictly speaking
900 				 * we could do this for all PCI devices; we only need to
901 				 * get the BDF# from the scope table for ACPI matches. */
902 				if (pdev && pdev->is_virtfn)
903 					goto got_pdev;
904 
905 				if (bus && devfn) {
906 					*bus = drhd->devices[i].bus;
907 					*devfn = drhd->devices[i].devfn;
908 				}
909 				goto out;
910 			}
911 
912 			if (is_downstream_to_pci_bridge(dev, tmp))
913 				goto got_pdev;
914 		}
915 
916 		if (pdev && drhd->include_all) {
917 		got_pdev:
918 			if (bus && devfn) {
919 				*bus = pdev->bus->number;
920 				*devfn = pdev->devfn;
921 			}
922 			goto out;
923 		}
924 	}
925 	iommu = NULL;
926  out:
927 	if (iommu_is_dummy(iommu, dev))
928 		iommu = NULL;
929 
930 	rcu_read_unlock();
931 
932 	return iommu;
933 }
934 
935 static void domain_flush_cache(struct dmar_domain *domain,
936 			       void *addr, int size)
937 {
938 	if (!domain->iommu_coherency)
939 		clflush_cache_range(addr, size);
940 }
941 
942 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
943 {
944 	struct context_entry *context;
945 	int ret = 0;
946 	unsigned long flags;
947 
948 	spin_lock_irqsave(&iommu->lock, flags);
949 	context = iommu_context_addr(iommu, bus, devfn, 0);
950 	if (context)
951 		ret = context_present(context);
952 	spin_unlock_irqrestore(&iommu->lock, flags);
953 	return ret;
954 }
955 
956 static void free_context_table(struct intel_iommu *iommu)
957 {
958 	int i;
959 	unsigned long flags;
960 	struct context_entry *context;
961 
962 	spin_lock_irqsave(&iommu->lock, flags);
963 	if (!iommu->root_entry) {
964 		goto out;
965 	}
966 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
967 		context = iommu_context_addr(iommu, i, 0, 0);
968 		if (context)
969 			free_pgtable_page(context);
970 
971 		if (!sm_supported(iommu))
972 			continue;
973 
974 		context = iommu_context_addr(iommu, i, 0x80, 0);
975 		if (context)
976 			free_pgtable_page(context);
977 
978 	}
979 	free_pgtable_page(iommu->root_entry);
980 	iommu->root_entry = NULL;
981 out:
982 	spin_unlock_irqrestore(&iommu->lock, flags);
983 }
984 
985 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
986 				      unsigned long pfn, int *target_level)
987 {
988 	struct dma_pte *parent, *pte;
989 	int level = agaw_to_level(domain->agaw);
990 	int offset;
991 
992 	BUG_ON(!domain->pgd);
993 
994 	if (!domain_pfn_supported(domain, pfn))
995 		/* Address beyond IOMMU's addressing capabilities. */
996 		return NULL;
997 
998 	parent = domain->pgd;
999 
1000 	while (1) {
1001 		void *tmp_page;
1002 
1003 		offset = pfn_level_offset(pfn, level);
1004 		pte = &parent[offset];
1005 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1006 			break;
1007 		if (level == *target_level)
1008 			break;
1009 
1010 		if (!dma_pte_present(pte)) {
1011 			uint64_t pteval;
1012 
1013 			tmp_page = alloc_pgtable_page(domain->nid);
1014 
1015 			if (!tmp_page)
1016 				return NULL;
1017 
1018 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1019 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1020 			if (domain_use_first_level(domain))
1021 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1022 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1023 				/* Someone else set it while we were thinking; use theirs. */
1024 				free_pgtable_page(tmp_page);
1025 			else
1026 				domain_flush_cache(domain, pte, sizeof(*pte));
1027 		}
1028 		if (level == 1)
1029 			break;
1030 
1031 		parent = phys_to_virt(dma_pte_addr(pte));
1032 		level--;
1033 	}
1034 
1035 	if (!*target_level)
1036 		*target_level = level;
1037 
1038 	return pte;
1039 }
1040 
1041 /* return address's pte at specific level */
1042 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1043 					 unsigned long pfn,
1044 					 int level, int *large_page)
1045 {
1046 	struct dma_pte *parent, *pte;
1047 	int total = agaw_to_level(domain->agaw);
1048 	int offset;
1049 
1050 	parent = domain->pgd;
1051 	while (level <= total) {
1052 		offset = pfn_level_offset(pfn, total);
1053 		pte = &parent[offset];
1054 		if (level == total)
1055 			return pte;
1056 
1057 		if (!dma_pte_present(pte)) {
1058 			*large_page = total;
1059 			break;
1060 		}
1061 
1062 		if (dma_pte_superpage(pte)) {
1063 			*large_page = total;
1064 			return pte;
1065 		}
1066 
1067 		parent = phys_to_virt(dma_pte_addr(pte));
1068 		total--;
1069 	}
1070 	return NULL;
1071 }
1072 
1073 /* clear last level pte, a tlb flush should be followed */
1074 static void dma_pte_clear_range(struct dmar_domain *domain,
1075 				unsigned long start_pfn,
1076 				unsigned long last_pfn)
1077 {
1078 	unsigned int large_page;
1079 	struct dma_pte *first_pte, *pte;
1080 
1081 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1082 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1083 	BUG_ON(start_pfn > last_pfn);
1084 
1085 	/* we don't need lock here; nobody else touches the iova range */
1086 	do {
1087 		large_page = 1;
1088 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1089 		if (!pte) {
1090 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1091 			continue;
1092 		}
1093 		do {
1094 			dma_clear_pte(pte);
1095 			start_pfn += lvl_to_nr_pages(large_page);
1096 			pte++;
1097 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1098 
1099 		domain_flush_cache(domain, first_pte,
1100 				   (void *)pte - (void *)first_pte);
1101 
1102 	} while (start_pfn && start_pfn <= last_pfn);
1103 }
1104 
1105 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1106 			       int retain_level, struct dma_pte *pte,
1107 			       unsigned long pfn, unsigned long start_pfn,
1108 			       unsigned long last_pfn)
1109 {
1110 	pfn = max(start_pfn, pfn);
1111 	pte = &pte[pfn_level_offset(pfn, level)];
1112 
1113 	do {
1114 		unsigned long level_pfn;
1115 		struct dma_pte *level_pte;
1116 
1117 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1118 			goto next;
1119 
1120 		level_pfn = pfn & level_mask(level);
1121 		level_pte = phys_to_virt(dma_pte_addr(pte));
1122 
1123 		if (level > 2) {
1124 			dma_pte_free_level(domain, level - 1, retain_level,
1125 					   level_pte, level_pfn, start_pfn,
1126 					   last_pfn);
1127 		}
1128 
1129 		/*
1130 		 * Free the page table if we're below the level we want to
1131 		 * retain and the range covers the entire table.
1132 		 */
1133 		if (level < retain_level && !(start_pfn > level_pfn ||
1134 		      last_pfn < level_pfn + level_size(level) - 1)) {
1135 			dma_clear_pte(pte);
1136 			domain_flush_cache(domain, pte, sizeof(*pte));
1137 			free_pgtable_page(level_pte);
1138 		}
1139 next:
1140 		pfn += level_size(level);
1141 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1142 }
1143 
1144 /*
1145  * clear last level (leaf) ptes and free page table pages below the
1146  * level we wish to keep intact.
1147  */
1148 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1149 				   unsigned long start_pfn,
1150 				   unsigned long last_pfn,
1151 				   int retain_level)
1152 {
1153 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1154 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1155 	BUG_ON(start_pfn > last_pfn);
1156 
1157 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1158 
1159 	/* We don't need lock here; nobody else touches the iova range */
1160 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1161 			   domain->pgd, 0, start_pfn, last_pfn);
1162 
1163 	/* free pgd */
1164 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1165 		free_pgtable_page(domain->pgd);
1166 		domain->pgd = NULL;
1167 	}
1168 }
1169 
1170 /* When a page at a given level is being unlinked from its parent, we don't
1171    need to *modify* it at all. All we need to do is make a list of all the
1172    pages which can be freed just as soon as we've flushed the IOTLB and we
1173    know the hardware page-walk will no longer touch them.
1174    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1175    be freed. */
1176 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1177 					    int level, struct dma_pte *pte,
1178 					    struct page *freelist)
1179 {
1180 	struct page *pg;
1181 
1182 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1183 	pg->freelist = freelist;
1184 	freelist = pg;
1185 
1186 	if (level == 1)
1187 		return freelist;
1188 
1189 	pte = page_address(pg);
1190 	do {
1191 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1192 			freelist = dma_pte_list_pagetables(domain, level - 1,
1193 							   pte, freelist);
1194 		pte++;
1195 	} while (!first_pte_in_page(pte));
1196 
1197 	return freelist;
1198 }
1199 
1200 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1201 					struct dma_pte *pte, unsigned long pfn,
1202 					unsigned long start_pfn,
1203 					unsigned long last_pfn,
1204 					struct page *freelist)
1205 {
1206 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1207 
1208 	pfn = max(start_pfn, pfn);
1209 	pte = &pte[pfn_level_offset(pfn, level)];
1210 
1211 	do {
1212 		unsigned long level_pfn;
1213 
1214 		if (!dma_pte_present(pte))
1215 			goto next;
1216 
1217 		level_pfn = pfn & level_mask(level);
1218 
1219 		/* If range covers entire pagetable, free it */
1220 		if (start_pfn <= level_pfn &&
1221 		    last_pfn >= level_pfn + level_size(level) - 1) {
1222 			/* These subordinate page tables are going away entirely. Don't
1223 			   bother to clear them; we're just going to *free* them. */
1224 			if (level > 1 && !dma_pte_superpage(pte))
1225 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1226 
1227 			dma_clear_pte(pte);
1228 			if (!first_pte)
1229 				first_pte = pte;
1230 			last_pte = pte;
1231 		} else if (level > 1) {
1232 			/* Recurse down into a level that isn't *entirely* obsolete */
1233 			freelist = dma_pte_clear_level(domain, level - 1,
1234 						       phys_to_virt(dma_pte_addr(pte)),
1235 						       level_pfn, start_pfn, last_pfn,
1236 						       freelist);
1237 		}
1238 next:
1239 		pfn += level_size(level);
1240 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1241 
1242 	if (first_pte)
1243 		domain_flush_cache(domain, first_pte,
1244 				   (void *)++last_pte - (void *)first_pte);
1245 
1246 	return freelist;
1247 }
1248 
1249 /* We can't just free the pages because the IOMMU may still be walking
1250    the page tables, and may have cached the intermediate levels. The
1251    pages can only be freed after the IOTLB flush has been done. */
1252 static struct page *domain_unmap(struct dmar_domain *domain,
1253 				 unsigned long start_pfn,
1254 				 unsigned long last_pfn,
1255 				 struct page *freelist)
1256 {
1257 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1258 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1259 	BUG_ON(start_pfn > last_pfn);
1260 
1261 	/* we don't need lock here; nobody else touches the iova range */
1262 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1263 				       domain->pgd, 0, start_pfn, last_pfn,
1264 				       freelist);
1265 
1266 	/* free pgd */
1267 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1268 		struct page *pgd_page = virt_to_page(domain->pgd);
1269 		pgd_page->freelist = freelist;
1270 		freelist = pgd_page;
1271 
1272 		domain->pgd = NULL;
1273 	}
1274 
1275 	return freelist;
1276 }
1277 
1278 static void dma_free_pagelist(struct page *freelist)
1279 {
1280 	struct page *pg;
1281 
1282 	while ((pg = freelist)) {
1283 		freelist = pg->freelist;
1284 		free_pgtable_page(page_address(pg));
1285 	}
1286 }
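/*
 * Typical usage of the two helpers above (sketch only; locking, IOMMU
 * iteration and error handling omitted): collect the obsolete page tables,
 * flush the IOTLB, and only then hand the pages back:
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn, NULL);
 *	iommu_flush_iotlb_psi(iommu, domain, start_pfn, npages, 0, 0);
 *	dma_free_pagelist(freelist);
 */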
1287 
1288 /* iommu handling */
1289 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1290 {
1291 	struct root_entry *root;
1292 	unsigned long flags;
1293 
1294 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1295 	if (!root) {
1296 		pr_err("Allocating root entry for %s failed\n",
1297 			iommu->name);
1298 		return -ENOMEM;
1299 	}
1300 
1301 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1302 
1303 	spin_lock_irqsave(&iommu->lock, flags);
1304 	iommu->root_entry = root;
1305 	spin_unlock_irqrestore(&iommu->lock, flags);
1306 
1307 	return 0;
1308 }
1309 
1310 static void iommu_set_root_entry(struct intel_iommu *iommu)
1311 {
1312 	u64 addr;
1313 	u32 sts;
1314 	unsigned long flag;
1315 
1316 	addr = virt_to_phys(iommu->root_entry);
1317 	if (sm_supported(iommu))
1318 		addr |= DMA_RTADDR_SMT;
1319 
1320 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1321 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1322 
1323 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1324 
1325 	/* Make sure hardware completes it */
1326 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1327 		      readl, (sts & DMA_GSTS_RTPS), sts);
1328 
1329 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1330 }
1331 
1332 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1333 {
1334 	u32 val;
1335 	unsigned long flag;
1336 
1337 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1338 		return;
1339 
1340 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1341 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1342 
1343 	/* Make sure hardware completes it */
1344 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1345 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1346 
1347 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348 }
1349 
1350 /* return value determines if we need a write buffer flush */
1351 static void __iommu_flush_context(struct intel_iommu *iommu,
1352 				  u16 did, u16 source_id, u8 function_mask,
1353 				  u64 type)
1354 {
1355 	u64 val = 0;
1356 	unsigned long flag;
1357 
1358 	switch (type) {
1359 	case DMA_CCMD_GLOBAL_INVL:
1360 		val = DMA_CCMD_GLOBAL_INVL;
1361 		break;
1362 	case DMA_CCMD_DOMAIN_INVL:
1363 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1364 		break;
1365 	case DMA_CCMD_DEVICE_INVL:
1366 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1367 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1368 		break;
1369 	default:
1370 		BUG();
1371 	}
1372 	val |= DMA_CCMD_ICC;
1373 
1374 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1375 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1376 
1377 	/* Make sure hardware completes it */
1378 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1379 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1380 
1381 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1382 }
1383 
1384 /* return value determines if we need a write buffer flush */
1385 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1386 				u64 addr, unsigned int size_order, u64 type)
1387 {
1388 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1389 	u64 val = 0, val_iva = 0;
1390 	unsigned long flag;
1391 
1392 	switch (type) {
1393 	case DMA_TLB_GLOBAL_FLUSH:
1394 		/* global flush doesn't need set IVA_REG */
1395 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1396 		break;
1397 	case DMA_TLB_DSI_FLUSH:
1398 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1399 		break;
1400 	case DMA_TLB_PSI_FLUSH:
1401 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1402 		/* IH bit is passed in as part of address */
1403 		val_iva = size_order | addr;
1404 		break;
1405 	default:
1406 		BUG();
1407 	}
1408 	/* Note: set drain read/write */
1409 #if 0
1410 	/*
1411 	 * This is probably meant to be super secure. Looks like we can
1412 	 * ignore it without any impact.
1413 	 */
1414 	if (cap_read_drain(iommu->cap))
1415 		val |= DMA_TLB_READ_DRAIN;
1416 #endif
1417 	if (cap_write_drain(iommu->cap))
1418 		val |= DMA_TLB_WRITE_DRAIN;
1419 
1420 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1421 	/* Note: Only uses first TLB reg currently */
1422 	if (val_iva)
1423 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1424 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1425 
1426 	/* Make sure hardware completes it */
1427 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1428 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1429 
1430 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1431 
1432 	/* check IOTLB invalidation granularity */
1433 	if (DMA_TLB_IAIG(val) == 0)
1434 		pr_err("Flush IOTLB failed\n");
1435 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1436 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1437 			(unsigned long long)DMA_TLB_IIRG(type),
1438 			(unsigned long long)DMA_TLB_IAIG(val));
1439 }
1440 
1441 static struct device_domain_info *
1442 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1443 			 u8 bus, u8 devfn)
1444 {
1445 	struct device_domain_info *info;
1446 
1447 	assert_spin_locked(&device_domain_lock);
1448 
1449 	if (!iommu->qi)
1450 		return NULL;
1451 
1452 	list_for_each_entry(info, &domain->devices, link)
1453 		if (info->iommu == iommu && info->bus == bus &&
1454 		    info->devfn == devfn) {
1455 			if (info->ats_supported && info->dev)
1456 				return info;
1457 			break;
1458 		}
1459 
1460 	return NULL;
1461 }
1462 
1463 static void domain_update_iotlb(struct dmar_domain *domain)
1464 {
1465 	struct device_domain_info *info;
1466 	bool has_iotlb_device = false;
1467 
1468 	assert_spin_locked(&device_domain_lock);
1469 
1470 	list_for_each_entry(info, &domain->devices, link)
1471 		if (info->ats_enabled) {
1472 			has_iotlb_device = true;
1473 			break;
1474 		}
1475 
1476 	if (!has_iotlb_device) {
1477 		struct subdev_domain_info *sinfo;
1478 
1479 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1480 			info = get_domain_info(sinfo->pdev);
1481 			if (info && info->ats_enabled) {
1482 				has_iotlb_device = true;
1483 				break;
1484 			}
1485 		}
1486 	}
1487 
1488 	domain->has_iotlb_device = has_iotlb_device;
1489 }
1490 
1491 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1492 {
1493 	struct pci_dev *pdev;
1494 
1495 	assert_spin_locked(&device_domain_lock);
1496 
1497 	if (!info || !dev_is_pci(info->dev))
1498 		return;
1499 
1500 	pdev = to_pci_dev(info->dev);
1501 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1502 	 * the PFSID to the invalidation desc of a VF so that IOMMU HW can gauge
1503 	 * the queue depth at PF level. If DIT is not set, PFSID is treated as
1504 	 * reserved and should be set to 0.
1505 	 */
1506 	if (!ecap_dit(info->iommu->ecap))
1507 		info->pfsid = 0;
1508 	else {
1509 		struct pci_dev *pf_pdev;
1510 
1511 		/* pdev will be returned if device is not a vf */
1512 		pf_pdev = pci_physfn(pdev);
1513 		info->pfsid = pci_dev_id(pf_pdev);
1514 	}
1515 
1516 #ifdef CONFIG_INTEL_IOMMU_SVM
1517 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1518 	   the device if you enable PASID support after ATS support is
1519 	   undefined. So always enable PASID support on devices which
1520 	   have it, even if we can't yet know if we're ever going to
1521 	   use it. */
1522 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1523 		info->pasid_enabled = 1;
1524 
1525 	if (info->pri_supported &&
1526 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1527 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1528 		info->pri_enabled = 1;
1529 #endif
1530 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1531 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1532 		info->ats_enabled = 1;
1533 		domain_update_iotlb(info->domain);
1534 		info->ats_qdep = pci_ats_queue_depth(pdev);
1535 	}
1536 }
1537 
1538 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1539 {
1540 	struct pci_dev *pdev;
1541 
1542 	assert_spin_locked(&device_domain_lock);
1543 
1544 	if (!dev_is_pci(info->dev))
1545 		return;
1546 
1547 	pdev = to_pci_dev(info->dev);
1548 
1549 	if (info->ats_enabled) {
1550 		pci_disable_ats(pdev);
1551 		info->ats_enabled = 0;
1552 		domain_update_iotlb(info->domain);
1553 	}
1554 #ifdef CONFIG_INTEL_IOMMU_SVM
1555 	if (info->pri_enabled) {
1556 		pci_disable_pri(pdev);
1557 		info->pri_enabled = 0;
1558 	}
1559 	if (info->pasid_enabled) {
1560 		pci_disable_pasid(pdev);
1561 		info->pasid_enabled = 0;
1562 	}
1563 #endif
1564 }
1565 
1566 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1567 				    u64 addr, unsigned int mask)
1568 {
1569 	u16 sid, qdep;
1570 
1571 	if (!info || !info->ats_enabled)
1572 		return;
1573 
1574 	sid = info->bus << 8 | info->devfn;
1575 	qdep = info->ats_qdep;
1576 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1577 			   qdep, addr, mask);
1578 }
1579 
1580 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1581 				  u64 addr, unsigned mask)
1582 {
1583 	unsigned long flags;
1584 	struct device_domain_info *info;
1585 	struct subdev_domain_info *sinfo;
1586 
1587 	if (!domain->has_iotlb_device)
1588 		return;
1589 
1590 	spin_lock_irqsave(&device_domain_lock, flags);
1591 	list_for_each_entry(info, &domain->devices, link)
1592 		__iommu_flush_dev_iotlb(info, addr, mask);
1593 
1594 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1595 		info = get_domain_info(sinfo->pdev);
1596 		__iommu_flush_dev_iotlb(info, addr, mask);
1597 	}
1598 	spin_unlock_irqrestore(&device_domain_lock, flags);
1599 }
1600 
1601 static void domain_flush_piotlb(struct intel_iommu *iommu,
1602 				struct dmar_domain *domain,
1603 				u64 addr, unsigned long npages, bool ih)
1604 {
1605 	u16 did = domain->iommu_did[iommu->seq_id];
1606 
1607 	if (domain->default_pasid)
1608 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1609 				addr, npages, ih);
1610 
1611 	if (!list_empty(&domain->devices))
1612 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1613 }
1614 
1615 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1616 				  struct dmar_domain *domain,
1617 				  unsigned long pfn, unsigned int pages,
1618 				  int ih, int map)
1619 {
1620 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1621 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1622 	u16 did = domain->iommu_did[iommu->seq_id];
1623 
1624 	BUG_ON(pages == 0);
1625 
1626 	if (ih)
1627 		ih = 1 << 6;
1628 
1629 	if (domain_use_first_level(domain)) {
1630 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1631 	} else {
1632 		/*
1633 		 * Fallback to domain selective flush if no PSI support or
1634 		 * the size is too big. PSI requires page size to be 2 ^ x,
1635 		 * and the base address is naturally aligned to the size.
1636 		 */
1637 		if (!cap_pgsel_inv(iommu->cap) ||
1638 		    mask > cap_max_amask_val(iommu->cap))
1639 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1640 							DMA_TLB_DSI_FLUSH);
1641 		else
1642 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1643 							DMA_TLB_PSI_FLUSH);
1644 	}
1645 
1646 	/*
1647 	 * In caching mode, changes of pages from non-present to present require a
1648 	 * flush. However, the device IOTLB doesn't need to be flushed in this case.
1649 	 */
1650 	if (!cap_caching_mode(iommu->cap) || !map)
1651 		iommu_flush_dev_iotlb(domain, addr, mask);
1652 }
1653 
1654 /* Notification for newly created mappings */
1655 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1656 					struct dmar_domain *domain,
1657 					unsigned long pfn, unsigned int pages)
1658 {
1659 	/*
1660 	 * It's a non-present to present mapping. Only flush if caching mode is
1661 	 * in use and the domain uses second-level translation.
1662 	 */
1663 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1664 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1665 	else
1666 		iommu_flush_write_buffer(iommu);
1667 }
1668 
1669 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1670 {
1671 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1672 	int idx;
1673 
1674 	for_each_domain_iommu(idx, dmar_domain) {
1675 		struct intel_iommu *iommu = g_iommus[idx];
1676 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1677 
1678 		if (domain_use_first_level(dmar_domain))
1679 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1680 		else
1681 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1682 						 DMA_TLB_DSI_FLUSH);
1683 
1684 		if (!cap_caching_mode(iommu->cap))
1685 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1686 					      0, MAX_AGAW_PFN_WIDTH);
1687 	}
1688 }
1689 
1690 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1691 {
1692 	u32 pmen;
1693 	unsigned long flags;
1694 
1695 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1696 		return;
1697 
1698 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1699 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1700 	pmen &= ~DMA_PMEN_EPM;
1701 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1702 
1703 	/* wait for the protected region status bit to clear */
1704 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1705 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1706 
1707 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709 
1710 static void iommu_enable_translation(struct intel_iommu *iommu)
1711 {
1712 	u32 sts;
1713 	unsigned long flags;
1714 
1715 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1716 	iommu->gcmd |= DMA_GCMD_TE;
1717 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1718 
1719 	/* Make sure hardware completes it */
1720 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1721 		      readl, (sts & DMA_GSTS_TES), sts);
1722 
1723 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1724 }
1725 
1726 static void iommu_disable_translation(struct intel_iommu *iommu)
1727 {
1728 	u32 sts;
1729 	unsigned long flag;
1730 
1731 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1732 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1733 		return;
1734 
1735 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1736 	iommu->gcmd &= ~DMA_GCMD_TE;
1737 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1738 
1739 	/* Make sure hardware completes it */
1740 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1741 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1742 
1743 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1744 }
1745 
1746 static int iommu_init_domains(struct intel_iommu *iommu)
1747 {
1748 	u32 ndomains, nlongs;
1749 	size_t size;
1750 
1751 	ndomains = cap_ndoms(iommu->cap);
1752 	pr_debug("%s: Number of Domains supported <%d>\n",
1753 		 iommu->name, ndomains);
1754 	nlongs = BITS_TO_LONGS(ndomains);
1755 
1756 	spin_lock_init(&iommu->lock);
1757 
1758 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1759 	if (!iommu->domain_ids) {
1760 		pr_err("%s: Allocating domain id array failed\n",
1761 		       iommu->name);
1762 		return -ENOMEM;
1763 	}
1764 
1765 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1766 	iommu->domains = kzalloc(size, GFP_KERNEL);
1767 
1768 	if (iommu->domains) {
1769 		size = 256 * sizeof(struct dmar_domain *);
1770 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1771 	}
1772 
1773 	if (!iommu->domains || !iommu->domains[0]) {
1774 		pr_err("%s: Allocating domain array failed\n",
1775 		       iommu->name);
1776 		kfree(iommu->domain_ids);
1777 		kfree(iommu->domains);
1778 		iommu->domain_ids = NULL;
1779 		iommu->domains    = NULL;
1780 		return -ENOMEM;
1781 	}
1782 
1783 	/*
1784 	 * If Caching mode is set, then invalid translations are tagged
1785 	 * with domain-id 0, hence we need to pre-allocate it. We also
1786 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1787 	 * make sure it is not used for a real domain.
1788 	 */
1789 	set_bit(0, iommu->domain_ids);
1790 
1791 	/*
1792 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1793 	 * entry for first-level or pass-through translation modes should
1794 	 * be programmed with a domain id different from those used for
1795 	 * second-level or nested translation. We reserve a domain id for
1796 	 * this purpose.
1797 	 */
1798 	if (sm_supported(iommu))
1799 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1800 
1801 	return 0;
1802 }
1803 
1804 static void disable_dmar_iommu(struct intel_iommu *iommu)
1805 {
1806 	struct device_domain_info *info, *tmp;
1807 	unsigned long flags;
1808 
1809 	if (!iommu->domains || !iommu->domain_ids)
1810 		return;
1811 
1812 	spin_lock_irqsave(&device_domain_lock, flags);
1813 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1814 		if (info->iommu != iommu)
1815 			continue;
1816 
1817 		if (!info->dev || !info->domain)
1818 			continue;
1819 
1820 		__dmar_remove_one_dev_info(info);
1821 	}
1822 	spin_unlock_irqrestore(&device_domain_lock, flags);
1823 
1824 	if (iommu->gcmd & DMA_GCMD_TE)
1825 		iommu_disable_translation(iommu);
1826 }
1827 
1828 static void free_dmar_iommu(struct intel_iommu *iommu)
1829 {
1830 	if ((iommu->domains) && (iommu->domain_ids)) {
1831 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1832 		int i;
1833 
1834 		for (i = 0; i < elems; i++)
1835 			kfree(iommu->domains[i]);
1836 		kfree(iommu->domains);
1837 		kfree(iommu->domain_ids);
1838 		iommu->domains = NULL;
1839 		iommu->domain_ids = NULL;
1840 	}
1841 
1842 	g_iommus[iommu->seq_id] = NULL;
1843 
1844 	/* free context mapping */
1845 	free_context_table(iommu);
1846 
1847 #ifdef CONFIG_INTEL_IOMMU_SVM
1848 	if (pasid_supported(iommu)) {
1849 		if (ecap_prs(iommu->ecap))
1850 			intel_svm_finish_prq(iommu);
1851 	}
1852 	if (vccap_pasid(iommu->vccap))
1853 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1854 
1855 #endif
1856 }
1857 
1858 /*
1859  * Check and return whether first level is used by default for
1860  * DMA translation.
1861  */
1862 static bool first_level_by_default(void)
1863 {
1864 	struct dmar_drhd_unit *drhd;
1865 	struct intel_iommu *iommu;
1866 	static int first_level_support = -1;
1867 
1868 	if (likely(first_level_support != -1))
1869 		return first_level_support;
1870 
1871 	first_level_support = 1;
1872 
1873 	rcu_read_lock();
1874 	for_each_active_iommu(iommu, drhd) {
1875 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1876 			first_level_support = 0;
1877 			break;
1878 		}
1879 	}
1880 	rcu_read_unlock();
1881 
1882 	return first_level_support;
1883 }
1884 
1885 static struct dmar_domain *alloc_domain(int flags)
1886 {
1887 	struct dmar_domain *domain;
1888 
1889 	domain = alloc_domain_mem();
1890 	if (!domain)
1891 		return NULL;
1892 
1893 	memset(domain, 0, sizeof(*domain));
1894 	domain->nid = NUMA_NO_NODE;
1895 	domain->flags = flags;
1896 	if (first_level_by_default())
1897 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1898 	domain->has_iotlb_device = false;
1899 	INIT_LIST_HEAD(&domain->devices);
1900 	INIT_LIST_HEAD(&domain->subdevices);
1901 
1902 	return domain;
1903 }
1904 
1905 /* Must be called with iommu->lock */
1906 static int domain_attach_iommu(struct dmar_domain *domain,
1907 			       struct intel_iommu *iommu)
1908 {
1909 	unsigned long ndomains;
1910 	int num;
1911 
1912 	assert_spin_locked(&device_domain_lock);
1913 	assert_spin_locked(&iommu->lock);
1914 
1915 	domain->iommu_refcnt[iommu->seq_id] += 1;
1916 	domain->iommu_count += 1;
1917 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1918 		ndomains = cap_ndoms(iommu->cap);
1919 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1920 
1921 		if (num >= ndomains) {
1922 			pr_err("%s: No free domain ids\n", iommu->name);
1923 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1924 			domain->iommu_count -= 1;
1925 			return -ENOSPC;
1926 		}
1927 
1928 		set_bit(num, iommu->domain_ids);
1929 		set_iommu_domain(iommu, num, domain);
1930 
1931 		domain->iommu_did[iommu->seq_id] = num;
1932 		domain->nid			 = iommu->node;
1933 
1934 		domain_update_iommu_cap(domain);
1935 	}
1936 
1937 	return 0;
1938 }
1939 
1940 static int domain_detach_iommu(struct dmar_domain *domain,
1941 			       struct intel_iommu *iommu)
1942 {
1943 	int num, count;
1944 
1945 	assert_spin_locked(&device_domain_lock);
1946 	assert_spin_locked(&iommu->lock);
1947 
1948 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1949 	count = --domain->iommu_count;
1950 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1951 		num = domain->iommu_did[iommu->seq_id];
1952 		clear_bit(num, iommu->domain_ids);
1953 		set_iommu_domain(iommu, num, NULL);
1954 
1955 		domain_update_iommu_cap(domain);
1956 		domain->iommu_did[iommu->seq_id] = 0;
1957 	}
1958 
1959 	return count;
1960 }
1961 
1962 static inline int guestwidth_to_adjustwidth(int gaw)
1963 {
1964 	int agaw;
1965 	int r = (gaw - 12) % 9;
1966 
1967 	if (r == 0)
1968 		agaw = gaw;
1969 	else
1970 		agaw = gaw + 9 - r;
1971 	if (agaw > 64)
1972 		agaw = 64;
1973 	return agaw;
1974 }
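
/*
 * A few worked values of the rounding above, shown purely as an
 * illustration (each page-table level adds LEVEL_STRIDE == 9 bits on top
 * of the 12-bit page offset, and the result is capped at 64):
 *
 *	guestwidth_to_adjustwidth(39) == 39	((39 - 12) % 9 == 0)
 *	guestwidth_to_adjustwidth(40) == 48	(r == 1, rounded up by 8)
 *	guestwidth_to_adjustwidth(48) == 48
 *	guestwidth_to_adjustwidth(62) == 64	(66 clamped to the maximum)
 */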
1975 
1976 static void domain_exit(struct dmar_domain *domain)
1977 {
1978 
1979 	/* Remove associated devices and clear attached or cached domains */
1980 	domain_remove_dev_info(domain);
1981 
1982 	/* destroy iovas */
1983 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1984 		iommu_put_dma_cookie(&domain->domain);
1985 
1986 	if (domain->pgd) {
1987 		struct page *freelist;
1988 
1989 		freelist = domain_unmap(domain, 0,
1990 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1991 		dma_free_pagelist(freelist);
1992 	}
1993 
1994 	free_domain_mem(domain);
1995 }
1996 
1997 /*
1998  * Get the PASID directory size for a scalable mode context entry.
1999  * A value of X in the PDTS field of a scalable mode context entry
2000  * indicates a PASID directory with 2^(X + 7) entries.
2001  */
2002 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2003 {
2004 	int pds, max_pde;
2005 
2006 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2007 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2008 	if (pds < 7)
2009 		return 0;
2010 
2011 	return pds - 7;
2012 }
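
/*
 * Worked example of the PDTS calculation above, assuming a 20-bit PASID
 * space and PASID_PDE_SHIFT == 6 (64 PASID-table entries per directory
 * entry); illustrative only:
 *
 *	max_pasid = 1 << 20                ->  max_pde = 1 << 14
 *	find_first_bit(&max_pde, ...) = 14 ->  pds = 14 - 7 = 7
 *
 * A PDTS value of 7 encodes a PASID directory with 2^(7 + 7) = 16384
 * entries, enough to cover the full 20-bit PASID space.
 */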
2013 
2014 /*
2015  * Set the RID_PASID field of a scalable mode context entry. The
2016  * IOMMU hardware will use the PASID value set in this field when
2017  * translating DMA requests without PASID.
2018  */
2019 static inline void
2020 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2021 {
2022 	context->hi |= pasid & ((1 << 20) - 1);
2023 }
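
/*
 * The mask above keeps only the low 20 bits, i.e. the architectural PASID
 * width. A purely illustrative example:
 *
 *	context_set_sm_rid2pasid(context, 0x12345);
 *	  ORs (0x12345 & 0xfffff) == 0x12345 into the RID_PASID field
 *
 * Any bits above bit 19 of the passed-in value would be dropped.
 */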
2024 
2025 /*
2026  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2027  * entry.
2028  */
2029 static inline void context_set_sm_dte(struct context_entry *context)
2030 {
2031 	context->lo |= (1 << 2);
2032 }
2033 
2034 /*
2035  * Set the PRE(Page Request Enable) field of a scalable mode context
2036  * entry.
2037  */
2038 static inline void context_set_sm_pre(struct context_entry *context)
2039 {
2040 	context->lo |= (1 << 4);
2041 }
2042 
2043 /* Convert value to context PASID directory size field coding. */
2044 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2045 
2046 static int domain_context_mapping_one(struct dmar_domain *domain,
2047 				      struct intel_iommu *iommu,
2048 				      struct pasid_table *table,
2049 				      u8 bus, u8 devfn)
2050 {
2051 	u16 did = domain->iommu_did[iommu->seq_id];
2052 	int translation = CONTEXT_TT_MULTI_LEVEL;
2053 	struct device_domain_info *info = NULL;
2054 	struct context_entry *context;
2055 	unsigned long flags;
2056 	int ret;
2057 
2058 	WARN_ON(did == 0);
2059 
2060 	if (hw_pass_through && domain_type_is_si(domain))
2061 		translation = CONTEXT_TT_PASS_THROUGH;
2062 
2063 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2064 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2065 
2066 	BUG_ON(!domain->pgd);
2067 
2068 	spin_lock_irqsave(&device_domain_lock, flags);
2069 	spin_lock(&iommu->lock);
2070 
2071 	ret = -ENOMEM;
2072 	context = iommu_context_addr(iommu, bus, devfn, 1);
2073 	if (!context)
2074 		goto out_unlock;
2075 
2076 	ret = 0;
2077 	if (context_present(context))
2078 		goto out_unlock;
2079 
2080 	/*
2081 	 * For kdump cases, old valid entries may be cached due to the
2082 	 * in-flight DMA and copied pgtable, but there is no unmapping
2083 	 * behaviour for them, thus we need an explicit cache flush for
2084 	 * the newly-mapped device. For kdump, at this point, the device
2085 	 * is supposed to have finished its reset at the driver probe stage,
2086 	 * so no in-flight DMA will exist, and we don't need to worry about
2087 	 * it hereafter.
2088 	 */
2089 	if (context_copied(context)) {
2090 		u16 did_old = context_domain_id(context);
2091 
2092 		if (did_old < cap_ndoms(iommu->cap)) {
2093 			iommu->flush.flush_context(iommu, did_old,
2094 						   (((u16)bus) << 8) | devfn,
2095 						   DMA_CCMD_MASK_NOBIT,
2096 						   DMA_CCMD_DEVICE_INVL);
2097 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2098 						 DMA_TLB_DSI_FLUSH);
2099 		}
2100 	}
2101 
2102 	context_clear_entry(context);
2103 
2104 	if (sm_supported(iommu)) {
2105 		unsigned long pds;
2106 
2107 		WARN_ON(!table);
2108 
2109 		/* Setup the PASID DIR pointer: */
2110 		pds = context_get_sm_pds(table);
2111 		context->lo = (u64)virt_to_phys(table->table) |
2112 				context_pdts(pds);
2113 
2114 		/* Setup the RID_PASID field: */
2115 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2116 
2117 		/*
2118 		 * Setup the Device-TLB enable bit and Page request
2119 		 * Enable bit:
2120 		 */
2121 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2122 		if (info && info->ats_supported)
2123 			context_set_sm_dte(context);
2124 		if (info && info->pri_supported)
2125 			context_set_sm_pre(context);
2126 	} else {
2127 		struct dma_pte *pgd = domain->pgd;
2128 		int agaw;
2129 
2130 		context_set_domain_id(context, did);
2131 
2132 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2133 			/*
2134 			 * Skip the top levels of the page tables for an IOMMU with
2135 			 * a smaller agaw than the default. Unnecessary for PT mode.
2136 			 */
2137 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2138 				ret = -ENOMEM;
2139 				pgd = phys_to_virt(dma_pte_addr(pgd));
2140 				if (!dma_pte_present(pgd))
2141 					goto out_unlock;
2142 			}
2143 
2144 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2145 			if (info && info->ats_supported)
2146 				translation = CONTEXT_TT_DEV_IOTLB;
2147 			else
2148 				translation = CONTEXT_TT_MULTI_LEVEL;
2149 
2150 			context_set_address_root(context, virt_to_phys(pgd));
2151 			context_set_address_width(context, agaw);
2152 		} else {
2153 			/*
2154 			 * In pass-through mode, AW must be programmed to
2155 			 * indicate the largest AGAW value supported by the
2156 			 * hardware, and ASR is ignored by the hardware.
2157 			 */
2158 			context_set_address_width(context, iommu->msagaw);
2159 		}
2160 
2161 		context_set_translation_type(context, translation);
2162 	}
2163 
2164 	context_set_fault_enable(context);
2165 	context_set_present(context);
2166 	if (!ecap_coherent(iommu->ecap))
2167 		clflush_cache_range(context, sizeof(*context));
2168 
2169 	/*
2170 	 * It's a non-present to present mapping. If the hardware doesn't cache
2171 	 * non-present entries we only need to flush the write-buffer. If it
2172 	 * _does_ cache non-present entries, then it does so in the special
2173 	 * domain #0, which we have to flush:
2174 	 */
2175 	if (cap_caching_mode(iommu->cap)) {
2176 		iommu->flush.flush_context(iommu, 0,
2177 					   (((u16)bus) << 8) | devfn,
2178 					   DMA_CCMD_MASK_NOBIT,
2179 					   DMA_CCMD_DEVICE_INVL);
2180 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2181 	} else {
2182 		iommu_flush_write_buffer(iommu);
2183 	}
2184 	iommu_enable_dev_iotlb(info);
2185 
2186 	ret = 0;
2187 
2188 out_unlock:
2189 	spin_unlock(&iommu->lock);
2190 	spin_unlock_irqrestore(&device_domain_lock, flags);
2191 
2192 	return ret;
2193 }
2194 
2195 struct domain_context_mapping_data {
2196 	struct dmar_domain *domain;
2197 	struct intel_iommu *iommu;
2198 	struct pasid_table *table;
2199 };
2200 
2201 static int domain_context_mapping_cb(struct pci_dev *pdev,
2202 				     u16 alias, void *opaque)
2203 {
2204 	struct domain_context_mapping_data *data = opaque;
2205 
2206 	return domain_context_mapping_one(data->domain, data->iommu,
2207 					  data->table, PCI_BUS_NUM(alias),
2208 					  alias & 0xff);
2209 }
2210 
2211 static int
2212 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2213 {
2214 	struct domain_context_mapping_data data;
2215 	struct pasid_table *table;
2216 	struct intel_iommu *iommu;
2217 	u8 bus, devfn;
2218 
2219 	iommu = device_to_iommu(dev, &bus, &devfn);
2220 	if (!iommu)
2221 		return -ENODEV;
2222 
2223 	table = intel_pasid_get_table(dev);
2224 
2225 	if (!dev_is_pci(dev))
2226 		return domain_context_mapping_one(domain, iommu, table,
2227 						  bus, devfn);
2228 
2229 	data.domain = domain;
2230 	data.iommu = iommu;
2231 	data.table = table;
2232 
2233 	return pci_for_each_dma_alias(to_pci_dev(dev),
2234 				      &domain_context_mapping_cb, &data);
2235 }
2236 
2237 static int domain_context_mapped_cb(struct pci_dev *pdev,
2238 				    u16 alias, void *opaque)
2239 {
2240 	struct intel_iommu *iommu = opaque;
2241 
2242 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2243 }
2244 
2245 static int domain_context_mapped(struct device *dev)
2246 {
2247 	struct intel_iommu *iommu;
2248 	u8 bus, devfn;
2249 
2250 	iommu = device_to_iommu(dev, &bus, &devfn);
2251 	if (!iommu)
2252 		return -ENODEV;
2253 
2254 	if (!dev_is_pci(dev))
2255 		return device_context_mapped(iommu, bus, devfn);
2256 
2257 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2258 				       domain_context_mapped_cb, iommu);
2259 }
2260 
2261 /* Returns the number of VT-d pages, but aligned to the MM page size */
2262 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2263 					    size_t size)
2264 {
2265 	host_addr &= ~PAGE_MASK;
2266 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2267 }
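
/*
 * Worked example of aligned_nrpages(), assuming 4KiB MM pages and
 * VTD_PAGE_SHIFT == 12 (illustrative only):
 *
 *	host_addr = 0x1234, size = 0x2000 (8KiB, not page aligned)
 *	  offset within page = 0x234
 *	  PAGE_ALIGN(0x234 + 0x2000) = 0x3000  ->  3 VT-d pages
 *
 * An unaligned buffer can therefore straddle one more page than its raw
 * size would suggest.
 */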
2268 
2269 /* Return largest possible superpage level for a given mapping */
2270 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2271 					  unsigned long iov_pfn,
2272 					  unsigned long phy_pfn,
2273 					  unsigned long pages)
2274 {
2275 	int support, level = 1;
2276 	unsigned long pfnmerge;
2277 
2278 	support = domain->iommu_superpage;
2279 
2280 	/* To use a large page, the virtual *and* physical addresses
2281 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2282 	   of them will mean we have to use smaller pages. So just
2283 	   merge them and check both at once. */
2284 	pfnmerge = iov_pfn | phy_pfn;
2285 
2286 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2287 		pages >>= VTD_STRIDE_SHIFT;
2288 		if (!pages)
2289 			break;
2290 		pfnmerge >>= VTD_STRIDE_SHIFT;
2291 		level++;
2292 		support--;
2293 	}
2294 	return level;
2295 }
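
/*
 * Worked example of the level selection above, assuming
 * VTD_STRIDE_SHIFT == 9 and domain->iommu_superpage == 1 (2MiB pages
 * supported); illustrative only:
 *
 *	iov_pfn = 0x200, phy_pfn = 0x400, pages = 0x400
 *	  pfnmerge = 0x600, low 9 bits clear -> one iteration:
 *	  pages >>= 9 (still non-zero), level = 2, support drops to 0
 *	  -> return 2, i.e. 2MiB superpages can be used
 *
 * If either pfn had any of its low 9 bits set, the loop would not run and
 * the mapping would fall back to level 1 (4KiB pages).
 */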
2296 
2297 static int
2298 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2299 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2300 {
2301 	struct dma_pte *first_pte = NULL, *pte = NULL;
2302 	unsigned int largepage_lvl = 0;
2303 	unsigned long lvl_pages = 0;
2304 	phys_addr_t pteval;
2305 	u64 attr;
2306 
2307 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2308 
2309 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2310 		return -EINVAL;
2311 
2312 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2313 	if (domain_use_first_level(domain))
2314 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2315 
2316 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2317 
2318 	while (nr_pages > 0) {
2319 		uint64_t tmp;
2320 
2321 		if (!pte) {
2322 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2323 					phys_pfn, nr_pages);
2324 
2325 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2326 			if (!pte)
2327 				return -ENOMEM;
2328 			/* It is a large page */
2329 			if (largepage_lvl > 1) {
2330 				unsigned long nr_superpages, end_pfn;
2331 
2332 				pteval |= DMA_PTE_LARGE_PAGE;
2333 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2334 
2335 				nr_superpages = nr_pages / lvl_pages;
2336 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2337 
2338 				/*
2339 				 * Ensure that old small page tables are
2340 				 * removed to make room for superpage(s).
2341 				 * We're adding new large pages, so make sure
2342 				 * we don't remove their parent tables.
2343 				 */
2344 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2345 						       largepage_lvl + 1);
2346 			} else {
2347 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2348 			}
2349 
2350 		}
2351 		/* We don't need a lock here; nobody else
2352 		 * touches this iova range.
2353 		 */
2354 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2355 		if (tmp) {
2356 			static int dumps = 5;
2357 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2358 				iov_pfn, tmp, (unsigned long long)pteval);
2359 			if (dumps) {
2360 				dumps--;
2361 				debug_dma_dump_mappings(NULL);
2362 			}
2363 			WARN_ON(1);
2364 		}
2365 
2366 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2367 
2368 		BUG_ON(nr_pages < lvl_pages);
2369 
2370 		nr_pages -= lvl_pages;
2371 		iov_pfn += lvl_pages;
2372 		phys_pfn += lvl_pages;
2373 		pteval += lvl_pages * VTD_PAGE_SIZE;
2374 
2375 		/* If the next PTE would be the first in a new page, then we
2376 		 * need to flush the cache on the entries we've just written.
2377 		 * And then we'll need to recalculate 'pte', so clear it and
2378 		 * let it get set again in the if (!pte) block above.
2379 		 *
2380 		 * If we're done (!nr_pages) we need to flush the cache too.
2381 		 *
2382 		 * Also if we've been setting superpages, we may need to
2383 		 * recalculate 'pte' and switch back to smaller pages for the
2384 		 * end of the mapping, if the trailing size is not enough to
2385 		 * use another superpage (i.e. nr_pages < lvl_pages).
2386 		 */
2387 		pte++;
2388 		if (!nr_pages || first_pte_in_page(pte) ||
2389 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2390 			domain_flush_cache(domain, first_pte,
2391 					   (void *)pte - (void *)first_pte);
2392 			pte = NULL;
2393 		}
2394 	}
2395 
2396 	return 0;
2397 }
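
/*
 * Worked example of the superpage handling in __domain_mapping(),
 * assuming 2MiB superpages are available and both pfns are 2MiB aligned
 * (illustrative only):
 *
 *	nr_pages = 513 (2MiB + 4KiB)
 *	  1st pass: largepage_lvl = 2, lvl_pages = 512,
 *	            one large PTE written with DMA_PTE_LARGE_PAGE set
 *	  2nd pass: nr_pages = 1 < lvl_pages, so 'pte' is recalculated
 *	            and the tail is mapped with a single 4KiB PTE
 */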
2398 
2399 static int
2400 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2401 	       unsigned long phys_pfn, unsigned long nr_pages, int prot)
2402 {
2403 	int iommu_id, ret;
2404 	struct intel_iommu *iommu;
2405 
2406 	/* Do the real mapping first */
2407 	ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2408 	if (ret)
2409 		return ret;
2410 
2411 	for_each_domain_iommu(iommu_id, domain) {
2412 		iommu = g_iommus[iommu_id];
2413 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2414 	}
2415 
2416 	return 0;
2417 }
2418 
2419 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2420 {
2421 	unsigned long flags;
2422 	struct context_entry *context;
2423 	u16 did_old;
2424 
2425 	if (!iommu)
2426 		return;
2427 
2428 	spin_lock_irqsave(&iommu->lock, flags);
2429 	context = iommu_context_addr(iommu, bus, devfn, 0);
2430 	if (!context) {
2431 		spin_unlock_irqrestore(&iommu->lock, flags);
2432 		return;
2433 	}
2434 	did_old = context_domain_id(context);
2435 	context_clear_entry(context);
2436 	__iommu_flush_cache(iommu, context, sizeof(*context));
2437 	spin_unlock_irqrestore(&iommu->lock, flags);
2438 	iommu->flush.flush_context(iommu,
2439 				   did_old,
2440 				   (((u16)bus) << 8) | devfn,
2441 				   DMA_CCMD_MASK_NOBIT,
2442 				   DMA_CCMD_DEVICE_INVL);
2443 	iommu->flush.flush_iotlb(iommu,
2444 				 did_old,
2445 				 0,
2446 				 0,
2447 				 DMA_TLB_DSI_FLUSH);
2448 }
2449 
2450 static inline void unlink_domain_info(struct device_domain_info *info)
2451 {
2452 	assert_spin_locked(&device_domain_lock);
2453 	list_del(&info->link);
2454 	list_del(&info->global);
2455 	if (info->dev)
2456 		dev_iommu_priv_set(info->dev, NULL);
2457 }
2458 
2459 static void domain_remove_dev_info(struct dmar_domain *domain)
2460 {
2461 	struct device_domain_info *info, *tmp;
2462 	unsigned long flags;
2463 
2464 	spin_lock_irqsave(&device_domain_lock, flags);
2465 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2466 		__dmar_remove_one_dev_info(info);
2467 	spin_unlock_irqrestore(&device_domain_lock, flags);
2468 }
2469 
2470 struct dmar_domain *find_domain(struct device *dev)
2471 {
2472 	struct device_domain_info *info;
2473 
2474 	if (unlikely(!dev || !dev->iommu))
2475 		return NULL;
2476 
2477 	if (unlikely(attach_deferred(dev)))
2478 		return NULL;
2479 
2480 	/* No lock needed here; we assume no domain exit in the normal case */
2481 	info = get_domain_info(dev);
2482 	if (likely(info))
2483 		return info->domain;
2484 
2485 	return NULL;
2486 }
2487 
2488 static inline struct device_domain_info *
2489 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2490 {
2491 	struct device_domain_info *info;
2492 
2493 	list_for_each_entry(info, &device_domain_list, global)
2494 		if (info->segment == segment && info->bus == bus &&
2495 		    info->devfn == devfn)
2496 			return info;
2497 
2498 	return NULL;
2499 }
2500 
2501 static int domain_setup_first_level(struct intel_iommu *iommu,
2502 				    struct dmar_domain *domain,
2503 				    struct device *dev,
2504 				    u32 pasid)
2505 {
2506 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2507 	struct dma_pte *pgd = domain->pgd;
2508 	int agaw, level;
2509 
2510 	/*
2511 	 * Skip the top levels of the page tables for an IOMMU with
2512 	 * a smaller agaw than the default. Unnecessary for PT mode.
2513 	 */
2514 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2515 		pgd = phys_to_virt(dma_pte_addr(pgd));
2516 		if (!dma_pte_present(pgd))
2517 			return -ENOMEM;
2518 	}
2519 
2520 	level = agaw_to_level(agaw);
2521 	if (level != 4 && level != 5)
2522 		return -EINVAL;
2523 
2524 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2525 
2526 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2527 					     domain->iommu_did[iommu->seq_id],
2528 					     flags);
2529 }
2530 
2531 static bool dev_is_real_dma_subdevice(struct device *dev)
2532 {
2533 	return dev && dev_is_pci(dev) &&
2534 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2535 }
2536 
2537 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2538 						    int bus, int devfn,
2539 						    struct device *dev,
2540 						    struct dmar_domain *domain)
2541 {
2542 	struct dmar_domain *found = NULL;
2543 	struct device_domain_info *info;
2544 	unsigned long flags;
2545 	int ret;
2546 
2547 	info = alloc_devinfo_mem();
2548 	if (!info)
2549 		return NULL;
2550 
2551 	if (!dev_is_real_dma_subdevice(dev)) {
2552 		info->bus = bus;
2553 		info->devfn = devfn;
2554 		info->segment = iommu->segment;
2555 	} else {
2556 		struct pci_dev *pdev = to_pci_dev(dev);
2557 
2558 		info->bus = pdev->bus->number;
2559 		info->devfn = pdev->devfn;
2560 		info->segment = pci_domain_nr(pdev->bus);
2561 	}
2562 
2563 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2564 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2565 	info->ats_qdep = 0;
2566 	info->dev = dev;
2567 	info->domain = domain;
2568 	info->iommu = iommu;
2569 	info->pasid_table = NULL;
2570 	info->auxd_enabled = 0;
2571 	INIT_LIST_HEAD(&info->subdevices);
2572 
2573 	if (dev && dev_is_pci(dev)) {
2574 		struct pci_dev *pdev = to_pci_dev(info->dev);
2575 
2576 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2577 		    pci_ats_supported(pdev) &&
2578 		    dmar_find_matched_atsr_unit(pdev))
2579 			info->ats_supported = 1;
2580 
2581 		if (sm_supported(iommu)) {
2582 			if (pasid_supported(iommu)) {
2583 				int features = pci_pasid_features(pdev);
2584 				if (features >= 0)
2585 					info->pasid_supported = features | 1;
2586 			}
2587 
2588 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2589 			    pci_pri_supported(pdev))
2590 				info->pri_supported = 1;
2591 		}
2592 	}
2593 
2594 	spin_lock_irqsave(&device_domain_lock, flags);
2595 	if (dev)
2596 		found = find_domain(dev);
2597 
2598 	if (!found) {
2599 		struct device_domain_info *info2;
2600 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2601 						       info->devfn);
2602 		if (info2) {
2603 			found      = info2->domain;
2604 			info2->dev = dev;
2605 		}
2606 	}
2607 
2608 	if (found) {
2609 		spin_unlock_irqrestore(&device_domain_lock, flags);
2610 		free_devinfo_mem(info);
2611 		/* Caller must free the original domain */
2612 		return found;
2613 	}
2614 
2615 	spin_lock(&iommu->lock);
2616 	ret = domain_attach_iommu(domain, iommu);
2617 	spin_unlock(&iommu->lock);
2618 
2619 	if (ret) {
2620 		spin_unlock_irqrestore(&device_domain_lock, flags);
2621 		free_devinfo_mem(info);
2622 		return NULL;
2623 	}
2624 
2625 	list_add(&info->link, &domain->devices);
2626 	list_add(&info->global, &device_domain_list);
2627 	if (dev)
2628 		dev_iommu_priv_set(dev, info);
2629 	spin_unlock_irqrestore(&device_domain_lock, flags);
2630 
2631 	/* PASID table is mandatory for a PCI device in scalable mode. */
2632 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2633 		ret = intel_pasid_alloc_table(dev);
2634 		if (ret) {
2635 			dev_err(dev, "PASID table allocation failed\n");
2636 			dmar_remove_one_dev_info(dev);
2637 			return NULL;
2638 		}
2639 
2640 		/* Setup the PASID entry for requests without PASID: */
2641 		spin_lock_irqsave(&iommu->lock, flags);
2642 		if (hw_pass_through && domain_type_is_si(domain))
2643 			ret = intel_pasid_setup_pass_through(iommu, domain,
2644 					dev, PASID_RID2PASID);
2645 		else if (domain_use_first_level(domain))
2646 			ret = domain_setup_first_level(iommu, domain, dev,
2647 					PASID_RID2PASID);
2648 		else
2649 			ret = intel_pasid_setup_second_level(iommu, domain,
2650 					dev, PASID_RID2PASID);
2651 		spin_unlock_irqrestore(&iommu->lock, flags);
2652 		if (ret) {
2653 			dev_err(dev, "Setup RID2PASID failed\n");
2654 			dmar_remove_one_dev_info(dev);
2655 			return NULL;
2656 		}
2657 	}
2658 
2659 	if (dev && domain_context_mapping(domain, dev)) {
2660 		dev_err(dev, "Domain context map failed\n");
2661 		dmar_remove_one_dev_info(dev);
2662 		return NULL;
2663 	}
2664 
2665 	return domain;
2666 }
2667 
2668 static int iommu_domain_identity_map(struct dmar_domain *domain,
2669 				     unsigned long first_vpfn,
2670 				     unsigned long last_vpfn)
2671 {
2672 	/*
2673 	 * The RMRR range might overlap with the physical memory range,
2674 	 * so clear it first.
2675 	 */
2676 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2677 
2678 	return __domain_mapping(domain, first_vpfn,
2679 				first_vpfn, last_vpfn - first_vpfn + 1,
2680 				DMA_PTE_READ|DMA_PTE_WRITE);
2681 }
2682 
2683 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2684 
2685 static int __init si_domain_init(int hw)
2686 {
2687 	struct dmar_rmrr_unit *rmrr;
2688 	struct device *dev;
2689 	int i, nid, ret;
2690 
2691 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2692 	if (!si_domain)
2693 		return -EFAULT;
2694 
2695 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2696 		domain_exit(si_domain);
2697 		return -EFAULT;
2698 	}
2699 
2700 	if (hw)
2701 		return 0;
2702 
2703 	for_each_online_node(nid) {
2704 		unsigned long start_pfn, end_pfn;
2705 		int i;
2706 
2707 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2708 			ret = iommu_domain_identity_map(si_domain,
2709 					mm_to_dma_pfn(start_pfn),
2710 					mm_to_dma_pfn(end_pfn));
2711 			if (ret)
2712 				return ret;
2713 		}
2714 	}
2715 
2716 	/*
2717 	 * Identity map the RMRRs so that devices with RMRRs can also use
2718 	 * the si_domain.
2719 	 */
2720 	for_each_rmrr_units(rmrr) {
2721 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2722 					  i, dev) {
2723 			unsigned long long start = rmrr->base_address;
2724 			unsigned long long end = rmrr->end_address;
2725 
2726 			if (WARN_ON(end < start ||
2727 				    end >> agaw_to_width(si_domain->agaw)))
2728 				continue;
2729 
2730 			ret = iommu_domain_identity_map(si_domain,
2731 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2732 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2733 			if (ret)
2734 				return ret;
2735 		}
2736 	}
2737 
2738 	return 0;
2739 }
2740 
2741 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2742 {
2743 	struct dmar_domain *ndomain;
2744 	struct intel_iommu *iommu;
2745 	u8 bus, devfn;
2746 
2747 	iommu = device_to_iommu(dev, &bus, &devfn);
2748 	if (!iommu)
2749 		return -ENODEV;
2750 
2751 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2752 	if (ndomain != domain)
2753 		return -EBUSY;
2754 
2755 	return 0;
2756 }
2757 
2758 static bool device_has_rmrr(struct device *dev)
2759 {
2760 	struct dmar_rmrr_unit *rmrr;
2761 	struct device *tmp;
2762 	int i;
2763 
2764 	rcu_read_lock();
2765 	for_each_rmrr_units(rmrr) {
2766 		/*
2767 		 * Return TRUE if this RMRR contains the device that
2768 		 * is passed in.
2769 		 */
2770 		for_each_active_dev_scope(rmrr->devices,
2771 					  rmrr->devices_cnt, i, tmp)
2772 			if (tmp == dev ||
2773 			    is_downstream_to_pci_bridge(dev, tmp)) {
2774 				rcu_read_unlock();
2775 				return true;
2776 			}
2777 	}
2778 	rcu_read_unlock();
2779 	return false;
2780 }
2781 
2782 /**
2783  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2784  * is relaxable (ie. is allowed to be not enforced under some conditions)
2785  * is relaxable (i.e. is allowed not to be enforced under some conditions)
2786  *
2787  * We assume that PCI USB devices with RMRRs have them largely
2788  * for historical reasons and that the RMRR space is not actively used post
2789  * boot.  This exclusion may change if vendors begin to abuse it.
2790  *
2791  * The same exception is made for graphics devices, with the requirement that
2792  * any use of the RMRR regions will be torn down before assigning the device
2793  * to a guest.
2794  *
2795  * Return: true if the RMRR is relaxable, false otherwise
2796  */
2797 static bool device_rmrr_is_relaxable(struct device *dev)
2798 {
2799 	struct pci_dev *pdev;
2800 
2801 	if (!dev_is_pci(dev))
2802 		return false;
2803 
2804 	pdev = to_pci_dev(dev);
2805 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2806 		return true;
2807 	else
2808 		return false;
2809 }
2810 
2811 /*
2812  * There are a couple of cases where we need to restrict the functionality of
2813  * devices associated with RMRRs.  The first is when evaluating a device for
2814  * identity mapping because problems exist when devices are moved in and out
2815  * of domains and their respective RMRR information is lost.  This means that
2816  * a device with associated RMRRs will never be in a "passthrough" domain.
2817  * The second is use of the device through the IOMMU API.  This interface
2818  * expects to have full control of the IOVA space for the device.  We cannot
2819  * satisfy both the requirement that RMRR access is maintained and have an
2820  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2821  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2822  * We therefore prevent devices associated with an RMRR from participating in
2823  * the IOMMU API, which eliminates them from device assignment.
2824  *
2825  * In both cases, devices which have relaxable RMRRs are not concerned by this
2826  * restriction. See device_rmrr_is_relaxable comment.
2827  */
2828 static bool device_is_rmrr_locked(struct device *dev)
2829 {
2830 	if (!device_has_rmrr(dev))
2831 		return false;
2832 
2833 	if (device_rmrr_is_relaxable(dev))
2834 		return false;
2835 
2836 	return true;
2837 }
2838 
2839 /*
2840  * Return the required default domain type for a specific device.
2841  *
2842  * @dev: the device in query
2844  *
2845  * Returns:
2846  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2847  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2848  *  - 0: both identity and dynamic domains work for this device
2849  */
2850 static int device_def_domain_type(struct device *dev)
2851 {
2852 	if (dev_is_pci(dev)) {
2853 		struct pci_dev *pdev = to_pci_dev(dev);
2854 
2855 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2856 			return IOMMU_DOMAIN_IDENTITY;
2857 
2858 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2859 			return IOMMU_DOMAIN_IDENTITY;
2860 	}
2861 
2862 	return 0;
2863 }
2864 
2865 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2866 {
2867 	/*
2868 	 * Start from a sane IOMMU hardware state.
2869 	 * If queued invalidation was already initialized by us
2870 	 * (for example, while enabling interrupt remapping), then
2871 	 * things are already rolling from a sane state.
2872 	 */
2873 	if (!iommu->qi) {
2874 		/*
2875 		 * Clear any previous faults.
2876 		 */
2877 		dmar_fault(-1, iommu);
2878 		/*
2879 		 * Disable queued invalidation if supported and already enabled
2880 		 * before OS handover.
2881 		 */
2882 		dmar_disable_qi(iommu);
2883 	}
2884 
2885 	if (dmar_enable_qi(iommu)) {
2886 		/*
2887 		 * Queued invalidation is not enabled, so use register-based invalidation
2888 		 */
2889 		iommu->flush.flush_context = __iommu_flush_context;
2890 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2891 		pr_info("%s: Using Register based invalidation\n",
2892 			iommu->name);
2893 	} else {
2894 		iommu->flush.flush_context = qi_flush_context;
2895 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2896 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2897 	}
2898 }
2899 
2900 static int copy_context_table(struct intel_iommu *iommu,
2901 			      struct root_entry *old_re,
2902 			      struct context_entry **tbl,
2903 			      int bus, bool ext)
2904 {
2905 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2906 	struct context_entry *new_ce = NULL, ce;
2907 	struct context_entry *old_ce = NULL;
2908 	struct root_entry re;
2909 	phys_addr_t old_ce_phys;
2910 
2911 	tbl_idx = ext ? bus * 2 : bus;
2912 	memcpy(&re, old_re, sizeof(re));
2913 
2914 	for (devfn = 0; devfn < 256; devfn++) {
2915 		/* First calculate the correct index */
2916 		idx = (ext ? devfn * 2 : devfn) % 256;
2917 
2918 		if (idx == 0) {
2919 			/* First save what we may have and clean up */
2920 			if (new_ce) {
2921 				tbl[tbl_idx] = new_ce;
2922 				__iommu_flush_cache(iommu, new_ce,
2923 						    VTD_PAGE_SIZE);
2924 				pos = 1;
2925 			}
2926 
2927 			if (old_ce)
2928 				memunmap(old_ce);
2929 
2930 			ret = 0;
2931 			if (devfn < 0x80)
2932 				old_ce_phys = root_entry_lctp(&re);
2933 			else
2934 				old_ce_phys = root_entry_uctp(&re);
2935 
2936 			if (!old_ce_phys) {
2937 				if (ext && devfn == 0) {
2938 					/* No LCTP, try UCTP */
2939 					devfn = 0x7f;
2940 					continue;
2941 				} else {
2942 					goto out;
2943 				}
2944 			}
2945 
2946 			ret = -ENOMEM;
2947 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2948 					MEMREMAP_WB);
2949 			if (!old_ce)
2950 				goto out;
2951 
2952 			new_ce = alloc_pgtable_page(iommu->node);
2953 			if (!new_ce)
2954 				goto out_unmap;
2955 
2956 			ret = 0;
2957 		}
2958 
2959 		/* Now copy the context entry */
2960 		memcpy(&ce, old_ce + idx, sizeof(ce));
2961 
2962 		if (!__context_present(&ce))
2963 			continue;
2964 
2965 		did = context_domain_id(&ce);
2966 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2967 			set_bit(did, iommu->domain_ids);
2968 
2969 		/*
2970 		 * We need a marker for copied context entries. This
2971 		 * marker needs to work for the old format as well as
2972 		 * for extended context entries.
2973 		 *
2974 		 * Bit 67 of the context entry is used. In the old
2975 		 * format this bit is available to software, in the
2976 		 * extended format it is the PGE bit, but PGE is ignored
2977 		 * by HW if PASIDs are disabled (and thus still
2978 		 * available).
2979 		 *
2980 		 * So disable PASIDs first and then mark the entry
2981 		 * copied. This means that we don't copy PASID
2982 		 * translations from the old kernel, but this is fine as
2983 		 * faults there are not fatal.
2984 		 */
2985 		context_clear_pasid_enable(&ce);
2986 		context_set_copied(&ce);
2987 
2988 		new_ce[idx] = ce;
2989 	}
2990 
2991 	tbl[tbl_idx + pos] = new_ce;
2992 
2993 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2994 
2995 out_unmap:
2996 	memunmap(old_ce);
2997 
2998 out:
2999 	return ret;
3000 }
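
/*
 * Index math used above, shown for the extended (ECS) layout; the values
 * are purely illustrative:
 *
 *	bus = 3, devfn = 0x81  ->  tbl_idx = 6, idx = (0x81 * 2) % 256 = 2
 *
 * Devfns 0x00-0x7f come from the lower context-table pointer and land in
 * ctxt_tbls[bus * 2]; when devfn reaches 0x80 the index wraps to 0, 'pos'
 * becomes 1, and the entries from the upper context-table pointer end up
 * in ctxt_tbls[bus * 2 + 1]. Without ECS there is a single table per bus
 * and idx is simply devfn.
 */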
3001 
3002 static int copy_translation_tables(struct intel_iommu *iommu)
3003 {
3004 	struct context_entry **ctxt_tbls;
3005 	struct root_entry *old_rt;
3006 	phys_addr_t old_rt_phys;
3007 	int ctxt_table_entries;
3008 	unsigned long flags;
3009 	u64 rtaddr_reg;
3010 	int bus, ret;
3011 	bool new_ext, ext;
3012 
3013 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3014 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3015 	new_ext    = !!ecap_ecs(iommu->ecap);
3016 
3017 	/*
3018 	 * The RTT bit can only be changed when translation is disabled,
3019 	 * but disabling translation would open a window for data
3020 	 * corruption. So bail out and don't copy anything if we would
3021 	 * have to change the bit.
3022 	 */
3023 	if (new_ext != ext)
3024 		return -EINVAL;
3025 
3026 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3027 	if (!old_rt_phys)
3028 		return -EINVAL;
3029 
3030 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3031 	if (!old_rt)
3032 		return -ENOMEM;
3033 
3034 	/* This is too big for the stack - allocate it from slab */
3035 	ctxt_table_entries = ext ? 512 : 256;
3036 	ret = -ENOMEM;
3037 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3038 	if (!ctxt_tbls)
3039 		goto out_unmap;
3040 
3041 	for (bus = 0; bus < 256; bus++) {
3042 		ret = copy_context_table(iommu, &old_rt[bus],
3043 					 ctxt_tbls, bus, ext);
3044 		if (ret) {
3045 			pr_err("%s: Failed to copy context table for bus %d\n",
3046 				iommu->name, bus);
3047 			continue;
3048 		}
3049 	}
3050 
3051 	spin_lock_irqsave(&iommu->lock, flags);
3052 
3053 	/* Context tables are copied, now write them to the root_entry table */
3054 	for (bus = 0; bus < 256; bus++) {
3055 		int idx = ext ? bus * 2 : bus;
3056 		u64 val;
3057 
3058 		if (ctxt_tbls[idx]) {
3059 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3060 			iommu->root_entry[bus].lo = val;
3061 		}
3062 
3063 		if (!ext || !ctxt_tbls[idx + 1])
3064 			continue;
3065 
3066 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3067 		iommu->root_entry[bus].hi = val;
3068 	}
3069 
3070 	spin_unlock_irqrestore(&iommu->lock, flags);
3071 
3072 	kfree(ctxt_tbls);
3073 
3074 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3075 
3076 	ret = 0;
3077 
3078 out_unmap:
3079 	memunmap(old_rt);
3080 
3081 	return ret;
3082 }
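
/*
 * Sketch of how the copied tables are wired into the new root table above
 * (illustrative, with the index expanded for the ECS case):
 *
 *	iommu->root_entry[bus].lo = virt_to_phys(ctxt_tbls[bus * 2]) | 1;
 *	iommu->root_entry[bus].hi = virt_to_phys(ctxt_tbls[bus * 2 + 1]) | 1;
 *
 * 'lo' covers devfns 0x00-0x7f (or the whole bus without ECS), 'hi'
 * covers devfns 0x80-0xff, and the "| 1" sets the corresponding present
 * bit so the hardware will walk the copied table.
 */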
3083 
3084 #ifdef CONFIG_INTEL_IOMMU_SVM
3085 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3086 {
3087 	struct intel_iommu *iommu = data;
3088 	ioasid_t ioasid;
3089 
3090 	if (!iommu)
3091 		return INVALID_IOASID;
3092 	/*
3093 	 * The VT-d virtual command interface always uses the full 20-bit
3094 	 * PASID range. The host can partition the guest PASID range based
3095 	 * on its policies, but that is out of the guest's control.
3096 	 */
3097 	if (min < PASID_MIN || max > intel_pasid_max_id)
3098 		return INVALID_IOASID;
3099 
3100 	if (vcmd_alloc_pasid(iommu, &ioasid))
3101 		return INVALID_IOASID;
3102 
3103 	return ioasid;
3104 }
3105 
3106 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3107 {
3108 	struct intel_iommu *iommu = data;
3109 
3110 	if (!iommu)
3111 		return;
3112 	/*
3113 	 * The sanity check of the ioasid owner is done at the upper layer,
3114 	 * e.g. VFIO. We can only free the PASID when all the devices are unbound.
3115 	 */
3116 	if (ioasid_find(NULL, ioasid, NULL)) {
3117 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3118 		return;
3119 	}
3120 	vcmd_free_pasid(iommu, ioasid);
3121 }
3122 
3123 static void register_pasid_allocator(struct intel_iommu *iommu)
3124 {
3125 	/*
3126 	 * If we are running in the host, there is no need for a custom
3127 	 * allocator, since PASIDs are allocated system-wide by the host.
3128 	 */
3129 	if (!cap_caching_mode(iommu->cap))
3130 		return;
3131 
3132 	if (!sm_supported(iommu)) {
3133 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3134 		return;
3135 	}
3136 
3137 	/*
3138 	 * Register a custom PASID allocator if we are running in a guest,
3139 	 * where guest PASIDs must be obtained via the virtual command
3140 	 * interface. There can be multiple vIOMMUs in each guest but only
3141 	 * one allocator is active. All vIOMMU allocators will eventually
3142 	 * call the same host allocator.
3143 	 */
3144 	if (!vccap_pasid(iommu->vccap))
3145 		return;
3146 
3147 	pr_info("Register custom PASID allocator\n");
3148 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3149 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3150 	iommu->pasid_allocator.pdata = (void *)iommu;
3151 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3152 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3153 		/*
3154 		 * Disable scalable mode on this IOMMU if there
3155 		 * is no custom allocator. Mixing SM-capable and
3156 		 * non-SM vIOMMUs is not supported.
3157 		 */
3158 		intel_iommu_sm = 0;
3159 	}
3160 }
3161 #endif
3162 
3163 static int __init init_dmars(void)
3164 {
3165 	struct dmar_drhd_unit *drhd;
3166 	struct intel_iommu *iommu;
3167 	int ret;
3168 
3169 	/*
3170 	 * for each drhd
3171 	 *    allocate root
3172 	 *    initialize and program root entry to not present
3173 	 * endfor
3174 	 */
3175 	for_each_drhd_unit(drhd) {
3176 		/*
3177 		 * No lock is needed as this is only incremented in the
3178 		 * single-threaded kernel __init code path; all other
3179 		 * accesses are read-only.
3180 		 */
3181 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3182 			g_num_of_iommus++;
3183 			continue;
3184 		}
3185 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3186 	}
3187 
3188 	/* Preallocate enough resources for IOMMU hot-addition */
3189 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3190 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3191 
3192 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3193 			GFP_KERNEL);
3194 	if (!g_iommus) {
3195 		pr_err("Allocating global iommu array failed\n");
3196 		ret = -ENOMEM;
3197 		goto error;
3198 	}
3199 
3200 	for_each_iommu(iommu, drhd) {
3201 		if (drhd->ignored) {
3202 			iommu_disable_translation(iommu);
3203 			continue;
3204 		}
3205 
3206 		/*
3207 		 * Find the max PASID size of all IOMMUs in the system.
3208 		 * We need to ensure the system PASID table is no bigger
3209 		 * than the smallest supported size.
3210 		 */
3211 		if (pasid_supported(iommu)) {
3212 			u32 temp = 2 << ecap_pss(iommu->ecap);
3213 
3214 			intel_pasid_max_id = min_t(u32, temp,
3215 						   intel_pasid_max_id);
3216 		}
3217 
3218 		g_iommus[iommu->seq_id] = iommu;
3219 
3220 		intel_iommu_init_qi(iommu);
3221 
3222 		ret = iommu_init_domains(iommu);
3223 		if (ret)
3224 			goto free_iommu;
3225 
3226 		init_translation_status(iommu);
3227 
3228 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3229 			iommu_disable_translation(iommu);
3230 			clear_translation_pre_enabled(iommu);
3231 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3232 				iommu->name);
3233 		}
3234 
3235 		/*
3236 		 * TBD:
3237 		 * we could share the same root & context tables
3238 		 * among all IOMMUs. Need to split it later.
3239 		 */
3240 		ret = iommu_alloc_root_entry(iommu);
3241 		if (ret)
3242 			goto free_iommu;
3243 
3244 		if (translation_pre_enabled(iommu)) {
3245 			pr_info("Translation already enabled - trying to copy translation structures\n");
3246 
3247 			ret = copy_translation_tables(iommu);
3248 			if (ret) {
3249 				/*
3250 				 * We found the IOMMU with translation
3251 				 * enabled - but failed to copy over the
3252 				 * old root-entry table. Try to proceed
3253 				 * by disabling translation now and
3254 				 * allocating a clean root-entry table.
3255 				 * This might cause DMAR faults, but
3256 				 * probably the dump will still succeed.
3257 				 */
3258 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3259 				       iommu->name);
3260 				iommu_disable_translation(iommu);
3261 				clear_translation_pre_enabled(iommu);
3262 			} else {
3263 				pr_info("Copied translation tables from previous kernel for %s\n",
3264 					iommu->name);
3265 			}
3266 		}
3267 
3268 		if (!ecap_pass_through(iommu->ecap))
3269 			hw_pass_through = 0;
3270 		intel_svm_check(iommu);
3271 	}
3272 
3273 	/*
3274 	 * Now that qi is enabled on all iommus, set the root entry and flush
3275 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3276 	 * flush_context function will loop forever and the boot hangs.
3277 	 */
3278 	for_each_active_iommu(iommu, drhd) {
3279 		iommu_flush_write_buffer(iommu);
3280 #ifdef CONFIG_INTEL_IOMMU_SVM
3281 		register_pasid_allocator(iommu);
3282 #endif
3283 		iommu_set_root_entry(iommu);
3284 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3285 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3286 	}
3287 
3288 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3289 	dmar_map_gfx = 0;
3290 #endif
3291 
3292 	if (!dmar_map_gfx)
3293 		iommu_identity_mapping |= IDENTMAP_GFX;
3294 
3295 	check_tylersburg_isoch();
3296 
3297 	ret = si_domain_init(hw_pass_through);
3298 	if (ret)
3299 		goto free_iommu;
3300 
3301 	/*
3302 	 * for each drhd
3303 	 *   enable fault log
3304 	 *   global invalidate context cache
3305 	 *   global invalidate iotlb
3306 	 *   enable translation
3307 	 */
3308 	for_each_iommu(iommu, drhd) {
3309 		if (drhd->ignored) {
3310 			/*
3311 			 * we always have to disable PMRs or DMA may fail on
3312 			 * this device
3313 			 */
3314 			if (force_on)
3315 				iommu_disable_protect_mem_regions(iommu);
3316 			continue;
3317 		}
3318 
3319 		iommu_flush_write_buffer(iommu);
3320 
3321 #ifdef CONFIG_INTEL_IOMMU_SVM
3322 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3323 			/*
3324 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3325 			 * could cause a lock race condition.
3326 			 */
3327 			up_write(&dmar_global_lock);
3328 			ret = intel_svm_enable_prq(iommu);
3329 			down_write(&dmar_global_lock);
3330 			if (ret)
3331 				goto free_iommu;
3332 		}
3333 #endif
3334 		ret = dmar_set_interrupt(iommu);
3335 		if (ret)
3336 			goto free_iommu;
3337 	}
3338 
3339 	return 0;
3340 
3341 free_iommu:
3342 	for_each_active_iommu(iommu, drhd) {
3343 		disable_dmar_iommu(iommu);
3344 		free_dmar_iommu(iommu);
3345 	}
3346 
3347 	kfree(g_iommus);
3348 
3349 error:
3350 	return ret;
3351 }
3352 
3353 static inline int iommu_domain_cache_init(void)
3354 {
3355 	int ret = 0;
3356 
3357 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3358 					 sizeof(struct dmar_domain),
3359 					 0,
3360 					 SLAB_HWCACHE_ALIGN,
3362 					 NULL);
3363 	if (!iommu_domain_cache) {
3364 		pr_err("Couldn't create iommu_domain cache\n");
3365 		ret = -ENOMEM;
3366 	}
3367 
3368 	return ret;
3369 }
3370 
3371 static inline int iommu_devinfo_cache_init(void)
3372 {
3373 	int ret = 0;
3374 
3375 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3376 					 sizeof(struct device_domain_info),
3377 					 0,
3378 					 SLAB_HWCACHE_ALIGN,
3379 					 NULL);
3380 	if (!iommu_devinfo_cache) {
3381 		pr_err("Couldn't create devinfo cache\n");
3382 		ret = -ENOMEM;
3383 	}
3384 
3385 	return ret;
3386 }
3387 
3388 static int __init iommu_init_mempool(void)
3389 {
3390 	int ret;
3391 	ret = iova_cache_get();
3392 	if (ret)
3393 		return ret;
3394 
3395 	ret = iommu_domain_cache_init();
3396 	if (ret)
3397 		goto domain_error;
3398 
3399 	ret = iommu_devinfo_cache_init();
3400 	if (!ret)
3401 		return ret;
3402 
3403 	kmem_cache_destroy(iommu_domain_cache);
3404 domain_error:
3405 	iova_cache_put();
3406 
3407 	return -ENOMEM;
3408 }
3409 
3410 static void __init iommu_exit_mempool(void)
3411 {
3412 	kmem_cache_destroy(iommu_devinfo_cache);
3413 	kmem_cache_destroy(iommu_domain_cache);
3414 	iova_cache_put();
3415 }
3416 
3417 static void __init init_no_remapping_devices(void)
3418 {
3419 	struct dmar_drhd_unit *drhd;
3420 	struct device *dev;
3421 	int i;
3422 
3423 	for_each_drhd_unit(drhd) {
3424 		if (!drhd->include_all) {
3425 			for_each_active_dev_scope(drhd->devices,
3426 						  drhd->devices_cnt, i, dev)
3427 				break;
3428 			/* ignore DMAR unit if no devices exist */
3429 			if (i == drhd->devices_cnt)
3430 				drhd->ignored = 1;
3431 		}
3432 	}
3433 
3434 	for_each_active_drhd_unit(drhd) {
3435 		if (drhd->include_all)
3436 			continue;
3437 
3438 		for_each_active_dev_scope(drhd->devices,
3439 					  drhd->devices_cnt, i, dev)
3440 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3441 				break;
3442 		if (i < drhd->devices_cnt)
3443 			continue;
3444 
3445 		/* This IOMMU has *only* gfx devices. Either bypass it or
3446 		   mark it as graphics-dedicated, as appropriate */
3447 		drhd->gfx_dedicated = 1;
3448 		if (!dmar_map_gfx)
3449 			drhd->ignored = 1;
3450 	}
3451 }
3452 
3453 #ifdef CONFIG_SUSPEND
3454 static int init_iommu_hw(void)
3455 {
3456 	struct dmar_drhd_unit *drhd;
3457 	struct intel_iommu *iommu = NULL;
3458 
3459 	for_each_active_iommu(iommu, drhd)
3460 		if (iommu->qi)
3461 			dmar_reenable_qi(iommu);
3462 
3463 	for_each_iommu(iommu, drhd) {
3464 		if (drhd->ignored) {
3465 			/*
3466 			 * we always have to disable PMRs or DMA may fail on
3467 			 * this device
3468 			 */
3469 			if (force_on)
3470 				iommu_disable_protect_mem_regions(iommu);
3471 			continue;
3472 		}
3473 
3474 		iommu_flush_write_buffer(iommu);
3475 
3476 		iommu_set_root_entry(iommu);
3477 
3478 		iommu->flush.flush_context(iommu, 0, 0, 0,
3479 					   DMA_CCMD_GLOBAL_INVL);
3480 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3481 		iommu_enable_translation(iommu);
3482 		iommu_disable_protect_mem_regions(iommu);
3483 	}
3484 
3485 	return 0;
3486 }
3487 
3488 static void iommu_flush_all(void)
3489 {
3490 	struct dmar_drhd_unit *drhd;
3491 	struct intel_iommu *iommu;
3492 
3493 	for_each_active_iommu(iommu, drhd) {
3494 		iommu->flush.flush_context(iommu, 0, 0, 0,
3495 					   DMA_CCMD_GLOBAL_INVL);
3496 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3497 					 DMA_TLB_GLOBAL_FLUSH);
3498 	}
3499 }
3500 
3501 static int iommu_suspend(void)
3502 {
3503 	struct dmar_drhd_unit *drhd;
3504 	struct intel_iommu *iommu = NULL;
3505 	unsigned long flag;
3506 
3507 	for_each_active_iommu(iommu, drhd) {
3508 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3509 					     GFP_KERNEL);
3510 		if (!iommu->iommu_state)
3511 			goto nomem;
3512 	}
3513 
3514 	iommu_flush_all();
3515 
3516 	for_each_active_iommu(iommu, drhd) {
3517 		iommu_disable_translation(iommu);
3518 
3519 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3520 
3521 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3522 			readl(iommu->reg + DMAR_FECTL_REG);
3523 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3524 			readl(iommu->reg + DMAR_FEDATA_REG);
3525 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3526 			readl(iommu->reg + DMAR_FEADDR_REG);
3527 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3528 			readl(iommu->reg + DMAR_FEUADDR_REG);
3529 
3530 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3531 	}
3532 	return 0;
3533 
3534 nomem:
3535 	for_each_active_iommu(iommu, drhd)
3536 		kfree(iommu->iommu_state);
3537 
3538 	return -ENOMEM;
3539 }
3540 
3541 static void iommu_resume(void)
3542 {
3543 	struct dmar_drhd_unit *drhd;
3544 	struct intel_iommu *iommu = NULL;
3545 	unsigned long flag;
3546 
3547 	if (init_iommu_hw()) {
3548 		if (force_on)
3549 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3550 		else
3551 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3552 		return;
3553 	}
3554 
3555 	for_each_active_iommu(iommu, drhd) {
3556 
3557 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3558 
3559 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3560 			iommu->reg + DMAR_FECTL_REG);
3561 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3562 			iommu->reg + DMAR_FEDATA_REG);
3563 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3564 			iommu->reg + DMAR_FEADDR_REG);
3565 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3566 			iommu->reg + DMAR_FEUADDR_REG);
3567 
3568 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3569 	}
3570 
3571 	for_each_active_iommu(iommu, drhd)
3572 		kfree(iommu->iommu_state);
3573 }
3574 
3575 static struct syscore_ops iommu_syscore_ops = {
3576 	.resume		= iommu_resume,
3577 	.suspend	= iommu_suspend,
3578 };
3579 
3580 static void __init init_iommu_pm_ops(void)
3581 {
3582 	register_syscore_ops(&iommu_syscore_ops);
3583 }
3584 
3585 #else
3586 static inline void init_iommu_pm_ops(void) {}
3587 #endif	/* CONFIG_SUSPEND */
3588 
3589 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3590 {
3591 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3592 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3593 	    rmrr->end_address <= rmrr->base_address ||
3594 	    arch_rmrr_sanity_check(rmrr))
3595 		return -EINVAL;
3596 
3597 	return 0;
3598 }
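
/*
 * Examples of the checks above, assuming 4KiB pages (illustrative only):
 *
 *	base = 0xdd800000, end = 0xddffffff  ->  accepted
 *	  (base and end + 1 are page aligned, end > base)
 *	base = 0xdd800000, end = 0xde000000  ->  rejected
 *	  (end + 1 is not page aligned; end should name the last byte)
 */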
3599 
3600 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3601 {
3602 	struct acpi_dmar_reserved_memory *rmrr;
3603 	struct dmar_rmrr_unit *rmrru;
3604 
3605 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3606 	if (rmrr_sanity_check(rmrr)) {
3607 		pr_warn(FW_BUG
3608 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3609 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3610 			   rmrr->base_address, rmrr->end_address,
3611 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3612 			   dmi_get_system_info(DMI_BIOS_VERSION),
3613 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3614 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3615 	}
3616 
3617 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3618 	if (!rmrru)
3619 		goto out;
3620 
3621 	rmrru->hdr = header;
3622 
3623 	rmrru->base_address = rmrr->base_address;
3624 	rmrru->end_address = rmrr->end_address;
3625 
3626 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3627 				((void *)rmrr) + rmrr->header.length,
3628 				&rmrru->devices_cnt);
3629 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3630 		goto free_rmrru;
3631 
3632 	list_add(&rmrru->list, &dmar_rmrr_units);
3633 
3634 	return 0;
3635 free_rmrru:
3636 	kfree(rmrru);
3637 out:
3638 	return -ENOMEM;
3639 }
3640 
3641 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3642 {
3643 	struct dmar_atsr_unit *atsru;
3644 	struct acpi_dmar_atsr *tmp;
3645 
3646 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3647 				dmar_rcu_check()) {
3648 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3649 		if (atsr->segment != tmp->segment)
3650 			continue;
3651 		if (atsr->header.length != tmp->header.length)
3652 			continue;
3653 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3654 			return atsru;
3655 	}
3656 
3657 	return NULL;
3658 }
3659 
3660 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3661 {
3662 	struct acpi_dmar_atsr *atsr;
3663 	struct dmar_atsr_unit *atsru;
3664 
3665 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3666 		return 0;
3667 
3668 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3669 	atsru = dmar_find_atsr(atsr);
3670 	if (atsru)
3671 		return 0;
3672 
3673 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3674 	if (!atsru)
3675 		return -ENOMEM;
3676 
3677 	/*
3678 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3679 	 * copy the memory content because the memory buffer will be freed
3680 	 * on return.
3681 	 */
3682 	atsru->hdr = (void *)(atsru + 1);
3683 	memcpy(atsru->hdr, hdr, hdr->length);
3684 	atsru->include_all = atsr->flags & 0x1;
3685 	if (!atsru->include_all) {
3686 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3687 				(void *)atsr + atsr->header.length,
3688 				&atsru->devices_cnt);
3689 		if (atsru->devices_cnt && atsru->devices == NULL) {
3690 			kfree(atsru);
3691 			return -ENOMEM;
3692 		}
3693 	}
3694 
3695 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3696 
3697 	return 0;
3698 }
3699 
3700 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3701 {
3702 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3703 	kfree(atsru);
3704 }
3705 
3706 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3707 {
3708 	struct acpi_dmar_atsr *atsr;
3709 	struct dmar_atsr_unit *atsru;
3710 
3711 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3712 	atsru = dmar_find_atsr(atsr);
3713 	if (atsru) {
3714 		list_del_rcu(&atsru->list);
3715 		synchronize_rcu();
3716 		intel_iommu_free_atsr(atsru);
3717 	}
3718 
3719 	return 0;
3720 }
3721 
3722 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3723 {
3724 	int i;
3725 	struct device *dev;
3726 	struct acpi_dmar_atsr *atsr;
3727 	struct dmar_atsr_unit *atsru;
3728 
3729 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3730 	atsru = dmar_find_atsr(atsr);
3731 	if (!atsru)
3732 		return 0;
3733 
3734 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3735 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3736 					  i, dev)
3737 			return -EBUSY;
3738 	}
3739 
3740 	return 0;
3741 }
3742 
3743 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3744 {
3745 	int sp, ret;
3746 	struct intel_iommu *iommu = dmaru->iommu;
3747 
3748 	if (g_iommus[iommu->seq_id])
3749 		return 0;
3750 
3751 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3752 		pr_warn("%s: Doesn't support hardware pass through.\n",
3753 			iommu->name);
3754 		return -ENXIO;
3755 	}
3756 	if (!ecap_sc_support(iommu->ecap) &&
3757 	    domain_update_iommu_snooping(iommu)) {
3758 		pr_warn("%s: Doesn't support snooping.\n",
3759 			iommu->name);
3760 		return -ENXIO;
3761 	}
3762 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3763 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3764 		pr_warn("%s: Doesn't support large page.\n",
3765 			iommu->name);
3766 		return -ENXIO;
3767 	}
3768 
3769 	/*
3770 	 * Disable translation if already enabled prior to OS handover.
3771 	 */
3772 	if (iommu->gcmd & DMA_GCMD_TE)
3773 		iommu_disable_translation(iommu);
3774 
3775 	g_iommus[iommu->seq_id] = iommu;
3776 	ret = iommu_init_domains(iommu);
3777 	if (ret == 0)
3778 		ret = iommu_alloc_root_entry(iommu);
3779 	if (ret)
3780 		goto out;
3781 
3782 	intel_svm_check(iommu);
3783 
3784 	if (dmaru->ignored) {
3785 		/*
3786 		 * we always have to disable PMRs or DMA may fail on this device
3787 		 */
3788 		if (force_on)
3789 			iommu_disable_protect_mem_regions(iommu);
3790 		return 0;
3791 	}
3792 
3793 	intel_iommu_init_qi(iommu);
3794 	iommu_flush_write_buffer(iommu);
3795 
3796 #ifdef CONFIG_INTEL_IOMMU_SVM
3797 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3798 		ret = intel_svm_enable_prq(iommu);
3799 		if (ret)
3800 			goto disable_iommu;
3801 	}
3802 #endif
3803 	ret = dmar_set_interrupt(iommu);
3804 	if (ret)
3805 		goto disable_iommu;
3806 
3807 	iommu_set_root_entry(iommu);
3808 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3809 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3810 	iommu_enable_translation(iommu);
3811 
3812 	iommu_disable_protect_mem_regions(iommu);
3813 	return 0;
3814 
3815 disable_iommu:
3816 	disable_dmar_iommu(iommu);
3817 out:
3818 	free_dmar_iommu(iommu);
3819 	return ret;
3820 }
3821 
3822 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3823 {
3824 	int ret = 0;
3825 	struct intel_iommu *iommu = dmaru->iommu;
3826 
3827 	if (!intel_iommu_enabled)
3828 		return 0;
3829 	if (iommu == NULL)
3830 		return -EINVAL;
3831 
3832 	if (insert) {
3833 		ret = intel_iommu_add(dmaru);
3834 	} else {
3835 		disable_dmar_iommu(iommu);
3836 		free_dmar_iommu(iommu);
3837 	}
3838 
3839 	return ret;
3840 }
3841 
3842 static void intel_iommu_free_dmars(void)
3843 {
3844 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3845 	struct dmar_atsr_unit *atsru, *atsr_n;
3846 
3847 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3848 		list_del(&rmrru->list);
3849 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3850 		kfree(rmrru);
3851 	}
3852 
3853 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3854 		list_del(&atsru->list);
3855 		intel_iommu_free_atsr(atsru);
3856 	}
3857 }
3858 
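/*
 * Return 1 if ATS is allowed for @dev, i.e. the device is integrated or its
 * root port is covered by an ATSR (or an include-all ATSR), 0 otherwise.
 */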
3859 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3860 {
3861 	int i, ret = 1;
3862 	struct pci_bus *bus;
3863 	struct pci_dev *bridge = NULL;
3864 	struct device *tmp;
3865 	struct acpi_dmar_atsr *atsr;
3866 	struct dmar_atsr_unit *atsru;
3867 
3868 	dev = pci_physfn(dev);
3869 	for (bus = dev->bus; bus; bus = bus->parent) {
3870 		bridge = bus->self;
3871 		/* If it's an integrated device, allow ATS */
3872 		if (!bridge)
3873 			return 1;
3874 		/* Connected via non-PCIe: no ATS */
3875 		if (!pci_is_pcie(bridge) ||
3876 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3877 			return 0;
3878 		/* If we found the root port, look it up in the ATSR */
3879 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3880 			break;
3881 	}
3882 
3883 	rcu_read_lock();
3884 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3885 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3886 		if (atsr->segment != pci_domain_nr(dev->bus))
3887 			continue;
3888 
3889 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3890 			if (tmp == &bridge->dev)
3891 				goto out;
3892 
3893 		if (atsru->include_all)
3894 			goto out;
3895 	}
3896 	ret = 0;
3897 out:
3898 	rcu_read_unlock();
3899 
3900 	return ret;
3901 }
3902 
3903 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3904 {
3905 	int ret;
3906 	struct dmar_rmrr_unit *rmrru;
3907 	struct dmar_atsr_unit *atsru;
3908 	struct acpi_dmar_atsr *atsr;
3909 	struct acpi_dmar_reserved_memory *rmrr;
3910 
3911 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3912 		return 0;
3913 
3914 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3915 		rmrr = container_of(rmrru->hdr,
3916 				    struct acpi_dmar_reserved_memory, header);
3917 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3918 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3919 				((void *)rmrr) + rmrr->header.length,
3920 				rmrr->segment, rmrru->devices,
3921 				rmrru->devices_cnt);
3922 			if (ret < 0)
3923 				return ret;
3924 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3925 			dmar_remove_dev_scope(info, rmrr->segment,
3926 				rmrru->devices, rmrru->devices_cnt);
3927 		}
3928 	}
3929 
3930 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3931 		if (atsru->include_all)
3932 			continue;
3933 
3934 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3935 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3936 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3937 					(void *)atsr + atsr->header.length,
3938 					atsr->segment, atsru->devices,
3939 					atsru->devices_cnt);
3940 			if (ret > 0)
3941 				break;
3942 			else if (ret < 0)
3943 				return ret;
3944 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3945 			if (dmar_remove_dev_scope(info, atsr->segment,
3946 					atsru->devices, atsru->devices_cnt))
3947 				break;
3948 		}
3949 	}
3950 
3951 	return 0;
3952 }
3953 
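/*
 * Keep the si_domain identity map in sync with memory hotplug: map ranges
 * that are going online and unmap (and flush the IOTLB for) ranges that go
 * offline again.
 */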
3954 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3955 				       unsigned long val, void *v)
3956 {
3957 	struct memory_notify *mhp = v;
3958 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3959 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3960 			mhp->nr_pages - 1);
3961 
3962 	switch (val) {
3963 	case MEM_GOING_ONLINE:
3964 		if (iommu_domain_identity_map(si_domain,
3965 					      start_vpfn, last_vpfn)) {
3966 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3967 				start_vpfn, last_vpfn);
3968 			return NOTIFY_BAD;
3969 		}
3970 		break;
3971 
3972 	case MEM_OFFLINE:
3973 	case MEM_CANCEL_ONLINE:
3974 		{
3975 			struct dmar_drhd_unit *drhd;
3976 			struct intel_iommu *iommu;
3977 			struct page *freelist;
3978 
3979 			freelist = domain_unmap(si_domain,
3980 						start_vpfn, last_vpfn,
3981 						NULL);
3982 
3983 			rcu_read_lock();
3984 			for_each_active_iommu(iommu, drhd)
3985 				iommu_flush_iotlb_psi(iommu, si_domain,
3986 					start_vpfn, mhp->nr_pages,
3987 					!freelist, 0);
3988 			rcu_read_unlock();
3989 			dma_free_pagelist(freelist);
3990 		}
3991 		break;
3992 	}
3993 
3994 	return NOTIFY_OK;
3995 }
3996 
3997 static struct notifier_block intel_iommu_memory_nb = {
3998 	.notifier_call = intel_iommu_memory_notifier,
3999 	.priority = 0
4000 };
4001 
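/*
 * Free the IOVAs cached on @cpu for every DMA API domain; called when the
 * CPU goes offline so its per-CPU rcaches don't pin IOVA space.
 */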
4002 static void free_all_cpu_cached_iovas(unsigned int cpu)
4003 {
4004 	int i;
4005 
4006 	for (i = 0; i < g_num_of_iommus; i++) {
4007 		struct intel_iommu *iommu = g_iommus[i];
4008 		struct dmar_domain *domain;
4009 		int did;
4010 
4011 		if (!iommu)
4012 			continue;
4013 
4014 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4015 			domain = get_iommu_domain(iommu, (u16)did);
4016 
4017 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4018 				continue;
4019 
4020 			iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4021 		}
4022 	}
4023 }
4024 
4025 static int intel_iommu_cpu_dead(unsigned int cpu)
4026 {
4027 	free_all_cpu_cached_iovas(cpu);
4028 	return 0;
4029 }
4030 
4031 static void intel_disable_iommus(void)
4032 {
4033 	struct intel_iommu *iommu = NULL;
4034 	struct dmar_drhd_unit *drhd;
4035 
4036 	for_each_iommu(iommu, drhd)
4037 		iommu_disable_translation(iommu);
4038 }
4039 
4040 void intel_iommu_shutdown(void)
4041 {
4042 	struct dmar_drhd_unit *drhd;
4043 	struct intel_iommu *iommu = NULL;
4044 
4045 	if (no_iommu || dmar_disabled)
4046 		return;
4047 
4048 	down_write(&dmar_global_lock);
4049 
4050 	/* Disable PMRs explicitly here. */
4051 	for_each_iommu(iommu, drhd)
4052 		iommu_disable_protect_mem_regions(iommu);
4053 
4054 	/* Make sure the IOMMUs are switched off */
4055 	intel_disable_iommus();
4056 
4057 	up_write(&dmar_global_lock);
4058 }
4059 
4060 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4061 {
4062 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4063 
4064 	return container_of(iommu_dev, struct intel_iommu, iommu);
4065 }
4066 
4067 static ssize_t intel_iommu_show_version(struct device *dev,
4068 					struct device_attribute *attr,
4069 					char *buf)
4070 {
4071 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4072 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4073 	return sprintf(buf, "%d:%d\n",
4074 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4075 }
4076 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4077 
4078 static ssize_t intel_iommu_show_address(struct device *dev,
4079 					struct device_attribute *attr,
4080 					char *buf)
4081 {
4082 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4083 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4084 }
4085 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4086 
4087 static ssize_t intel_iommu_show_cap(struct device *dev,
4088 				    struct device_attribute *attr,
4089 				    char *buf)
4090 {
4091 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4092 	return sprintf(buf, "%llx\n", iommu->cap);
4093 }
4094 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4095 
4096 static ssize_t intel_iommu_show_ecap(struct device *dev,
4097 				    struct device_attribute *attr,
4098 				    char *buf)
4099 {
4100 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4101 	return sprintf(buf, "%llx\n", iommu->ecap);
4102 }
4103 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4104 
4105 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4106 				      struct device_attribute *attr,
4107 				      char *buf)
4108 {
4109 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4110 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4111 }
4112 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4113 
4114 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4115 					   struct device_attribute *attr,
4116 					   char *buf)
4117 {
4118 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4119 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4120 						  cap_ndoms(iommu->cap)));
4121 }
4122 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4123 
4124 static struct attribute *intel_iommu_attrs[] = {
4125 	&dev_attr_version.attr,
4126 	&dev_attr_address.attr,
4127 	&dev_attr_cap.attr,
4128 	&dev_attr_ecap.attr,
4129 	&dev_attr_domains_supported.attr,
4130 	&dev_attr_domains_used.attr,
4131 	NULL,
4132 };
4133 
4134 static struct attribute_group intel_iommu_group = {
4135 	.name = "intel-iommu",
4136 	.attrs = intel_iommu_attrs,
4137 };
4138 
4139 const struct attribute_group *intel_iommu_groups[] = {
4140 	&intel_iommu_group,
4141 	NULL,
4142 };
4143 
4144 static inline bool has_external_pci(void)
4145 {
4146 	struct pci_dev *pdev = NULL;
4147 
4148 	for_each_pci_dev(pdev)
4149 		if (pdev->external_facing)
4150 			return true;
4151 
4152 	return false;
4153 }
4154 
4155 static int __init platform_optin_force_iommu(void)
4156 {
4157 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4158 		return 0;
4159 
4160 	if (no_iommu || dmar_disabled)
4161 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4162 
4163 	/*
4164 	 * If Intel-IOMMU is disabled by default, we will apply identity
4165 	 * map for all devices except those marked as being untrusted.
4166 	 */
4167 	if (dmar_disabled)
4168 		iommu_set_default_passthrough(false);
4169 
4170 	dmar_disabled = 0;
4171 	no_iommu = 0;
4172 
4173 	return 1;
4174 }
4175 
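/*
 * Probe the physical devices behind the ACPI namespace entries in the DRHD
 * device scopes so that they get IOMMU groups just like PCI devices.
 */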
4176 static int __init probe_acpi_namespace_devices(void)
4177 {
4178 	struct dmar_drhd_unit *drhd;
4179 	/* To avoid a -Wunused-but-set-variable warning. */
4180 	struct intel_iommu *iommu __maybe_unused;
4181 	struct device *dev;
4182 	int i, ret = 0;
4183 
4184 	for_each_active_iommu(iommu, drhd) {
4185 		for_each_active_dev_scope(drhd->devices,
4186 					  drhd->devices_cnt, i, dev) {
4187 			struct acpi_device_physical_node *pn;
4188 			struct iommu_group *group;
4189 			struct acpi_device *adev;
4190 
4191 			if (dev->bus != &acpi_bus_type)
4192 				continue;
4193 
4194 			adev = to_acpi_device(dev);
4195 			mutex_lock(&adev->physical_node_lock);
4196 			list_for_each_entry(pn,
4197 					    &adev->physical_node_list, node) {
4198 				group = iommu_group_get(pn->dev);
4199 				if (group) {
4200 					iommu_group_put(group);
4201 					continue;
4202 				}
4203 
4204 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4205 				ret = iommu_probe_device(pn->dev);
4206 				if (ret)
4207 					break;
4208 			}
4209 			mutex_unlock(&adev->physical_node_lock);
4210 
4211 			if (ret)
4212 				return ret;
4213 		}
4214 	}
4215 
4216 	return 0;
4217 }
4218 
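/*
 * Main entry point of DMA remapping initialization: parse the DMAR tables,
 * bring up every IOMMU, register with the IOMMU core and finally enable
 * translation.
 */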
4219 int __init intel_iommu_init(void)
4220 {
4221 	int ret = -ENODEV;
4222 	struct dmar_drhd_unit *drhd;
4223 	struct intel_iommu *iommu;
4224 
4225 	/*
4226 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4227 	 * opt in, so enforce that.
4228 	 */
4229 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4230 		    platform_optin_force_iommu();
4231 
4232 	if (iommu_init_mempool()) {
4233 		if (force_on)
4234 			panic("tboot: Failed to initialize iommu memory\n");
4235 		return -ENOMEM;
4236 	}
4237 
4238 	down_write(&dmar_global_lock);
4239 	if (dmar_table_init()) {
4240 		if (force_on)
4241 			panic("tboot: Failed to initialize DMAR table\n");
4242 		goto out_free_dmar;
4243 	}
4244 
4245 	if (dmar_dev_scope_init() < 0) {
4246 		if (force_on)
4247 			panic("tboot: Failed to initialize DMAR device scope\n");
4248 		goto out_free_dmar;
4249 	}
4250 
4251 	up_write(&dmar_global_lock);
4252 
4253 	/*
4254 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4255 	 * complain later when we register it under the lock.
4256 	 */
4257 	dmar_register_bus_notifier();
4258 
4259 	down_write(&dmar_global_lock);
4260 
4261 	if (!no_iommu)
4262 		intel_iommu_debugfs_init();
4263 
4264 	if (no_iommu || dmar_disabled) {
4265 		/*
4266 		 * We exit the function here to ensure that the IOMMU's remapping and
4267 		 * mempool aren't set up, which means that the IOMMU's PMRs
4268 		 * won't be disabled via the call to init_dmars(). So disable
4269 		 * them explicitly here. The PMRs were set up by tboot prior to
4270 		 * calling SENTER, but the kernel is expected to reset/tear
4271 		 * down the PMRs.
4272 		 */
4273 		if (intel_iommu_tboot_noforce) {
4274 			for_each_iommu(iommu, drhd)
4275 				iommu_disable_protect_mem_regions(iommu);
4276 		}
4277 
4278 		/*
4279 		 * Make sure the IOMMUs are switched off, even when we
4280 		 * boot into a kexec kernel and the previous kernel left
4281 		 * them enabled
4282 		 */
4283 		intel_disable_iommus();
4284 		goto out_free_dmar;
4285 	}
4286 
4287 	if (list_empty(&dmar_rmrr_units))
4288 		pr_info("No RMRR found\n");
4289 
4290 	if (list_empty(&dmar_atsr_units))
4291 		pr_info("No ATSR found\n");
4292 
4293 	if (dmar_map_gfx)
4294 		intel_iommu_gfx_mapped = 1;
4295 
4296 	init_no_remapping_devices();
4297 
4298 	ret = init_dmars();
4299 	if (ret) {
4300 		if (force_on)
4301 			panic("tboot: Failed to initialize DMARs\n");
4302 		pr_err("Initialization failed\n");
4303 		goto out_free_dmar;
4304 	}
4305 	up_write(&dmar_global_lock);
4306 
4307 	init_iommu_pm_ops();
4308 
4309 	down_read(&dmar_global_lock);
4310 	for_each_active_iommu(iommu, drhd) {
4311 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4312 				       intel_iommu_groups,
4313 				       "%s", iommu->name);
4314 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4315 		iommu_device_register(&iommu->iommu);
4316 	}
4317 	up_read(&dmar_global_lock);
4318 
4319 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4320 	if (si_domain && !hw_pass_through)
4321 		register_memory_notifier(&intel_iommu_memory_nb);
4322 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4323 			  intel_iommu_cpu_dead);
4324 
4325 	down_read(&dmar_global_lock);
4326 	if (probe_acpi_namespace_devices())
4327 		pr_warn("ACPI name space devices didn't probe correctly\n");
4328 
4329 	/* Finally, we enable the DMA remapping hardware. */
4330 	for_each_iommu(iommu, drhd) {
4331 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4332 			iommu_enable_translation(iommu);
4333 
4334 		iommu_disable_protect_mem_regions(iommu);
4335 	}
4336 	up_read(&dmar_global_lock);
4337 
4338 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4339 
4340 	intel_iommu_enabled = 1;
4341 
4342 	return 0;
4343 
4344 out_free_dmar:
4345 	intel_iommu_free_dmars();
4346 	up_write(&dmar_global_lock);
4347 	iommu_exit_mempool();
4348 	return ret;
4349 }
4350 
4351 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4352 {
4353 	struct intel_iommu *iommu = opaque;
4354 
4355 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4356 	return 0;
4357 }
4358 
4359 /*
4360  * NB - intel-iommu lacks any sort of reference counting for the users of
4361  * dependent devices.  If multiple endpoints have intersecting dependent
4362  * devices, unbinding the driver from any one of them will possibly leave
4363  * the others unable to operate.
4364  */
4365 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4366 {
4367 	if (!iommu || !dev || !dev_is_pci(dev))
4368 		return;
4369 
4370 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4371 }
4372 
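/*
 * Tear down all translation state for @info: the RID2PASID entry (scalable
 * mode), the device TLB, the context entries and finally the domain<->iommu
 * attachment. Caller must hold device_domain_lock.
 */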
4373 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4374 {
4375 	struct dmar_domain *domain;
4376 	struct intel_iommu *iommu;
4377 	unsigned long flags;
4378 
4379 	assert_spin_locked(&device_domain_lock);
4380 
4381 	if (WARN_ON(!info))
4382 		return;
4383 
4384 	iommu = info->iommu;
4385 	domain = info->domain;
4386 
4387 	if (info->dev) {
4388 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4389 			intel_pasid_tear_down_entry(iommu, info->dev,
4390 					PASID_RID2PASID, false);
4391 
4392 		iommu_disable_dev_iotlb(info);
4393 		if (!dev_is_real_dma_subdevice(info->dev))
4394 			domain_context_clear(iommu, info->dev);
4395 		intel_pasid_free_table(info->dev);
4396 	}
4397 
4398 	unlink_domain_info(info);
4399 
4400 	spin_lock_irqsave(&iommu->lock, flags);
4401 	domain_detach_iommu(domain, iommu);
4402 	spin_unlock_irqrestore(&iommu->lock, flags);
4403 
4404 	free_devinfo_mem(info);
4405 }
4406 
4407 static void dmar_remove_one_dev_info(struct device *dev)
4408 {
4409 	struct device_domain_info *info;
4410 	unsigned long flags;
4411 
4412 	spin_lock_irqsave(&device_domain_lock, flags);
4413 	info = get_domain_info(dev);
4414 	if (info)
4415 		__dmar_remove_one_dev_info(info);
4416 	spin_unlock_irqrestore(&device_domain_lock, flags);
4417 }
4418 
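/*
 * Initialize a domain allocated through the IOMMU API: compute the AGAW for
 * the requested guest address width and allocate the top level page table.
 */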
4419 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4420 {
4421 	int adjust_width;
4422 
4423 	/* calculate AGAW */
4424 	domain->gaw = guest_width;
4425 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4426 	domain->agaw = width_to_agaw(adjust_width);
4427 
4428 	domain->iommu_coherency = 0;
4429 	domain->iommu_snooping = 0;
4430 	domain->iommu_superpage = 0;
4431 	domain->max_addr = 0;
4432 
4433 	/* always allocate the top pgd */
4434 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4435 	if (!domain->pgd)
4436 		return -ENOMEM;
4437 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4438 	return 0;
4439 }
4440 
4441 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4442 {
4443 	struct dmar_domain *dmar_domain;
4444 	struct iommu_domain *domain;
4445 
4446 	switch (type) {
4447 	case IOMMU_DOMAIN_DMA:
4448 	case IOMMU_DOMAIN_UNMANAGED:
4449 		dmar_domain = alloc_domain(0);
4450 		if (!dmar_domain) {
4451 			pr_err("Can't allocate dmar_domain\n");
4452 			return NULL;
4453 		}
4454 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4455 			pr_err("Domain initialization failed\n");
4456 			domain_exit(dmar_domain);
4457 			return NULL;
4458 		}
4459 
4460 		if (type == IOMMU_DOMAIN_DMA &&
4461 		    iommu_get_dma_cookie(&dmar_domain->domain))
4462 			return NULL;
4463 
4464 		domain = &dmar_domain->domain;
4465 		domain->geometry.aperture_start = 0;
4466 		domain->geometry.aperture_end   =
4467 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4468 		domain->geometry.force_aperture = true;
4469 
4470 		return domain;
4471 	case IOMMU_DOMAIN_IDENTITY:
4472 		return &si_domain->domain;
4473 	default:
4474 		return NULL;
4475 	}
4476 
4477 	return NULL;
4478 }
4479 
4480 static void intel_iommu_domain_free(struct iommu_domain *domain)
4481 {
4482 	if (domain != &si_domain->domain)
4483 		domain_exit(to_dmar_domain(domain));
4484 }
4485 
4486 /*
4487  * Check whether a @domain could be attached to the @dev through the
4488  * aux-domain attach/detach APIs.
4489  */
4490 static inline bool
4491 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4492 {
4493 	struct device_domain_info *info = get_domain_info(dev);
4494 
4495 	return info && info->auxd_enabled &&
4496 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4497 }
4498 
4499 static inline struct subdev_domain_info *
4500 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4501 {
4502 	struct subdev_domain_info *sinfo;
4503 
4504 	if (!list_empty(&domain->subdevices)) {
4505 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4506 			if (sinfo->pdev == dev)
4507 				return sinfo;
4508 		}
4509 	}
4510 
4511 	return NULL;
4512 }
4513 
4514 static int auxiliary_link_device(struct dmar_domain *domain,
4515 				 struct device *dev)
4516 {
4517 	struct device_domain_info *info = get_domain_info(dev);
4518 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4519 
4520 	assert_spin_locked(&device_domain_lock);
4521 	if (WARN_ON(!info))
4522 		return -EINVAL;
4523 
4524 	if (!sinfo) {
4525 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
		/* GFP_ATOMIC allocation can fail; don't dereference a NULL sinfo */
		if (!sinfo)
			return -ENOMEM;
4526 		sinfo->domain = domain;
4527 		sinfo->pdev = dev;
4528 		list_add(&sinfo->link_phys, &info->subdevices);
4529 		list_add(&sinfo->link_domain, &domain->subdevices);
4530 	}
4531 
4532 	return ++sinfo->users;
4533 }
4534 
4535 static int auxiliary_unlink_device(struct dmar_domain *domain,
4536 				   struct device *dev)
4537 {
4538 	struct device_domain_info *info = get_domain_info(dev);
4539 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4540 	int ret;
4541 
4542 	assert_spin_locked(&device_domain_lock);
4543 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4544 		return -EINVAL;
4545 
4546 	ret = --sinfo->users;
4547 	if (!ret) {
4548 		list_del(&sinfo->link_phys);
4549 		list_del(&sinfo->link_domain);
4550 		kfree(sinfo);
4551 	}
4552 
4553 	return ret;
4554 }
4555 
4556 static int aux_domain_add_dev(struct dmar_domain *domain,
4557 			      struct device *dev)
4558 {
4559 	int ret;
4560 	unsigned long flags;
4561 	struct intel_iommu *iommu;
4562 
4563 	iommu = device_to_iommu(dev, NULL, NULL);
4564 	if (!iommu)
4565 		return -ENODEV;
4566 
4567 	if (domain->default_pasid <= 0) {
4568 		u32 pasid;
4569 
4570 		/* No private data needed for the default pasid */
4571 		pasid = ioasid_alloc(NULL, PASID_MIN,
4572 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4573 				     NULL);
4574 		if (pasid == INVALID_IOASID) {
4575 			pr_err("Can't allocate default pasid\n");
4576 			return -ENODEV;
4577 		}
4578 		domain->default_pasid = pasid;
4579 	}
4580 
4581 	spin_lock_irqsave(&device_domain_lock, flags);
4582 	ret = auxiliary_link_device(domain, dev);
4583 	if (ret <= 0)
4584 		goto link_failed;
4585 
4586 	/*
4587 	 * Subdevices from the same physical device can be attached to the
4588 	 * same domain. For such cases, only the first subdevice attachment
4589 	 * needs to go through the full steps in this function. So if ret >
4590 	 * 1, just goto out.
4591 	 */
4592 	if (ret > 1)
4593 		goto out;
4594 
4595 	/*
4596 	 * iommu->lock must be held to attach domain to iommu and setup the
4597 	 * pasid entry for second level translation.
4598 	 */
4599 	spin_lock(&iommu->lock);
4600 	ret = domain_attach_iommu(domain, iommu);
4601 	if (ret)
4602 		goto attach_failed;
4603 
4604 	/* Setup the PASID entry for mediated devices: */
4605 	if (domain_use_first_level(domain))
4606 		ret = domain_setup_first_level(iommu, domain, dev,
4607 					       domain->default_pasid);
4608 	else
4609 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4610 						     domain->default_pasid);
4611 	if (ret)
4612 		goto table_failed;
4613 
4614 	spin_unlock(&iommu->lock);
4615 out:
4616 	spin_unlock_irqrestore(&device_domain_lock, flags);
4617 
4618 	return 0;
4619 
4620 table_failed:
4621 	domain_detach_iommu(domain, iommu);
4622 attach_failed:
4623 	spin_unlock(&iommu->lock);
4624 	auxiliary_unlink_device(domain, dev);
4625 link_failed:
4626 	spin_unlock_irqrestore(&device_domain_lock, flags);
4627 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4628 		ioasid_put(domain->default_pasid);
4629 
4630 	return ret;
4631 }
4632 
4633 static void aux_domain_remove_dev(struct dmar_domain *domain,
4634 				  struct device *dev)
4635 {
4636 	struct device_domain_info *info;
4637 	struct intel_iommu *iommu;
4638 	unsigned long flags;
4639 
4640 	if (!is_aux_domain(dev, &domain->domain))
4641 		return;
4642 
4643 	spin_lock_irqsave(&device_domain_lock, flags);
4644 	info = get_domain_info(dev);
4645 	iommu = info->iommu;
4646 
4647 	if (!auxiliary_unlink_device(domain, dev)) {
4648 		spin_lock(&iommu->lock);
4649 		intel_pasid_tear_down_entry(iommu, dev,
4650 					    domain->default_pasid, false);
4651 		domain_detach_iommu(domain, iommu);
4652 		spin_unlock(&iommu->lock);
4653 	}
4654 
4655 	spin_unlock_irqrestore(&device_domain_lock, flags);
4656 
4657 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4658 		ioasid_put(domain->default_pasid);
4659 }
4660 
4661 static int prepare_domain_attach_device(struct iommu_domain *domain,
4662 					struct device *dev)
4663 {
4664 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4665 	struct intel_iommu *iommu;
4666 	int addr_width;
4667 
4668 	iommu = device_to_iommu(dev, NULL, NULL);
4669 	if (!iommu)
4670 		return -ENODEV;
4671 
4672 	/* check if this iommu agaw is sufficient for max mapped address */
4673 	addr_width = agaw_to_width(iommu->agaw);
4674 	if (addr_width > cap_mgaw(iommu->cap))
4675 		addr_width = cap_mgaw(iommu->cap);
4676 
4677 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4678 		dev_err(dev, "%s: iommu width (%d) is not "
4679 		        "sufficient for the mapped address (%llx)\n",
4680 		        __func__, addr_width, dmar_domain->max_addr);
4681 		return -EFAULT;
4682 	}
4683 	dmar_domain->gaw = addr_width;
4684 
4685 	/*
4686 	 * Knock out extra levels of page tables if necessary
4687 	 */
4688 	while (iommu->agaw < dmar_domain->agaw) {
4689 		struct dma_pte *pte;
4690 
4691 		pte = dmar_domain->pgd;
4692 		if (dma_pte_present(pte)) {
4693 			dmar_domain->pgd = (struct dma_pte *)
4694 				phys_to_virt(dma_pte_addr(pte));
4695 			free_pgtable_page(pte);
4696 		}
4697 		dmar_domain->agaw--;
4698 	}
4699 
4700 	return 0;
4701 }
4702 
4703 static int intel_iommu_attach_device(struct iommu_domain *domain,
4704 				     struct device *dev)
4705 {
4706 	int ret;
4707 
4708 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4709 	    device_is_rmrr_locked(dev)) {
4710 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4711 		return -EPERM;
4712 	}
4713 
4714 	if (is_aux_domain(dev, domain))
4715 		return -EPERM;
4716 
4717 	/* normally dev is not mapped */
4718 	if (unlikely(domain_context_mapped(dev))) {
4719 		struct dmar_domain *old_domain;
4720 
4721 		old_domain = find_domain(dev);
4722 		if (old_domain)
4723 			dmar_remove_one_dev_info(dev);
4724 	}
4725 
4726 	ret = prepare_domain_attach_device(domain, dev);
4727 	if (ret)
4728 		return ret;
4729 
4730 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4731 }
4732 
4733 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4734 					 struct device *dev)
4735 {
4736 	int ret;
4737 
4738 	if (!is_aux_domain(dev, domain))
4739 		return -EPERM;
4740 
4741 	ret = prepare_domain_attach_device(domain, dev);
4742 	if (ret)
4743 		return ret;
4744 
4745 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4746 }
4747 
4748 static void intel_iommu_detach_device(struct iommu_domain *domain,
4749 				      struct device *dev)
4750 {
4751 	dmar_remove_one_dev_info(dev);
4752 }
4753 
4754 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4755 					  struct device *dev)
4756 {
4757 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4758 }
4759 
4760 #ifdef CONFIG_INTEL_IOMMU_SVM
4761 /*
4762  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4763  * VT-d granularity. Invalidation is typically included in the unmap operation
4764  * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
4765  * owns the first-level page tables. Invalidations of translation caches in the
4766  * guest are trapped and passed down to the host.
4767  *
4768  * The vIOMMU in the guest will only expose first-level page tables, therefore
4769  * we do not support IOTLB granularity for requests without PASID (second level).
4770  *
4771  * For example, to find the VT-d granularity encoding for IOTLB
4772  * type and page selective granularity within PASID:
4773  * X: indexed by iommu cache type
4774  * Y: indexed by enum iommu_inv_granularity
4775  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4776  */
4777 
4778 static const int
4779 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4780 	/*
4781 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4782 	 * page selective (address granularity)
4783 	 */
4784 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4785 	/* PASID based dev TLBs */
4786 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4787 	/* PASID cache */
4788 	{-EINVAL, -EINVAL, -EINVAL}
4789 };
4790 
4791 static inline int to_vtd_granularity(int type, int granu)
4792 {
4793 	return inv_type_granu_table[type][granu];
4794 }
4795 
4796 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4797 {
4798 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4799 
4800 	/* VT-d size is encoded as 2^size of 4K pages: 0 for 4KiB, 9 for 2MiB, etc.
4801 	 * The IOMMU cache invalidate API passes granu_size in bytes and the number
4802 	 * of granules of that size in contiguous memory.
4803 	 */
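	/* e.g. granu_size == 4KiB and nr_granules == 512 (2MiB total) yields 9 */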
4804 	return order_base_2(nr_pages);
4805 }
4806 
4807 static int
4808 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4809 			   struct iommu_cache_invalidate_info *inv_info)
4810 {
4811 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4812 	struct device_domain_info *info;
4813 	struct intel_iommu *iommu;
4814 	unsigned long flags;
4815 	int cache_type;
4816 	u8 bus, devfn;
4817 	u16 did, sid;
4818 	int ret = 0;
4819 	u64 size = 0;
4820 
4821 	if (!inv_info || !dmar_domain)
4822 		return -EINVAL;
4823 
4824 	if (!dev || !dev_is_pci(dev))
4825 		return -ENODEV;
4826 
4827 	iommu = device_to_iommu(dev, &bus, &devfn);
4828 	if (!iommu)
4829 		return -ENODEV;
4830 
4831 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4832 		return -EINVAL;
4833 
4834 	spin_lock_irqsave(&device_domain_lock, flags);
4835 	spin_lock(&iommu->lock);
4836 	info = get_domain_info(dev);
4837 	if (!info) {
4838 		ret = -EINVAL;
4839 		goto out_unlock;
4840 	}
4841 	did = dmar_domain->iommu_did[iommu->seq_id];
4842 	sid = PCI_DEVID(bus, devfn);
4843 
4844 	/* Size is only valid in address selective invalidation */
4845 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4846 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4847 				   inv_info->granu.addr_info.nb_granules);
4848 
4849 	for_each_set_bit(cache_type,
4850 			 (unsigned long *)&inv_info->cache,
4851 			 IOMMU_CACHE_INV_TYPE_NR) {
4852 		int granu = 0;
4853 		u64 pasid = 0;
4854 		u64 addr = 0;
4855 
4856 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4857 		if (granu == -EINVAL) {
4858 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4859 					   cache_type, inv_info->granularity);
4860 			break;
4861 		}
4862 
4863 		/*
4864 		 * PASID is stored in different locations based on the
4865 		 * granularity.
4866 		 */
4867 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4868 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4869 			pasid = inv_info->granu.pasid_info.pasid;
4870 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4871 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4872 			pasid = inv_info->granu.addr_info.pasid;
4873 
4874 		switch (BIT(cache_type)) {
4875 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4876 			/* HW will ignore LSB bits based on address mask */
4877 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4878 			    size &&
4879 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4880 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4881 						   inv_info->granu.addr_info.addr, size);
4882 			}
4883 
4884 			/*
4885 			 * If granu is PASID-selective, address is ignored.
4886 			 * We use npages = -1 to indicate that.
4887 			 */
4888 			qi_flush_piotlb(iommu, did, pasid,
4889 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4890 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4891 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4892 
4893 			if (!info->ats_enabled)
4894 				break;
4895 			/*
4896 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4897 			 * in the guest may assume IOTLB flush is inclusive,
4898 			 * which is more efficient.
4899 			 */
4900 			fallthrough;
4901 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4902 			/*
4903 			 * PASID based device TLB invalidation does not support
4904 			 * IOMMU_INV_GRANU_PASID granularity; it only supports
4905 			 * IOMMU_INV_GRANU_ADDR. The equivalent is to invalidate
4906 			 * the entire 64-bit address range: the user provides only
4907 			 * PASID info and no address info, so we set addr to 0 and
4908 			 * size to cover the whole address space.
4909 			 */
4910 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4911 				size = 64 - VTD_PAGE_SHIFT;
4912 				addr = 0;
4913 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4914 				addr = inv_info->granu.addr_info.addr;
4915 			}
4916 
4917 			if (info->ats_enabled)
4918 				qi_flush_dev_iotlb_pasid(iommu, sid,
4919 						info->pfsid, pasid,
4920 						info->ats_qdep, addr,
4921 						size);
4922 			else
4923 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4924 			break;
4925 		default:
4926 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4927 					    cache_type);
4928 			ret = -EINVAL;
4929 		}
4930 	}
4931 out_unlock:
4932 	spin_unlock(&iommu->lock);
4933 	spin_unlock_irqrestore(&device_domain_lock, flags);
4934 
4935 	return ret;
4936 }
4937 #endif
4938 
4939 static int intel_iommu_map(struct iommu_domain *domain,
4940 			   unsigned long iova, phys_addr_t hpa,
4941 			   size_t size, int iommu_prot, gfp_t gfp)
4942 {
4943 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4944 	u64 max_addr;
4945 	int prot = 0;
4946 	int ret;
4947 
4948 	if (iommu_prot & IOMMU_READ)
4949 		prot |= DMA_PTE_READ;
4950 	if (iommu_prot & IOMMU_WRITE)
4951 		prot |= DMA_PTE_WRITE;
4952 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4953 		prot |= DMA_PTE_SNP;
4954 
4955 	max_addr = iova + size;
4956 	if (dmar_domain->max_addr < max_addr) {
4957 		u64 end;
4958 
4959 		/* check if minimum agaw is sufficient for mapped address */
4960 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4961 		if (end < max_addr) {
4962 			pr_err("%s: iommu width (%d) is not "
4963 			       "sufficient for the mapped address (%llx)\n",
4964 			       __func__, dmar_domain->gaw, max_addr);
4965 			return -EFAULT;
4966 		}
4967 		dmar_domain->max_addr = max_addr;
4968 	}
4969 	/* Round up size to next multiple of PAGE_SIZE, if it and
4970 	   the low bits of hpa would take us onto the next page */
4971 	size = aligned_nrpages(hpa, size);
4972 	ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4973 			     hpa >> VTD_PAGE_SHIFT, size, prot);
4974 	return ret;
4975 }
4976 
4977 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4978 				unsigned long iova, size_t size,
4979 				struct iommu_iotlb_gather *gather)
4980 {
4981 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4982 	unsigned long start_pfn, last_pfn;
4983 	int level = 0;
4984 
4985 	/* Cope with horrid API which requires us to unmap more than the
4986 	   size argument if it happens to be a large-page mapping. */
4987 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4988 
4989 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4990 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4991 
4992 	start_pfn = iova >> VTD_PAGE_SHIFT;
4993 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4994 
4995 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
4996 					last_pfn, gather->freelist);
4997 
4998 	if (dmar_domain->max_addr == iova + size)
4999 		dmar_domain->max_addr = iova;
5000 
5001 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5002 
5003 	return size;
5004 }
5005 
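/*
 * Flush the IOTLB for the range recorded by intel_iommu_unmap() and free the
 * page-table pages collected on the gather freelist.
 */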
5006 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5007 				 struct iommu_iotlb_gather *gather)
5008 {
5009 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5010 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5011 	size_t size = gather->end - gather->start;
5012 	unsigned long start_pfn;
5013 	unsigned long nrpages;
5014 	int iommu_id;
5015 
5016 	nrpages = aligned_nrpages(gather->start, size);
5017 	start_pfn = mm_to_dma_pfn(iova_pfn);
5018 
5019 	for_each_domain_iommu(iommu_id, dmar_domain)
5020 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5021 				      start_pfn, nrpages, !gather->freelist, 0);
5022 
5023 	dma_free_pagelist(gather->freelist);
5024 }
5025 
5026 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5027 					    dma_addr_t iova)
5028 {
5029 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5030 	struct dma_pte *pte;
5031 	int level = 0;
5032 	u64 phys = 0;
5033 
5034 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5035 	if (pte && dma_pte_present(pte))
5036 		phys = dma_pte_addr(pte) +
5037 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5038 						VTD_PAGE_SHIFT) - 1));
5039 
5040 	return phys;
5041 }
5042 
5043 static inline bool scalable_mode_support(void)
5044 {
5045 	struct dmar_drhd_unit *drhd;
5046 	struct intel_iommu *iommu;
5047 	bool ret = true;
5048 
5049 	rcu_read_lock();
5050 	for_each_active_iommu(iommu, drhd) {
5051 		if (!sm_supported(iommu)) {
5052 			ret = false;
5053 			break;
5054 		}
5055 	}
5056 	rcu_read_unlock();
5057 
5058 	return ret;
5059 }
5060 
5061 static inline bool iommu_pasid_support(void)
5062 {
5063 	struct dmar_drhd_unit *drhd;
5064 	struct intel_iommu *iommu;
5065 	bool ret = true;
5066 
5067 	rcu_read_lock();
5068 	for_each_active_iommu(iommu, drhd) {
5069 		if (!pasid_supported(iommu)) {
5070 			ret = false;
5071 			break;
5072 		}
5073 	}
5074 	rcu_read_unlock();
5075 
5076 	return ret;
5077 }
5078 
5079 static inline bool nested_mode_support(void)
5080 {
5081 	struct dmar_drhd_unit *drhd;
5082 	struct intel_iommu *iommu;
5083 	bool ret = true;
5084 
5085 	rcu_read_lock();
5086 	for_each_active_iommu(iommu, drhd) {
5087 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5088 			ret = false;
5089 			break;
5090 		}
5091 	}
5092 	rcu_read_unlock();
5093 
5094 	return ret;
5095 }
5096 
5097 static bool intel_iommu_capable(enum iommu_cap cap)
5098 {
5099 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5100 		return domain_update_iommu_snooping(NULL) == 1;
5101 	if (cap == IOMMU_CAP_INTR_REMAP)
5102 		return irq_remapping_enabled == 1;
5103 
5104 	return false;
5105 }
5106 
5107 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5108 {
5109 	struct intel_iommu *iommu;
5110 
5111 	iommu = device_to_iommu(dev, NULL, NULL);
5112 	if (!iommu)
5113 		return ERR_PTR(-ENODEV);
5114 
5115 	if (translation_pre_enabled(iommu))
5116 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5117 
5118 	return &iommu->iommu;
5119 }
5120 
5121 static void intel_iommu_release_device(struct device *dev)
5122 {
5123 	struct intel_iommu *iommu;
5124 
5125 	iommu = device_to_iommu(dev, NULL, NULL);
5126 	if (!iommu)
5127 		return;
5128 
5129 	dmar_remove_one_dev_info(dev);
5130 
5131 	set_dma_ops(dev, NULL);
5132 }
5133 
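/*
 * Once the default domain for the device is known, wire up the DMA API:
 * install the IOMMU DMA ops for DMA domains, otherwise fall back to the
 * direct mapping.
 */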
5134 static void intel_iommu_probe_finalize(struct device *dev)
5135 {
5136 	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5137 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5138 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5139 
5140 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5141 		iommu_setup_dma_ops(dev, base,
5142 				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5143 	else
5144 		set_dma_ops(dev, NULL);
5145 }
5146 
5147 static void intel_iommu_get_resv_regions(struct device *device,
5148 					 struct list_head *head)
5149 {
5150 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5151 	struct iommu_resv_region *reg;
5152 	struct dmar_rmrr_unit *rmrr;
5153 	struct device *i_dev;
5154 	int i;
5155 
5156 	down_read(&dmar_global_lock);
5157 	for_each_rmrr_units(rmrr) {
5158 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5159 					  i, i_dev) {
5160 			struct iommu_resv_region *resv;
5161 			enum iommu_resv_type type;
5162 			size_t length;
5163 
5164 			if (i_dev != device &&
5165 			    !is_downstream_to_pci_bridge(device, i_dev))
5166 				continue;
5167 
5168 			length = rmrr->end_address - rmrr->base_address + 1;
5169 
5170 			type = device_rmrr_is_relaxable(device) ?
5171 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5172 
5173 			resv = iommu_alloc_resv_region(rmrr->base_address,
5174 						       length, prot, type);
5175 			if (!resv)
5176 				break;
5177 
5178 			list_add_tail(&resv->list, head);
5179 		}
5180 	}
5181 	up_read(&dmar_global_lock);
5182 
5183 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5184 	if (dev_is_pci(device)) {
5185 		struct pci_dev *pdev = to_pci_dev(device);
5186 
5187 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5188 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5189 						   IOMMU_RESV_DIRECT_RELAXABLE);
5190 			if (reg)
5191 				list_add_tail(&reg->list, head);
5192 		}
5193 	}
5194 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5195 
5196 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5197 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5198 				      0, IOMMU_RESV_MSI);
5199 	if (!reg)
5200 		return;
5201 	list_add_tail(&reg->list, head);
5202 }
5203 
5204 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5205 {
5206 	struct device_domain_info *info;
5207 	struct context_entry *context;
5208 	struct dmar_domain *domain;
5209 	unsigned long flags;
5210 	u64 ctx_lo;
5211 	int ret;
5212 
5213 	domain = find_domain(dev);
5214 	if (!domain)
5215 		return -EINVAL;
5216 
5217 	spin_lock_irqsave(&device_domain_lock, flags);
5218 	spin_lock(&iommu->lock);
5219 
5220 	ret = -EINVAL;
5221 	info = get_domain_info(dev);
5222 	if (!info || !info->pasid_supported)
5223 		goto out;
5224 
5225 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5226 	if (WARN_ON(!context))
5227 		goto out;
5228 
5229 	ctx_lo = context[0].lo;
5230 
5231 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5232 		ctx_lo |= CONTEXT_PASIDE;
5233 		context[0].lo = ctx_lo;
5234 		wmb();
5235 		iommu->flush.flush_context(iommu,
5236 					   domain->iommu_did[iommu->seq_id],
5237 					   PCI_DEVID(info->bus, info->devfn),
5238 					   DMA_CCMD_MASK_NOBIT,
5239 					   DMA_CCMD_DEVICE_INVL);
5240 	}
5241 
5242 	/* Enable PASID support in the device, if it wasn't already */
5243 	if (!info->pasid_enabled)
5244 		iommu_enable_dev_iotlb(info);
5245 
5246 	ret = 0;
5247 
5248  out:
5249 	spin_unlock(&iommu->lock);
5250 	spin_unlock_irqrestore(&device_domain_lock, flags);
5251 
5252 	return ret;
5253 }
5254 
5255 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5256 {
5257 	if (dev_is_pci(dev))
5258 		return pci_device_group(dev);
5259 	return generic_device_group(dev);
5260 }
5261 
5262 static int intel_iommu_enable_auxd(struct device *dev)
5263 {
5264 	struct device_domain_info *info;
5265 	struct intel_iommu *iommu;
5266 	unsigned long flags;
5267 	int ret;
5268 
5269 	iommu = device_to_iommu(dev, NULL, NULL);
5270 	if (!iommu || dmar_disabled)
5271 		return -EINVAL;
5272 
5273 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5274 		return -EINVAL;
5275 
5276 	ret = intel_iommu_enable_pasid(iommu, dev);
5277 	if (ret)
5278 		return -ENODEV;
5279 
5280 	spin_lock_irqsave(&device_domain_lock, flags);
5281 	info = get_domain_info(dev);
5282 	info->auxd_enabled = 1;
5283 	spin_unlock_irqrestore(&device_domain_lock, flags);
5284 
5285 	return 0;
5286 }
5287 
5288 static int intel_iommu_disable_auxd(struct device *dev)
5289 {
5290 	struct device_domain_info *info;
5291 	unsigned long flags;
5292 
5293 	spin_lock_irqsave(&device_domain_lock, flags);
5294 	info = get_domain_info(dev);
5295 	if (!WARN_ON(!info))
5296 		info->auxd_enabled = 0;
5297 	spin_unlock_irqrestore(&device_domain_lock, flags);
5298 
5299 	return 0;
5300 }
5301 
5302 /*
5303  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5304  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5305  * spec so that system software and tools can detect endpoint devices that
5306  * support Intel Scalable I/O Virtualization without a host driver dependency.
5307  *
5308  * Returns the address of the matching extended capability structure within
5309  * the device's PCI configuration space or 0 if the device does not support
5310  * it.
5311  */
5312 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5313 {
5314 	int pos;
5315 	u16 vendor, id;
5316 
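	/*
	 * Walk the DVSEC (0x23) extended capabilities: the vendor ID lives in
	 * DVSEC header 1 (offset 4) and the DVSEC ID in header 2 (offset 8);
	 * the SIOV DVSEC is identified by the Intel vendor ID and DVSEC ID 5.
	 */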
5317 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5318 	while (pos) {
5319 		pci_read_config_word(pdev, pos + 4, &vendor);
5320 		pci_read_config_word(pdev, pos + 8, &id);
5321 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5322 			return pos;
5323 
5324 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5325 	}
5326 
5327 	return 0;
5328 }
5329 
5330 static bool
5331 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5332 {
5333 	if (feat == IOMMU_DEV_FEAT_AUX) {
5334 		int ret;
5335 
5336 		if (!dev_is_pci(dev) || dmar_disabled ||
5337 		    !scalable_mode_support() || !iommu_pasid_support())
5338 			return false;
5339 
5340 		ret = pci_pasid_features(to_pci_dev(dev));
5341 		if (ret < 0)
5342 			return false;
5343 
5344 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5345 	}
5346 
5347 	if (feat == IOMMU_DEV_FEAT_SVA) {
5348 		struct device_domain_info *info = get_domain_info(dev);
5349 
5350 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5351 			info->pasid_supported && info->pri_supported &&
5352 			info->ats_supported;
5353 	}
5354 
5355 	return false;
5356 }
5357 
5358 static int
5359 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5360 {
5361 	if (feat == IOMMU_DEV_FEAT_AUX)
5362 		return intel_iommu_enable_auxd(dev);
5363 
5364 	if (feat == IOMMU_DEV_FEAT_SVA) {
5365 		struct device_domain_info *info = get_domain_info(dev);
5366 
5367 		if (!info)
5368 			return -EINVAL;
5369 
5370 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5371 			return 0;
5372 	}
5373 
5374 	return -ENODEV;
5375 }
5376 
5377 static int
5378 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5379 {
5380 	if (feat == IOMMU_DEV_FEAT_AUX)
5381 		return intel_iommu_disable_auxd(dev);
5382 
5383 	return -ENODEV;
5384 }
5385 
5386 static bool
5387 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5388 {
5389 	struct device_domain_info *info = get_domain_info(dev);
5390 
5391 	if (feat == IOMMU_DEV_FEAT_AUX)
5392 		return scalable_mode_support() && info && info->auxd_enabled;
5393 
5394 	return false;
5395 }
5396 
5397 static int
5398 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5399 {
5400 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5401 
5402 	return dmar_domain->default_pasid > 0 ?
5403 			dmar_domain->default_pasid : -EINVAL;
5404 }
5405 
5406 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5407 					   struct device *dev)
5408 {
5409 	return attach_deferred(dev);
5410 }
5411 
5412 static int
5413 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5414 			    enum iommu_attr attr, void *data)
5415 {
5416 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5417 	unsigned long flags;
5418 	int ret = 0;
5419 
5420 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5421 		return -EINVAL;
5422 
5423 	switch (attr) {
5424 	case DOMAIN_ATTR_NESTING:
5425 		spin_lock_irqsave(&device_domain_lock, flags);
5426 		if (nested_mode_support() &&
5427 		    list_empty(&dmar_domain->devices)) {
5428 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5429 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5430 		} else {
5431 			ret = -ENODEV;
5432 		}
5433 		spin_unlock_irqrestore(&device_domain_lock, flags);
5434 		break;
5435 	default:
5436 		ret = -EINVAL;
5437 		break;
5438 	}
5439 
5440 	return ret;
5441 }
5442 
5443 static bool domain_use_flush_queue(void)
5444 {
5445 	struct dmar_drhd_unit *drhd;
5446 	struct intel_iommu *iommu;
5447 	bool r = true;
5448 
5449 	if (intel_iommu_strict)
5450 		return false;
5451 
5452 	/*
5453 	 * The flush queue implementation does not perform page-selective
5454 	 * invalidations that are required for efficient TLB flushes in virtual
5455 	 * environments. The benefit of batching is likely to be much lower than
5456 	 * the overhead of synchronizing the virtual and physical IOMMU
5457 	 * page-tables.
5458 	 */
5459 	rcu_read_lock();
5460 	for_each_active_iommu(iommu, drhd) {
5461 		if (!cap_caching_mode(iommu->cap))
5462 			continue;
5463 
5464 		pr_warn_once("IOMMU batching is disabled due to virtualization");
5465 		r = false;
5466 		break;
5467 	}
5468 	rcu_read_unlock();
5469 
5470 	return r;
5471 }
5472 
5473 static int
5474 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5475 			    enum iommu_attr attr, void *data)
5476 {
5477 	switch (domain->type) {
5478 	case IOMMU_DOMAIN_UNMANAGED:
5479 		return -ENODEV;
5480 	case IOMMU_DOMAIN_DMA:
5481 		switch (attr) {
5482 		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5483 			*(int *)data = domain_use_flush_queue();
5484 			return 0;
5485 		default:
5486 			return -ENODEV;
5487 		}
5488 		break;
5489 	default:
5490 		return -EINVAL;
5491 	}
5492 }
5493 
5494 /*
5495  * Check that the device does not live on an external facing PCI port that is
5496  * marked as untrusted. Such devices should not be able to apply quirks and
5497  * thus not be able to bypass the IOMMU restrictions.
5498  */
5499 static bool risky_device(struct pci_dev *pdev)
5500 {
5501 	if (pdev->untrusted) {
5502 		pci_info(pdev,
5503 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5504 			 pdev->vendor, pdev->device);
5505 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5506 		return true;
5507 	}
5508 	return false;
5509 }
5510 
5511 const struct iommu_ops intel_iommu_ops = {
5512 	.capable		= intel_iommu_capable,
5513 	.domain_alloc		= intel_iommu_domain_alloc,
5514 	.domain_free		= intel_iommu_domain_free,
5515 	.domain_get_attr        = intel_iommu_domain_get_attr,
5516 	.domain_set_attr	= intel_iommu_domain_set_attr,
5517 	.attach_dev		= intel_iommu_attach_device,
5518 	.detach_dev		= intel_iommu_detach_device,
5519 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5520 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5521 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5522 	.map			= intel_iommu_map,
5523 	.unmap			= intel_iommu_unmap,
5524 	.flush_iotlb_all        = intel_flush_iotlb_all,
5525 	.iotlb_sync		= intel_iommu_tlb_sync,
5526 	.iova_to_phys		= intel_iommu_iova_to_phys,
5527 	.probe_device		= intel_iommu_probe_device,
5528 	.probe_finalize		= intel_iommu_probe_finalize,
5529 	.release_device		= intel_iommu_release_device,
5530 	.get_resv_regions	= intel_iommu_get_resv_regions,
5531 	.put_resv_regions	= generic_iommu_put_resv_regions,
5532 	.device_group		= intel_iommu_device_group,
5533 	.dev_has_feat		= intel_iommu_dev_has_feat,
5534 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5535 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5536 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5537 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5538 	.def_domain_type	= device_def_domain_type,
5539 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5540 #ifdef CONFIG_INTEL_IOMMU_SVM
5541 	.cache_invalidate	= intel_iommu_sva_invalidate,
5542 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5543 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5544 	.sva_bind		= intel_svm_bind,
5545 	.sva_unbind		= intel_svm_unbind,
5546 	.sva_get_pasid		= intel_svm_get_pasid,
5547 	.page_response		= intel_svm_page_response,
5548 #endif
5549 };
5550 
5551 static void quirk_iommu_igfx(struct pci_dev *dev)
5552 {
5553 	if (risky_device(dev))
5554 		return;
5555 
5556 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5557 	dmar_map_gfx = 0;
5558 }
5559 
5560 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5561 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5562 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5563 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5564 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5565 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5566 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5567 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5568 
5569 /* Broadwell igfx malfunctions with dmar */
5570 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5571 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5573 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5574 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5575 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5576 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5577 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5578 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5579 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5580 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5581 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5582 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5583 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5584 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5585 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5586 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5587 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5588 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5589 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5590 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5591 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5592 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5593 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5594 
5595 static void quirk_iommu_rwbf(struct pci_dev *dev)
5596 {
5597 	if (risky_device(dev))
5598 		return;
5599 
5600 	/*
5601 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5602 	 * but needs it. Same seems to hold for the desktop versions.
5603 	 */
5604 	pci_info(dev, "Forcing write-buffer flush capability\n");
5605 	rwbf_quirk = 1;
5606 }
5607 
5608 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5609 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5610 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5611 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5612 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5613 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5614 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5615 
5616 #define GGC 0x52
5617 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5618 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5619 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5620 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5621 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5622 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5623 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5624 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5625 
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

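/*
 * Descriptive note: the quirk below matches recent integrated graphics
 * devices by the upper byte of the PCI device ID.  Setting
 * iommu_skip_te_disable is presumed to make the translation-disable path
 * (run e.g. at shutdown or before kexec) leave the graphics-dedicated
 * IOMMU enabled for these devices, which reportedly misbehave otherwise.
 */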
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/*
 * On Tylersburg chipsets, some BIOSes have been known to enable the
 * ISOCH DMAR unit for the Azalia sound device, but not give it any
 * TLB entries, which causes it to deadlock. Check for that.  We do
 * this in a function called from init_dmars(), instead of in a PCI
 * quirk, because we don't want to print the obnoxious "BIOS broken"
 * message if VT-d is actually disabled.
 */
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/*
	 * System Management Registers. Might be hidden, in which case
	 * we can't do the sanity check. But that's OK, because the
	 * known-broken BIOSes _don't_ actually hide it, so far.
	 */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

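	/*
	 * Offset 0x188 is assumed to be the VT-d isochrony control register
	 * of the Tylersburg system management device: bit 0 indicates that
	 * Azalia DMA is routed to the non-isoch DMAR unit, and the field
	 * masked with 0x1c below holds the number of TLB entries the BIOS
	 * allocated to the isoch unit.
	 */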
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? The BIOS is clearly broken; warn loudly. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
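		/*
		 * Fall back to identity-mapping the Azalia device so its DMA
		 * is passed through rather than depending on the starved
		 * isoch TLB; IDENTMAP_AZALIA is presumably honored when the
		 * default domain type is chosen elsewhere in this file.
		 */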
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}
