xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision b737eecd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 
48 #include "../irq_remapping.h"
49 #include "pasid.h"
50 #include "cap_audit.h"
51 
52 #define ROOT_SIZE		VTD_PAGE_SIZE
53 #define CONTEXT_SIZE		VTD_PAGE_SIZE
54 
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
57 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
58 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 
60 #define IOAPIC_RANGE_START	(0xfee00000)
61 #define IOAPIC_RANGE_END	(0xfeefffff)
62 #define IOVA_START_ADDR		(0x1000)
63 
64 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 
66 #define MAX_AGAW_WIDTH 64
67 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 
69 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
70 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
71 
72 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
73    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
74 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
75 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
76 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77 
78 /* IO virtual address start page frame number */
79 #define IOVA_START_PFN		(1)
80 
81 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
82 
83 /* page table handling */
84 #define LEVEL_STRIDE		(9)
85 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
86 
87 /*
88  * This bitmap is used to advertise the page sizes our hardware supports
89  * to the IOMMU core, which will then use this information to split
90  * physically contiguous memory regions it is mapping into page sizes
91  * that we support.
92  *
93  * Traditionally the IOMMU core just handed us the mappings directly,
94  * after making sure the size is a power-of-two multiple of 4KiB and that the
95  * mapping has natural alignment.
96  *
97  * To retain this behavior, we currently advertise that we support
98  * all page sizes that are an order of 4KiB.
99  *
100  * If at some point we'd like to utilize the IOMMU core's new behavior,
101  * we could change this to advertise the real page sizes we support.
102  */
103 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
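/*
 * A minimal illustrative sketch of what the mask above encodes (assuming
 * VTD_PAGE_SHIFT == 12, i.e. a 4KiB base page): ~0xFFFUL clears bits 0-11
 * and sets every bit from 12 upward, so the bitmap advertises 4KiB, 8KiB,
 * 16KiB, ... (any power-of-two multiple of 4KiB) as a supported size:
 *
 *	unsigned long pgsizes = ~0xFFFUL;
 *	bool has_4k = pgsizes & (1UL << 12);	// true
 *	bool has_2m = pgsizes & (1UL << 21);	// true
 *	bool has_1k = pgsizes & (1UL << 10);	// false, below 4KiB
 */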
104 
105 static inline int agaw_to_level(int agaw)
106 {
107 	return agaw + 2;
108 }
109 
110 static inline int agaw_to_width(int agaw)
111 {
112 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 }
114 
115 static inline int width_to_agaw(int width)
116 {
117 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
118 }
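/*
 * Worked example for the AGAW helpers above (illustrative only, assuming
 * LEVEL_STRIDE == 9 and MAX_AGAW_WIDTH == 64):
 *
 *	agaw_to_width(1) = 39, agaw_to_level(1) = 3	(3-level table)
 *	agaw_to_width(2) = 48, agaw_to_level(2) = 4	(4-level table)
 *	agaw_to_width(3) = 57, agaw_to_level(3) = 5	(5-level table)
 *	width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2
 */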
119 
120 static inline unsigned int level_to_offset_bits(int level)
121 {
122 	return (level - 1) * LEVEL_STRIDE;
123 }
124 
125 static inline int pfn_level_offset(u64 pfn, int level)
126 {
127 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
128 }
129 
130 static inline u64 level_mask(int level)
131 {
132 	return -1ULL << level_to_offset_bits(level);
133 }
134 
135 static inline u64 level_size(int level)
136 {
137 	return 1ULL << level_to_offset_bits(level);
138 }
139 
140 static inline u64 align_to_level(u64 pfn, int level)
141 {
142 	return (pfn + level_size(level) - 1) & level_mask(level);
143 }
144 
145 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 {
147 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
148 }
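/*
 * Worked example for the page-table math above (illustrative only): for
 * level 2 and pfn 0x12345,
 *
 *	level_to_offset_bits(2) = 9
 *	pfn_level_offset(0x12345, 2) = (0x12345 >> 9) & 0x1ff = 0x91
 *	level_size(2) = 512 pfns, level_mask(2) = ~0x1ffULL
 *	align_to_level(0x12345, 2) = 0x12400	(next 512-pfn boundary)
 */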
149 
150 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
151    are never going to work. */
152 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 {
154 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 
157 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 {
159 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 {
163 	return mm_to_dma_pfn(page_to_pfn(pg));
164 }
165 static inline unsigned long virt_to_dma_pfn(void *p)
166 {
167 	return page_to_dma_pfn(virt_to_page(p));
168 }
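/*
 * Illustrative note on the conversions above: with 4KiB MM pages
 * (PAGE_SHIFT == 12) the shift is zero and dma_to_mm_pfn()/mm_to_dma_pfn()
 * are identity operations. On a hypothetical kernel built with 16KiB pages
 * (PAGE_SHIFT == 14), one MM pfn would correspond to four VT-d pfns, e.g.
 * mm_to_dma_pfn(5) == 20.
 */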
169 
170 /* global iommu list, set NULL for ignored DMAR units */
171 static struct intel_iommu **g_iommus;
172 
173 static void __init check_tylersburg_isoch(void);
174 static int rwbf_quirk;
175 
176 /*
177  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
178  * (used when kernel is launched w/ TXT)
179  */
180 static int force_on = 0;
181 static int intel_iommu_tboot_noforce;
182 static int no_platform_optin;
183 
184 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
185 
186 /*
187  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
188  * if marked present.
189  */
190 static phys_addr_t root_entry_lctp(struct root_entry *re)
191 {
192 	if (!(re->lo & 1))
193 		return 0;
194 
195 	return re->lo & VTD_PAGE_MASK;
196 }
197 
198 /*
199  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_uctp(struct root_entry *re)
203 {
204 	if (!(re->hi & 1))
205 		return 0;
206 
207 	return re->hi & VTD_PAGE_MASK;
208 }
209 
210 static inline void context_clear_pasid_enable(struct context_entry *context)
211 {
212 	context->lo &= ~(1ULL << 11);
213 }
214 
215 static inline bool context_pasid_enabled(struct context_entry *context)
216 {
217 	return !!(context->lo & (1ULL << 11));
218 }
219 
220 static inline void context_set_copied(struct context_entry *context)
221 {
222 	context->hi |= (1ull << 3);
223 }
224 
225 static inline bool context_copied(struct context_entry *context)
226 {
227 	return !!(context->hi & (1ULL << 3));
228 }
229 
230 static inline bool __context_present(struct context_entry *context)
231 {
232 	return (context->lo & 1);
233 }
234 
235 bool context_present(struct context_entry *context)
236 {
237 	return context_pasid_enabled(context) ?
238 	     __context_present(context) :
239 	     __context_present(context) && !context_copied(context);
240 }
241 
242 static inline void context_set_present(struct context_entry *context)
243 {
244 	context->lo |= 1;
245 }
246 
247 static inline void context_set_fault_enable(struct context_entry *context)
248 {
249 	context->lo &= (((u64)-1) << 2) | 1;
250 }
251 
252 static inline void context_set_translation_type(struct context_entry *context,
253 						unsigned long value)
254 {
255 	context->lo &= (((u64)-1) << 4) | 3;
256 	context->lo |= (value & 3) << 2;
257 }
258 
259 static inline void context_set_address_root(struct context_entry *context,
260 					    unsigned long value)
261 {
262 	context->lo &= ~VTD_PAGE_MASK;
263 	context->lo |= value & VTD_PAGE_MASK;
264 }
265 
266 static inline void context_set_address_width(struct context_entry *context,
267 					     unsigned long value)
268 {
269 	context->hi |= value & 7;
270 }
271 
272 static inline void context_set_domain_id(struct context_entry *context,
273 					 unsigned long value)
274 {
275 	context->hi |= (value & ((1 << 16) - 1)) << 8;
276 }
277 
278 static inline int context_domain_id(struct context_entry *c)
279 {
280 	return((c->hi >> 8) & 0xffff);
281 }
282 
283 static inline void context_clear_entry(struct context_entry *context)
284 {
285 	context->lo = 0;
286 	context->hi = 0;
287 }
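/*
 * For reference, the context-entry bit layout touched by the helpers above
 * (derived from the code here, not quoted from the VT-d spec):
 *
 *	lo[0]		present
 *	lo[1]		fault processing disable (cleared by
 *			context_set_fault_enable())
 *	lo[3:2]		translation type
 *	lo[11]		PASID enable (checked when inheriting copied entries)
 *	lo[63:12]	second-level page-table root
 *	hi[2:0]		address width (AGAW)
 *	hi[3]		"copied" marker used for tables inherited from a
 *			previous kernel
 *	hi[23:8]	domain id
 */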
288 
289 /*
290  * This domain is a statically identity mapping domain.
291  *	1. This domain creates a static 1:1 mapping to all usable memory.
292  *	2. It maps to each iommu if successful.
293  *	3. Each iommu maps to this domain if successful.
294  */
295 static struct dmar_domain *si_domain;
296 static int hw_pass_through = 1;
297 
298 #define for_each_domain_iommu(idx, domain)			\
299 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
300 		if (domain->iommu_refcnt[idx])
301 
302 struct dmar_rmrr_unit {
303 	struct list_head list;		/* list of rmrr units	*/
304 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
305 	u64	base_address;		/* reserved base address*/
306 	u64	end_address;		/* reserved end address */
307 	struct dmar_dev_scope *devices;	/* target devices */
308 	int	devices_cnt;		/* target device count */
309 };
310 
311 struct dmar_atsr_unit {
312 	struct list_head list;		/* list of ATSR units */
313 	struct acpi_dmar_header *hdr;	/* ACPI header */
314 	struct dmar_dev_scope *devices;	/* target devices */
315 	int devices_cnt;		/* target device count */
316 	u8 include_all:1;		/* include all ports */
317 };
318 
319 struct dmar_satc_unit {
320 	struct list_head list;		/* list of SATC units */
321 	struct acpi_dmar_header *hdr;	/* ACPI header */
322 	struct dmar_dev_scope *devices;	/* target devices */
323 	struct intel_iommu *iommu;	/* the corresponding iommu */
324 	int devices_cnt;		/* target device count */
325 	u8 atc_required:1;		/* ATS is required */
326 };
327 
328 static LIST_HEAD(dmar_atsr_units);
329 static LIST_HEAD(dmar_rmrr_units);
330 static LIST_HEAD(dmar_satc_units);
331 
332 #define for_each_rmrr_units(rmrr) \
333 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334 
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
337 
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int intel_iommu_attach_device(struct iommu_domain *domain,
343 				     struct device *dev);
344 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
345 					    dma_addr_t iova);
346 
347 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
348 int dmar_disabled = 0;
349 #else
350 int dmar_disabled = 1;
351 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
352 
353 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
354 int intel_iommu_sm = 1;
355 #else
356 int intel_iommu_sm;
357 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
358 
359 int intel_iommu_enabled = 0;
360 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
361 
362 static int dmar_map_gfx = 1;
363 static int dmar_forcedac;
364 static int intel_iommu_strict;
365 static int intel_iommu_superpage = 1;
366 static int iommu_identity_mapping;
367 static int iommu_skip_te_disable;
368 
369 #define IDENTMAP_GFX		2
370 #define IDENTMAP_AZALIA		4
371 
372 int intel_iommu_gfx_mapped;
373 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
374 
375 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
376 struct device_domain_info *get_domain_info(struct device *dev)
377 {
378 	struct device_domain_info *info;
379 
380 	if (!dev)
381 		return NULL;
382 
383 	info = dev_iommu_priv_get(dev);
384 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
385 		return NULL;
386 
387 	return info;
388 }
389 
390 DEFINE_SPINLOCK(device_domain_lock);
391 static LIST_HEAD(device_domain_list);
392 
393 /*
394  * Iterate over elements in device_domain_list and call the specified
395  * callback @fn against each element.
396  */
397 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
398 				     void *data), void *data)
399 {
400 	int ret = 0;
401 	unsigned long flags;
402 	struct device_domain_info *info;
403 
404 	spin_lock_irqsave(&device_domain_lock, flags);
405 	list_for_each_entry(info, &device_domain_list, global) {
406 		ret = fn(info, data);
407 		if (ret) {
408 			spin_unlock_irqrestore(&device_domain_lock, flags);
409 			return ret;
410 		}
411 	}
412 	spin_unlock_irqrestore(&device_domain_lock, flags);
413 
414 	return 0;
415 }
416 
417 const struct iommu_ops intel_iommu_ops;
418 
419 static bool translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
422 }
423 
424 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
425 {
426 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
427 }
428 
429 static void init_translation_status(struct intel_iommu *iommu)
430 {
431 	u32 gsts;
432 
433 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
434 	if (gsts & DMA_GSTS_TES)
435 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
436 }
437 
438 static int __init intel_iommu_setup(char *str)
439 {
440 	if (!str)
441 		return -EINVAL;
442 	while (*str) {
443 		if (!strncmp(str, "on", 2)) {
444 			dmar_disabled = 0;
445 			pr_info("IOMMU enabled\n");
446 		} else if (!strncmp(str, "off", 3)) {
447 			dmar_disabled = 1;
448 			no_platform_optin = 1;
449 			pr_info("IOMMU disabled\n");
450 		} else if (!strncmp(str, "igfx_off", 8)) {
451 			dmar_map_gfx = 0;
452 			pr_info("Disable GFX device mapping\n");
453 		} else if (!strncmp(str, "forcedac", 8)) {
454 			pr_info("Forcing DAC for PCI devices\n");
455 			dmar_forcedac = 1;
456 		} else if (!strncmp(str, "strict", 6)) {
457 			pr_info("Disable batched IOTLB flush\n");
458 			intel_iommu_strict = 1;
459 		} else if (!strncmp(str, "sp_off", 6)) {
460 			pr_info("Disable supported super page\n");
461 			intel_iommu_superpage = 0;
462 		} else if (!strncmp(str, "sm_on", 5)) {
463 			pr_info("Intel-IOMMU: scalable mode supported\n");
464 			intel_iommu_sm = 1;
465 		} else if (!strncmp(str, "tboot_noforce", 13)) {
466 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 			intel_iommu_tboot_noforce = 1;
468 		}
469 
470 		str += strcspn(str, ",");
471 		while (*str == ',')
472 			str++;
473 	}
474 	return 0;
475 }
476 __setup("intel_iommu=", intel_iommu_setup);
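/*
 * Example kernel command line using the options parsed above (illustrative
 * only; values are comma separated):
 *
 *	intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, opts into scalable mode and disables batched
 * IOTLB flushing.
 */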
477 
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480 
481 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 {
483 	struct dmar_domain **domains;
484 	int idx = did >> 8;
485 
486 	domains = iommu->domains[idx];
487 	if (!domains)
488 		return NULL;
489 
490 	return domains[did & 0xff];
491 }
492 
493 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494 			     struct dmar_domain *domain)
495 {
496 	struct dmar_domain **domains;
497 	int idx = did >> 8;
498 
499 	if (!iommu->domains[idx]) {
500 		size_t size = 256 * sizeof(struct dmar_domain *);
501 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 	}
503 
504 	domains = iommu->domains[idx];
505 	if (WARN_ON(!domains))
506 		return;
507 
508 	domains[did & 0xff] = domain;
509 }
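/*
 * Illustrative example of the two-level indexing used by get_iommu_domain()
 * and set_iommu_domain(): a 16-bit domain id is split into a chunk index and
 * an offset,
 *
 *	did = 0x1234  ->  iommu->domains[0x12][0x34]
 *
 * so each 256-pointer chunk is only allocated once a domain id in that range
 * is actually used.
 */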
510 
511 void *alloc_pgtable_page(int node)
512 {
513 	struct page *page;
514 	void *vaddr = NULL;
515 
516 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 	if (page)
518 		vaddr = page_address(page);
519 	return vaddr;
520 }
521 
522 void free_pgtable_page(void *vaddr)
523 {
524 	free_page((unsigned long)vaddr);
525 }
526 
527 static inline void *alloc_domain_mem(void)
528 {
529 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 }
531 
532 static void free_domain_mem(void *vaddr)
533 {
534 	kmem_cache_free(iommu_domain_cache, vaddr);
535 }
536 
537 static inline void *alloc_devinfo_mem(void)
538 {
539 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 }
541 
542 static inline void free_devinfo_mem(void *vaddr)
543 {
544 	kmem_cache_free(iommu_devinfo_cache, vaddr);
545 }
546 
547 static inline int domain_type_is_si(struct dmar_domain *domain)
548 {
549 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 }
551 
552 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 {
554 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 }
556 
557 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 				       unsigned long pfn)
559 {
560 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561 
562 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 }
564 
565 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566 {
567 	unsigned long sagaw;
568 	int agaw = -1;
569 
570 	sagaw = cap_sagaw(iommu->cap);
571 	for (agaw = width_to_agaw(max_gaw);
572 	     agaw >= 0; agaw--) {
573 		if (test_bit(agaw, &sagaw))
574 			break;
575 	}
576 
577 	return agaw;
578 }
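/*
 * Worked example (illustrative only): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57,
 * width_to_agaw(57) == 3, so the search starts at agaw 3 (57-bit, 5-level)
 * and walks down. If the hardware SAGAW field has only bit 2 set (48-bit,
 * 4-level support), the loop returns agaw 2, i.e. a 48-bit address width.
 */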
579 
580 /*
581  * Calculate max SAGAW for each iommu.
582  */
583 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 {
585 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586 }
587 
588 /*
589  * Calculate the agaw for each iommu.
590  * "SAGAW" may be different across iommus, so use a default agaw and
591  * fall back to a smaller supported agaw for iommus that don't support the default.
592  */
593 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 {
595 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 }
597 
598 /* This function only returns a single iommu in a domain */
599 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600 {
601 	int iommu_id;
602 
603 	/* si_domain and vm domain should not get here. */
604 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 		return NULL;
606 
607 	for_each_domain_iommu(iommu_id, domain)
608 		break;
609 
610 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 		return NULL;
612 
613 	return g_iommus[iommu_id];
614 }
615 
616 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617 {
618 	return sm_supported(iommu) ?
619 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 }
621 
622 static void domain_update_iommu_coherency(struct dmar_domain *domain)
623 {
624 	struct dmar_drhd_unit *drhd;
625 	struct intel_iommu *iommu;
626 	bool found = false;
627 	int i;
628 
629 	domain->iommu_coherency = 1;
630 
631 	for_each_domain_iommu(i, domain) {
632 		found = true;
633 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
634 			domain->iommu_coherency = 0;
635 			break;
636 		}
637 	}
638 	if (found)
639 		return;
640 
641 	/* No hardware attached; use lowest common denominator */
642 	rcu_read_lock();
643 	for_each_active_iommu(iommu, drhd) {
644 		if (!iommu_paging_structure_coherency(iommu)) {
645 			domain->iommu_coherency = 0;
646 			break;
647 		}
648 	}
649 	rcu_read_unlock();
650 }
651 
652 static int domain_update_iommu_snooping(struct intel_iommu *skip)
653 {
654 	struct dmar_drhd_unit *drhd;
655 	struct intel_iommu *iommu;
656 	int ret = 1;
657 
658 	rcu_read_lock();
659 	for_each_active_iommu(iommu, drhd) {
660 		if (iommu != skip) {
661 			if (!ecap_sc_support(iommu->ecap)) {
662 				ret = 0;
663 				break;
664 			}
665 		}
666 	}
667 	rcu_read_unlock();
668 
669 	return ret;
670 }
671 
672 static int domain_update_iommu_superpage(struct dmar_domain *domain,
673 					 struct intel_iommu *skip)
674 {
675 	struct dmar_drhd_unit *drhd;
676 	struct intel_iommu *iommu;
677 	int mask = 0x3;
678 
679 	if (!intel_iommu_superpage) {
680 		return 0;
681 	}
682 
683 	/* set iommu_superpage to the smallest common denominator */
684 	rcu_read_lock();
685 	for_each_active_iommu(iommu, drhd) {
686 		if (iommu != skip) {
687 			if (domain && domain_use_first_level(domain)) {
688 				if (!cap_fl1gp_support(iommu->cap))
689 					mask = 0x1;
690 			} else {
691 				mask &= cap_super_page_val(iommu->cap);
692 			}
693 
694 			if (!mask)
695 				break;
696 		}
697 	}
698 	rcu_read_unlock();
699 
700 	return fls(mask);
701 }
702 
703 static int domain_update_device_node(struct dmar_domain *domain)
704 {
705 	struct device_domain_info *info;
706 	int nid = NUMA_NO_NODE;
707 
708 	assert_spin_locked(&device_domain_lock);
709 
710 	if (list_empty(&domain->devices))
711 		return NUMA_NO_NODE;
712 
713 	list_for_each_entry(info, &domain->devices, link) {
714 		if (!info->dev)
715 			continue;
716 
717 		/*
718 		 * There could be multiple device NUMA nodes, as devices within
719 		 * the same domain may sit behind different IOMMUs. There is no
720 		 * perfect answer in such a situation, so we use a first-come,
721 		 * first-served policy.
722 		 */
723 		nid = dev_to_node(info->dev);
724 		if (nid != NUMA_NO_NODE)
725 			break;
726 	}
727 
728 	return nid;
729 }
730 
731 static void domain_update_iotlb(struct dmar_domain *domain);
732 
733 /* Some capabilities may be different across iommus */
734 static void domain_update_iommu_cap(struct dmar_domain *domain)
735 {
736 	domain_update_iommu_coherency(domain);
737 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
738 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
739 
740 	/*
741 	 * If RHSA is missing, we should default to the device numa domain
742 	 * as a fallback.
743 	 */
744 	if (domain->nid == NUMA_NO_NODE)
745 		domain->nid = domain_update_device_node(domain);
746 
747 	/*
748 	 * First-level translation restricts the input-address to a
749 	 * canonical address (i.e., address bits 63:N have the same
750 	 * value as address bit [N-1], where N is 48 bits with 4-level
751 	 * paging and 57 bits with 5-level paging). Hence, skip bit
752 	 * [N-1].
753 	 */
754 	if (domain_use_first_level(domain))
755 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
756 	else
757 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
758 
759 	domain_update_iotlb(domain);
760 }
761 
762 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
763 					 u8 devfn, int alloc)
764 {
765 	struct root_entry *root = &iommu->root_entry[bus];
766 	struct context_entry *context;
767 	u64 *entry;
768 
769 	entry = &root->lo;
770 	if (sm_supported(iommu)) {
771 		if (devfn >= 0x80) {
772 			devfn -= 0x80;
773 			entry = &root->hi;
774 		}
775 		devfn *= 2;
776 	}
777 	if (*entry & 1)
778 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
779 	else {
780 		unsigned long phy_addr;
781 		if (!alloc)
782 			return NULL;
783 
784 		context = alloc_pgtable_page(iommu->node);
785 		if (!context)
786 			return NULL;
787 
788 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
789 		phy_addr = virt_to_phys((void *)context);
790 		*entry = phy_addr | 1;
791 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
792 	}
793 	return &context[devfn];
794 }
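/*
 * Illustrative walk through the scalable-mode indexing above (derived from
 * the code, not quoted from the spec): a scalable-mode root entry provides
 * two context-table pointers (lo for devfn 0x00-0x7f, hi for devfn
 * 0x80-0xff), and each scalable-mode context entry spans two 128-bit slots,
 * hence "devfn *= 2". For example, devfn 0x82 selects &root->hi and lands at
 * context index (0x82 - 0x80) * 2 = 4 within that table.
 */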
795 
796 static bool attach_deferred(struct device *dev)
797 {
798 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
799 }
800 
801 /**
802  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
803  *				 sub-hierarchy of a candidate PCI-PCI bridge
804  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
805  * @bridge: the candidate PCI-PCI bridge
806  *
807  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
808  */
809 static bool
810 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
811 {
812 	struct pci_dev *pdev, *pbridge;
813 
814 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
815 		return false;
816 
817 	pdev = to_pci_dev(dev);
818 	pbridge = to_pci_dev(bridge);
819 
820 	if (pbridge->subordinate &&
821 	    pbridge->subordinate->number <= pdev->bus->number &&
822 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
823 		return true;
824 
825 	return false;
826 }
827 
828 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
829 {
830 	struct dmar_drhd_unit *drhd;
831 	u32 vtbar;
832 	int rc;
833 
834 	/* We know that this device on this chipset has its own IOMMU.
835 	 * If we find it under a different IOMMU, then the BIOS is lying
836 	 * to us. Hope that the IOMMU for this device is actually
837 	 * disabled, and it needs no translation...
838 	 */
839 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
840 	if (rc) {
841 		/* "can't" happen */
842 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
843 		return false;
844 	}
845 	vtbar &= 0xffff0000;
846 
847 	/* we know that this iommu should be at offset 0xa000 from vtbar */
848 	drhd = dmar_find_matched_drhd_unit(pdev);
849 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
850 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
851 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
852 		return true;
853 	}
854 
855 	return false;
856 }
857 
858 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
859 {
860 	if (!iommu || iommu->drhd->ignored)
861 		return true;
862 
863 	if (dev_is_pci(dev)) {
864 		struct pci_dev *pdev = to_pci_dev(dev);
865 
866 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
867 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
868 		    quirk_ioat_snb_local_iommu(pdev))
869 			return true;
870 	}
871 
872 	return false;
873 }
874 
875 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
876 {
877 	struct dmar_drhd_unit *drhd = NULL;
878 	struct pci_dev *pdev = NULL;
879 	struct intel_iommu *iommu;
880 	struct device *tmp;
881 	u16 segment = 0;
882 	int i;
883 
884 	if (!dev)
885 		return NULL;
886 
887 	if (dev_is_pci(dev)) {
888 		struct pci_dev *pf_pdev;
889 
890 		pdev = pci_real_dma_dev(to_pci_dev(dev));
891 
892 		/* VFs aren't listed in scope tables; we need to look up
893 		 * the PF instead to find the IOMMU. */
894 		pf_pdev = pci_physfn(pdev);
895 		dev = &pf_pdev->dev;
896 		segment = pci_domain_nr(pdev->bus);
897 	} else if (has_acpi_companion(dev))
898 		dev = &ACPI_COMPANION(dev)->dev;
899 
900 	rcu_read_lock();
901 	for_each_iommu(iommu, drhd) {
902 		if (pdev && segment != drhd->segment)
903 			continue;
904 
905 		for_each_active_dev_scope(drhd->devices,
906 					  drhd->devices_cnt, i, tmp) {
907 			if (tmp == dev) {
908 				/* For a VF use its original BDF# not that of the PF
909 				 * which we used for the IOMMU lookup. Strictly speaking
910 				 * we could do this for all PCI devices; we only need to
911 				 * get the BDF# from the scope table for ACPI matches. */
912 				if (pdev && pdev->is_virtfn)
913 					goto got_pdev;
914 
915 				if (bus && devfn) {
916 					*bus = drhd->devices[i].bus;
917 					*devfn = drhd->devices[i].devfn;
918 				}
919 				goto out;
920 			}
921 
922 			if (is_downstream_to_pci_bridge(dev, tmp))
923 				goto got_pdev;
924 		}
925 
926 		if (pdev && drhd->include_all) {
927 		got_pdev:
928 			if (bus && devfn) {
929 				*bus = pdev->bus->number;
930 				*devfn = pdev->devfn;
931 			}
932 			goto out;
933 		}
934 	}
935 	iommu = NULL;
936  out:
937 	if (iommu_is_dummy(iommu, dev))
938 		iommu = NULL;
939 
940 	rcu_read_unlock();
941 
942 	return iommu;
943 }
944 
945 static void domain_flush_cache(struct dmar_domain *domain,
946 			       void *addr, int size)
947 {
948 	if (!domain->iommu_coherency)
949 		clflush_cache_range(addr, size);
950 }
951 
952 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
953 {
954 	struct context_entry *context;
955 	int ret = 0;
956 	unsigned long flags;
957 
958 	spin_lock_irqsave(&iommu->lock, flags);
959 	context = iommu_context_addr(iommu, bus, devfn, 0);
960 	if (context)
961 		ret = context_present(context);
962 	spin_unlock_irqrestore(&iommu->lock, flags);
963 	return ret;
964 }
965 
966 static void free_context_table(struct intel_iommu *iommu)
967 {
968 	int i;
969 	unsigned long flags;
970 	struct context_entry *context;
971 
972 	spin_lock_irqsave(&iommu->lock, flags);
973 	if (!iommu->root_entry)
974 		goto out;
975 
976 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
977 		context = iommu_context_addr(iommu, i, 0, 0);
978 		if (context)
979 			free_pgtable_page(context);
980 
981 		if (!sm_supported(iommu))
982 			continue;
983 
984 		context = iommu_context_addr(iommu, i, 0x80, 0);
985 		if (context)
986 			free_pgtable_page(context);
987 
988 	}
989 	free_pgtable_page(iommu->root_entry);
990 	iommu->root_entry = NULL;
991 out:
992 	spin_unlock_irqrestore(&iommu->lock, flags);
993 }
994 
995 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
996 				      unsigned long pfn, int *target_level)
997 {
998 	struct dma_pte *parent, *pte;
999 	int level = agaw_to_level(domain->agaw);
1000 	int offset;
1001 
1002 	BUG_ON(!domain->pgd);
1003 
1004 	if (!domain_pfn_supported(domain, pfn))
1005 		/* Address beyond IOMMU's addressing capabilities. */
1006 		return NULL;
1007 
1008 	parent = domain->pgd;
1009 
1010 	while (1) {
1011 		void *tmp_page;
1012 
1013 		offset = pfn_level_offset(pfn, level);
1014 		pte = &parent[offset];
1015 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1016 			break;
1017 		if (level == *target_level)
1018 			break;
1019 
1020 		if (!dma_pte_present(pte)) {
1021 			uint64_t pteval;
1022 
1023 			tmp_page = alloc_pgtable_page(domain->nid);
1024 
1025 			if (!tmp_page)
1026 				return NULL;
1027 
1028 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1029 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1030 			if (domain_use_first_level(domain)) {
1031 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1032 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1033 					pteval |= DMA_FL_PTE_ACCESS;
1034 			}
1035 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1036 				/* Someone else set it while we were thinking; use theirs. */
1037 				free_pgtable_page(tmp_page);
1038 			else
1039 				domain_flush_cache(domain, pte, sizeof(*pte));
1040 		}
1041 		if (level == 1)
1042 			break;
1043 
1044 		parent = phys_to_virt(dma_pte_addr(pte));
1045 		level--;
1046 	}
1047 
1048 	if (!*target_level)
1049 		*target_level = level;
1050 
1051 	return pte;
1052 }
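/*
 * Note on *target_level above (illustrative): callers pass 0 to walk down to
 * whatever leaf (or first superpage/non-present slot) already exists and get
 * the level found back through the pointer, or a specific level (e.g. 2 when
 * installing a 2MiB superpage) to stop early and have any missing
 * intermediate tables allocated on the way down.
 */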
1053 
1054 /* return address's pte at specific level */
1055 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1056 					 unsigned long pfn,
1057 					 int level, int *large_page)
1058 {
1059 	struct dma_pte *parent, *pte;
1060 	int total = agaw_to_level(domain->agaw);
1061 	int offset;
1062 
1063 	parent = domain->pgd;
1064 	while (level <= total) {
1065 		offset = pfn_level_offset(pfn, total);
1066 		pte = &parent[offset];
1067 		if (level == total)
1068 			return pte;
1069 
1070 		if (!dma_pte_present(pte)) {
1071 			*large_page = total;
1072 			break;
1073 		}
1074 
1075 		if (dma_pte_superpage(pte)) {
1076 			*large_page = total;
1077 			return pte;
1078 		}
1079 
1080 		parent = phys_to_virt(dma_pte_addr(pte));
1081 		total--;
1082 	}
1083 	return NULL;
1084 }
1085 
1086 /* clear last level pte, a tlb flush should be followed */
1087 static void dma_pte_clear_range(struct dmar_domain *domain,
1088 				unsigned long start_pfn,
1089 				unsigned long last_pfn)
1090 {
1091 	unsigned int large_page;
1092 	struct dma_pte *first_pte, *pte;
1093 
1094 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1095 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1096 	BUG_ON(start_pfn > last_pfn);
1097 
1098 	/* we don't need the lock here; nobody else touches the iova range */
1099 	do {
1100 		large_page = 1;
1101 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1102 		if (!pte) {
1103 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1104 			continue;
1105 		}
1106 		do {
1107 			dma_clear_pte(pte);
1108 			start_pfn += lvl_to_nr_pages(large_page);
1109 			pte++;
1110 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1111 
1112 		domain_flush_cache(domain, first_pte,
1113 				   (void *)pte - (void *)first_pte);
1114 
1115 	} while (start_pfn && start_pfn <= last_pfn);
1116 }
1117 
1118 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1119 			       int retain_level, struct dma_pte *pte,
1120 			       unsigned long pfn, unsigned long start_pfn,
1121 			       unsigned long last_pfn)
1122 {
1123 	pfn = max(start_pfn, pfn);
1124 	pte = &pte[pfn_level_offset(pfn, level)];
1125 
1126 	do {
1127 		unsigned long level_pfn;
1128 		struct dma_pte *level_pte;
1129 
1130 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1131 			goto next;
1132 
1133 		level_pfn = pfn & level_mask(level);
1134 		level_pte = phys_to_virt(dma_pte_addr(pte));
1135 
1136 		if (level > 2) {
1137 			dma_pte_free_level(domain, level - 1, retain_level,
1138 					   level_pte, level_pfn, start_pfn,
1139 					   last_pfn);
1140 		}
1141 
1142 		/*
1143 		 * Free the page table if we're below the level we want to
1144 		 * retain and the range covers the entire table.
1145 		 */
1146 		if (level < retain_level && !(start_pfn > level_pfn ||
1147 		      last_pfn < level_pfn + level_size(level) - 1)) {
1148 			dma_clear_pte(pte);
1149 			domain_flush_cache(domain, pte, sizeof(*pte));
1150 			free_pgtable_page(level_pte);
1151 		}
1152 next:
1153 		pfn += level_size(level);
1154 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1155 }
1156 
1157 /*
1158  * clear last level (leaf) ptes and free page table pages below the
1159  * level we wish to keep intact.
1160  */
1161 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1162 				   unsigned long start_pfn,
1163 				   unsigned long last_pfn,
1164 				   int retain_level)
1165 {
1166 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1167 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1168 	BUG_ON(start_pfn > last_pfn);
1169 
1170 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1171 
1172 	/* We don't need the lock here; nobody else touches the iova range */
1173 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1174 			   domain->pgd, 0, start_pfn, last_pfn);
1175 
1176 	/* free pgd */
1177 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 		free_pgtable_page(domain->pgd);
1179 		domain->pgd = NULL;
1180 	}
1181 }
1182 
1183 /* When a page at a given level is being unlinked from its parent, we don't
1184    need to *modify* it at all. All we need to do is make a list of all the
1185    pages which can be freed just as soon as we've flushed the IOTLB and we
1186    know the hardware page-walk will no longer touch them.
1187    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1188    be freed. */
1189 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1190 					    int level, struct dma_pte *pte,
1191 					    struct page *freelist)
1192 {
1193 	struct page *pg;
1194 
1195 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1196 	pg->freelist = freelist;
1197 	freelist = pg;
1198 
1199 	if (level == 1)
1200 		return freelist;
1201 
1202 	pte = page_address(pg);
1203 	do {
1204 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1205 			freelist = dma_pte_list_pagetables(domain, level - 1,
1206 							   pte, freelist);
1207 		pte++;
1208 	} while (!first_pte_in_page(pte));
1209 
1210 	return freelist;
1211 }
1212 
1213 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1214 					struct dma_pte *pte, unsigned long pfn,
1215 					unsigned long start_pfn,
1216 					unsigned long last_pfn,
1217 					struct page *freelist)
1218 {
1219 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1220 
1221 	pfn = max(start_pfn, pfn);
1222 	pte = &pte[pfn_level_offset(pfn, level)];
1223 
1224 	do {
1225 		unsigned long level_pfn;
1226 
1227 		if (!dma_pte_present(pte))
1228 			goto next;
1229 
1230 		level_pfn = pfn & level_mask(level);
1231 
1232 		/* If range covers entire pagetable, free it */
1233 		if (start_pfn <= level_pfn &&
1234 		    last_pfn >= level_pfn + level_size(level) - 1) {
1235 			/* These subordinate page tables are going away entirely. Don't
1236 			   bother to clear them; we're just going to *free* them. */
1237 			if (level > 1 && !dma_pte_superpage(pte))
1238 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1239 
1240 			dma_clear_pte(pte);
1241 			if (!first_pte)
1242 				first_pte = pte;
1243 			last_pte = pte;
1244 		} else if (level > 1) {
1245 			/* Recurse down into a level that isn't *entirely* obsolete */
1246 			freelist = dma_pte_clear_level(domain, level - 1,
1247 						       phys_to_virt(dma_pte_addr(pte)),
1248 						       level_pfn, start_pfn, last_pfn,
1249 						       freelist);
1250 		}
1251 next:
1252 		pfn += level_size(level);
1253 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1254 
1255 	if (first_pte)
1256 		domain_flush_cache(domain, first_pte,
1257 				   (void *)++last_pte - (void *)first_pte);
1258 
1259 	return freelist;
1260 }
1261 
1262 /* We can't just free the pages because the IOMMU may still be walking
1263    the page tables, and may have cached the intermediate levels. The
1264    pages can only be freed after the IOTLB flush has been done. */
1265 static struct page *domain_unmap(struct dmar_domain *domain,
1266 				 unsigned long start_pfn,
1267 				 unsigned long last_pfn,
1268 				 struct page *freelist)
1269 {
1270 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1271 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1272 	BUG_ON(start_pfn > last_pfn);
1273 
1274 	/* we don't need the lock here; nobody else touches the iova range */
1275 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1276 				       domain->pgd, 0, start_pfn, last_pfn,
1277 				       freelist);
1278 
1279 	/* free pgd */
1280 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1281 		struct page *pgd_page = virt_to_page(domain->pgd);
1282 		pgd_page->freelist = freelist;
1283 		freelist = pgd_page;
1284 
1285 		domain->pgd = NULL;
1286 	}
1287 
1288 	return freelist;
1289 }
1290 
1291 static void dma_free_pagelist(struct page *freelist)
1292 {
1293 	struct page *pg;
1294 
1295 	while ((pg = freelist)) {
1296 		freelist = pg->freelist;
1297 		free_pgtable_page(page_address(pg));
1298 	}
1299 }
1300 
1301 /* iommu handling */
1302 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1303 {
1304 	struct root_entry *root;
1305 	unsigned long flags;
1306 
1307 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1308 	if (!root) {
1309 		pr_err("Allocating root entry for %s failed\n",
1310 			iommu->name);
1311 		return -ENOMEM;
1312 	}
1313 
1314 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1315 
1316 	spin_lock_irqsave(&iommu->lock, flags);
1317 	iommu->root_entry = root;
1318 	spin_unlock_irqrestore(&iommu->lock, flags);
1319 
1320 	return 0;
1321 }
1322 
1323 static void iommu_set_root_entry(struct intel_iommu *iommu)
1324 {
1325 	u64 addr;
1326 	u32 sts;
1327 	unsigned long flag;
1328 
1329 	addr = virt_to_phys(iommu->root_entry);
1330 	if (sm_supported(iommu))
1331 		addr |= DMA_RTADDR_SMT;
1332 
1333 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1334 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1335 
1336 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1337 
1338 	/* Make sure the hardware completes it */
1339 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1340 		      readl, (sts & DMA_GSTS_RTPS), sts);
1341 
1342 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1343 }
1344 
1345 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1346 {
1347 	u32 val;
1348 	unsigned long flag;
1349 
1350 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1351 		return;
1352 
1353 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1355 
1356 	/* Make sure the hardware completes it */
1357 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1358 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1359 
1360 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1361 }
1362 
1363 /* return value determines if we need a write buffer flush */
1364 static void __iommu_flush_context(struct intel_iommu *iommu,
1365 				  u16 did, u16 source_id, u8 function_mask,
1366 				  u64 type)
1367 {
1368 	u64 val = 0;
1369 	unsigned long flag;
1370 
1371 	switch (type) {
1372 	case DMA_CCMD_GLOBAL_INVL:
1373 		val = DMA_CCMD_GLOBAL_INVL;
1374 		break;
1375 	case DMA_CCMD_DOMAIN_INVL:
1376 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1377 		break;
1378 	case DMA_CCMD_DEVICE_INVL:
1379 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1380 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1381 		break;
1382 	default:
1383 		BUG();
1384 	}
1385 	val |= DMA_CCMD_ICC;
1386 
1387 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1388 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1389 
1390 	/* Make sure the hardware completes it */
1391 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1392 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1393 
1394 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395 }
1396 
1397 /* return value determines if we need a write buffer flush */
1398 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1399 				u64 addr, unsigned int size_order, u64 type)
1400 {
1401 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1402 	u64 val = 0, val_iva = 0;
1403 	unsigned long flag;
1404 
1405 	switch (type) {
1406 	case DMA_TLB_GLOBAL_FLUSH:
1407 		/* global flush doesn't need to set IVA_REG */
1408 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1409 		break;
1410 	case DMA_TLB_DSI_FLUSH:
1411 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1412 		break;
1413 	case DMA_TLB_PSI_FLUSH:
1414 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1415 		/* IH bit is passed in as part of address */
1416 		val_iva = size_order | addr;
1417 		break;
1418 	default:
1419 		BUG();
1420 	}
1421 	/* Note: set drain read/write */
1422 #if 0
1423 	/*
1424 	 * This is probably to be super secure.. Looks like we can
1425 	 * ignore it without any impact.
1426 	 */
1427 	if (cap_read_drain(iommu->cap))
1428 		val |= DMA_TLB_READ_DRAIN;
1429 #endif
1430 	if (cap_write_drain(iommu->cap))
1431 		val |= DMA_TLB_WRITE_DRAIN;
1432 
1433 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1434 	/* Note: Only uses first TLB reg currently */
1435 	if (val_iva)
1436 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1437 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1438 
1439 	/* Make sure the hardware completes it */
1440 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1441 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1442 
1443 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1444 
1445 	/* check IOTLB invalidation granularity */
1446 	if (DMA_TLB_IAIG(val) == 0)
1447 		pr_err("Flush IOTLB failed\n");
1448 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1449 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1450 			(unsigned long long)DMA_TLB_IIRG(type),
1451 			(unsigned long long)DMA_TLB_IAIG(val));
1452 }
1453 
1454 static struct device_domain_info *
1455 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1456 			 u8 bus, u8 devfn)
1457 {
1458 	struct device_domain_info *info;
1459 
1460 	assert_spin_locked(&device_domain_lock);
1461 
1462 	if (!iommu->qi)
1463 		return NULL;
1464 
1465 	list_for_each_entry(info, &domain->devices, link)
1466 		if (info->iommu == iommu && info->bus == bus &&
1467 		    info->devfn == devfn) {
1468 			if (info->ats_supported && info->dev)
1469 				return info;
1470 			break;
1471 		}
1472 
1473 	return NULL;
1474 }
1475 
1476 static void domain_update_iotlb(struct dmar_domain *domain)
1477 {
1478 	struct device_domain_info *info;
1479 	bool has_iotlb_device = false;
1480 
1481 	assert_spin_locked(&device_domain_lock);
1482 
1483 	list_for_each_entry(info, &domain->devices, link)
1484 		if (info->ats_enabled) {
1485 			has_iotlb_device = true;
1486 			break;
1487 		}
1488 
1489 	if (!has_iotlb_device) {
1490 		struct subdev_domain_info *sinfo;
1491 
1492 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1493 			info = get_domain_info(sinfo->pdev);
1494 			if (info && info->ats_enabled) {
1495 				has_iotlb_device = true;
1496 				break;
1497 			}
1498 		}
1499 	}
1500 
1501 	domain->has_iotlb_device = has_iotlb_device;
1502 }
1503 
1504 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1505 {
1506 	struct pci_dev *pdev;
1507 
1508 	assert_spin_locked(&device_domain_lock);
1509 
1510 	if (!info || !dev_is_pci(info->dev))
1511 		return;
1512 
1513 	pdev = to_pci_dev(info->dev);
1514 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1515 	 * the PFSID to the invalidation desc of a VF so that IOMMU HW can gauge
1516 	 * the queue depth at PF level. If DIT is not set, the PFSID field is
1517 	 * treated as reserved and should be set to 0.
1518 	 */
1519 	if (!ecap_dit(info->iommu->ecap))
1520 		info->pfsid = 0;
1521 	else {
1522 		struct pci_dev *pf_pdev;
1523 
1524 		/* pdev will be returned if device is not a vf */
1525 		pf_pdev = pci_physfn(pdev);
1526 		info->pfsid = pci_dev_id(pf_pdev);
1527 	}
1528 
1529 #ifdef CONFIG_INTEL_IOMMU_SVM
1530 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1531 	   the device if you enable PASID support after ATS support is
1532 	   undefined. So always enable PASID support on devices which
1533 	   have it, even if we can't yet know if we're ever going to
1534 	   use it. */
1535 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1536 		info->pasid_enabled = 1;
1537 
1538 	if (info->pri_supported &&
1539 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1540 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1541 		info->pri_enabled = 1;
1542 #endif
1543 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1544 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1545 		info->ats_enabled = 1;
1546 		domain_update_iotlb(info->domain);
1547 		info->ats_qdep = pci_ats_queue_depth(pdev);
1548 	}
1549 }
1550 
1551 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1552 {
1553 	struct pci_dev *pdev;
1554 
1555 	assert_spin_locked(&device_domain_lock);
1556 
1557 	if (!dev_is_pci(info->dev))
1558 		return;
1559 
1560 	pdev = to_pci_dev(info->dev);
1561 
1562 	if (info->ats_enabled) {
1563 		pci_disable_ats(pdev);
1564 		info->ats_enabled = 0;
1565 		domain_update_iotlb(info->domain);
1566 	}
1567 #ifdef CONFIG_INTEL_IOMMU_SVM
1568 	if (info->pri_enabled) {
1569 		pci_disable_pri(pdev);
1570 		info->pri_enabled = 0;
1571 	}
1572 	if (info->pasid_enabled) {
1573 		pci_disable_pasid(pdev);
1574 		info->pasid_enabled = 0;
1575 	}
1576 #endif
1577 }
1578 
1579 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1580 				    u64 addr, unsigned int mask)
1581 {
1582 	u16 sid, qdep;
1583 
1584 	if (!info || !info->ats_enabled)
1585 		return;
1586 
1587 	sid = info->bus << 8 | info->devfn;
1588 	qdep = info->ats_qdep;
1589 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1590 			   qdep, addr, mask);
1591 }
1592 
1593 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1594 				  u64 addr, unsigned mask)
1595 {
1596 	unsigned long flags;
1597 	struct device_domain_info *info;
1598 	struct subdev_domain_info *sinfo;
1599 
1600 	if (!domain->has_iotlb_device)
1601 		return;
1602 
1603 	spin_lock_irqsave(&device_domain_lock, flags);
1604 	list_for_each_entry(info, &domain->devices, link)
1605 		__iommu_flush_dev_iotlb(info, addr, mask);
1606 
1607 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1608 		info = get_domain_info(sinfo->pdev);
1609 		__iommu_flush_dev_iotlb(info, addr, mask);
1610 	}
1611 	spin_unlock_irqrestore(&device_domain_lock, flags);
1612 }
1613 
1614 static void domain_flush_piotlb(struct intel_iommu *iommu,
1615 				struct dmar_domain *domain,
1616 				u64 addr, unsigned long npages, bool ih)
1617 {
1618 	u16 did = domain->iommu_did[iommu->seq_id];
1619 
1620 	if (domain->default_pasid)
1621 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1622 				addr, npages, ih);
1623 
1624 	if (!list_empty(&domain->devices))
1625 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1626 }
1627 
1628 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1629 				  struct dmar_domain *domain,
1630 				  unsigned long pfn, unsigned int pages,
1631 				  int ih, int map)
1632 {
1633 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1634 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1635 	u16 did = domain->iommu_did[iommu->seq_id];
1636 
1637 	BUG_ON(pages == 0);
1638 
1639 	if (ih)
1640 		ih = 1 << 6;
1641 
1642 	if (domain_use_first_level(domain)) {
1643 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1644 	} else {
1645 		/*
1646 		 * Fall back to a domain-selective flush if there is no PSI support or
1647 		 * the size is too big. PSI requires page size to be 2 ^ x,
1648 		 * and the base address is naturally aligned to the size.
1649 		 */
1650 		if (!cap_pgsel_inv(iommu->cap) ||
1651 		    mask > cap_max_amask_val(iommu->cap))
1652 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1653 							DMA_TLB_DSI_FLUSH);
1654 		else
1655 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1656 							DMA_TLB_PSI_FLUSH);
1657 	}
1658 
1659 	/*
1660 	 * In caching mode, changing a page from non-present to present requires
1661 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1662 	 */
1663 	if (!cap_caching_mode(iommu->cap) || !map)
1664 		iommu_flush_dev_iotlb(domain, addr, mask);
1665 }
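/*
 * Worked example for the PSI mask computation above (illustrative only):
 * flushing 300 pages gives mask = ilog2(roundup_pow_of_two(300)) = ilog2(512)
 * = 9, i.e. one page-selective invalidation covering 2^9 = 512 pages,
 * provided the hardware supports PSI and reports MAMV >= 9; otherwise the
 * code above falls back to a domain-selective flush.
 */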
1666 
1667 /* Notification for newly created mappings */
1668 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1669 					struct dmar_domain *domain,
1670 					unsigned long pfn, unsigned int pages)
1671 {
1672 	/*
1673 	 * It's a non-present to present mapping. Only flush if caching mode
1674 	 * is enabled and second-level translation is in use.
1675 	 */
1676 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1677 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1678 	else
1679 		iommu_flush_write_buffer(iommu);
1680 }
1681 
1682 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1683 {
1684 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1685 	int idx;
1686 
1687 	for_each_domain_iommu(idx, dmar_domain) {
1688 		struct intel_iommu *iommu = g_iommus[idx];
1689 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1690 
1691 		if (domain_use_first_level(dmar_domain))
1692 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1693 		else
1694 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1695 						 DMA_TLB_DSI_FLUSH);
1696 
1697 		if (!cap_caching_mode(iommu->cap))
1698 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1699 					      0, MAX_AGAW_PFN_WIDTH);
1700 	}
1701 }
1702 
1703 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1704 {
1705 	u32 pmen;
1706 	unsigned long flags;
1707 
1708 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1709 		return;
1710 
1711 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1712 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1713 	pmen &= ~DMA_PMEN_EPM;
1714 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1715 
1716 	/* wait for the protected region status bit to clear */
1717 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1718 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1719 
1720 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1721 }
1722 
1723 static void iommu_enable_translation(struct intel_iommu *iommu)
1724 {
1725 	u32 sts;
1726 	unsigned long flags;
1727 
1728 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1729 	iommu->gcmd |= DMA_GCMD_TE;
1730 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1731 
1732 	/* Make sure the hardware completes it */
1733 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1734 		      readl, (sts & DMA_GSTS_TES), sts);
1735 
1736 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1737 }
1738 
1739 static void iommu_disable_translation(struct intel_iommu *iommu)
1740 {
1741 	u32 sts;
1742 	unsigned long flag;
1743 
1744 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1745 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1746 		return;
1747 
1748 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1749 	iommu->gcmd &= ~DMA_GCMD_TE;
1750 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1751 
1752 	/* Make sure the hardware completes it */
1753 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1754 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1755 
1756 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1757 }
1758 
1759 static int iommu_init_domains(struct intel_iommu *iommu)
1760 {
1761 	u32 ndomains, nlongs;
1762 	size_t size;
1763 
1764 	ndomains = cap_ndoms(iommu->cap);
1765 	pr_debug("%s: Number of Domains supported <%d>\n",
1766 		 iommu->name, ndomains);
1767 	nlongs = BITS_TO_LONGS(ndomains);
1768 
1769 	spin_lock_init(&iommu->lock);
1770 
1771 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1772 	if (!iommu->domain_ids) {
1773 		pr_err("%s: Allocating domain id array failed\n",
1774 		       iommu->name);
1775 		return -ENOMEM;
1776 	}
1777 
1778 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1779 	iommu->domains = kzalloc(size, GFP_KERNEL);
1780 
1781 	if (iommu->domains) {
1782 		size = 256 * sizeof(struct dmar_domain *);
1783 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1784 	}
1785 
1786 	if (!iommu->domains || !iommu->domains[0]) {
1787 		pr_err("%s: Allocating domain array failed\n",
1788 		       iommu->name);
1789 		kfree(iommu->domain_ids);
1790 		kfree(iommu->domains);
1791 		iommu->domain_ids = NULL;
1792 		iommu->domains    = NULL;
1793 		return -ENOMEM;
1794 	}
1795 
1796 	/*
1797 	 * If Caching mode is set, then invalid translations are tagged
1798 	 * with domain-id 0, hence we need to pre-allocate it. We also
1799 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1800 	 * make sure it is not used for a real domain.
1801 	 */
1802 	set_bit(0, iommu->domain_ids);
1803 
1804 	/*
1805 	 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1806 	 * entry for first-level or pass-through translation modes should
1807 	 * be programmed with a domain id different from those used for
1808 	 * second-level or nested translation. We reserve a domain id for
1809 	 * this purpose.
1810 	 */
1811 	if (sm_supported(iommu))
1812 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1813 
1814 	return 0;
1815 }
1816 
1817 static void disable_dmar_iommu(struct intel_iommu *iommu)
1818 {
1819 	struct device_domain_info *info, *tmp;
1820 	unsigned long flags;
1821 
1822 	if (!iommu->domains || !iommu->domain_ids)
1823 		return;
1824 
1825 	spin_lock_irqsave(&device_domain_lock, flags);
1826 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1827 		if (info->iommu != iommu)
1828 			continue;
1829 
1830 		if (!info->dev || !info->domain)
1831 			continue;
1832 
1833 		__dmar_remove_one_dev_info(info);
1834 	}
1835 	spin_unlock_irqrestore(&device_domain_lock, flags);
1836 
1837 	if (iommu->gcmd & DMA_GCMD_TE)
1838 		iommu_disable_translation(iommu);
1839 }
1840 
1841 static void free_dmar_iommu(struct intel_iommu *iommu)
1842 {
1843 	if (iommu->domains && iommu->domain_ids) {
1844 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1845 		int i;
1846 
1847 		for (i = 0; i < elems; i++)
1848 			kfree(iommu->domains[i]);
1849 		kfree(iommu->domains);
1850 		kfree(iommu->domain_ids);
1851 		iommu->domains = NULL;
1852 		iommu->domain_ids = NULL;
1853 	}
1854 
1855 	g_iommus[iommu->seq_id] = NULL;
1856 
1857 	/* free context mapping */
1858 	free_context_table(iommu);
1859 
1860 #ifdef CONFIG_INTEL_IOMMU_SVM
1861 	if (pasid_supported(iommu)) {
1862 		if (ecap_prs(iommu->ecap))
1863 			intel_svm_finish_prq(iommu);
1864 	}
1865 	if (vccap_pasid(iommu->vccap))
1866 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1867 
1868 #endif
1869 }
1870 
1871 /*
1872  * Check and return whether first level is used by default for
1873  * DMA translation.
1874  */
1875 static bool first_level_by_default(void)
1876 {
1877 	return scalable_mode_support() && intel_cap_flts_sanity();
1878 }
1879 
1880 static struct dmar_domain *alloc_domain(int flags)
1881 {
1882 	struct dmar_domain *domain;
1883 
1884 	domain = alloc_domain_mem();
1885 	if (!domain)
1886 		return NULL;
1887 
1888 	memset(domain, 0, sizeof(*domain));
1889 	domain->nid = NUMA_NO_NODE;
1890 	domain->flags = flags;
1891 	if (first_level_by_default())
1892 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1893 	domain->has_iotlb_device = false;
1894 	INIT_LIST_HEAD(&domain->devices);
1895 	INIT_LIST_HEAD(&domain->subdevices);
1896 
1897 	return domain;
1898 }
1899 
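/*
 * Account an attachment of @domain to @iommu.  On the first attachment,
 * a free domain id is taken from the iommu's bitmap and the domain is
 * published in the iommu->domains array.
 */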
1900 /* Must be called with iommu->lock */
1901 static int domain_attach_iommu(struct dmar_domain *domain,
1902 			       struct intel_iommu *iommu)
1903 {
1904 	unsigned long ndomains;
1905 	int num;
1906 
1907 	assert_spin_locked(&device_domain_lock);
1908 	assert_spin_locked(&iommu->lock);
1909 
1910 	domain->iommu_refcnt[iommu->seq_id] += 1;
1911 	domain->iommu_count += 1;
1912 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1913 		ndomains = cap_ndoms(iommu->cap);
1914 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1915 
1916 		if (num >= ndomains) {
1917 			pr_err("%s: No free domain ids\n", iommu->name);
1918 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1919 			domain->iommu_count -= 1;
1920 			return -ENOSPC;
1921 		}
1922 
1923 		set_bit(num, iommu->domain_ids);
1924 		set_iommu_domain(iommu, num, domain);
1925 
1926 		domain->iommu_did[iommu->seq_id] = num;
1927 		domain->nid			 = iommu->node;
1928 
1929 		domain_update_iommu_cap(domain);
1930 	}
1931 
1932 	return 0;
1933 }
1934 
1935 static int domain_detach_iommu(struct dmar_domain *domain,
1936 			       struct intel_iommu *iommu)
1937 {
1938 	int num, count;
1939 
1940 	assert_spin_locked(&device_domain_lock);
1941 	assert_spin_locked(&iommu->lock);
1942 
1943 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1944 	count = --domain->iommu_count;
1945 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1946 		num = domain->iommu_did[iommu->seq_id];
1947 		clear_bit(num, iommu->domain_ids);
1948 		set_iommu_domain(iommu, num, NULL);
1949 
1950 		domain_update_iommu_cap(domain);
1951 		domain->iommu_did[iommu->seq_id] = 0;
1952 	}
1953 
1954 	return count;
1955 }
1956 
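/*
 * Round a guest address width up to the next AGAW step: a multiple of
 * 9 bits (one page-table level) above the 12-bit page offset, capped at
 * 64 bits.  For example, 39 and 48 are already valid AGAWs and are
 * returned unchanged, while a guest width of 40 bits is rounded up to 48.
 */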
1957 static inline int guestwidth_to_adjustwidth(int gaw)
1958 {
1959 	int agaw;
1960 	int r = (gaw - 12) % 9;
1961 
1962 	if (r == 0)
1963 		agaw = gaw;
1964 	else
1965 		agaw = gaw + 9 - r;
1966 	if (agaw > 64)
1967 		agaw = 64;
1968 	return agaw;
1969 }
1970 
1971 static void domain_exit(struct dmar_domain *domain)
1972 {
1974 	/* Remove associated devices and clear attached or cached domains */
1975 	domain_remove_dev_info(domain);
1976 
1977 	/* destroy iovas */
1978 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1979 		iommu_put_dma_cookie(&domain->domain);
1980 
1981 	if (domain->pgd) {
1982 		struct page *freelist;
1983 
1984 		freelist = domain_unmap(domain, 0,
1985 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1986 		dma_free_pagelist(freelist);
1987 	}
1988 
1989 	free_domain_mem(domain);
1990 }
1991 
1992 /*
1993  * Get the PASID directory size for scalable mode context entry.
1994  * Value of X in the PDTS field of a scalable mode context entry
1995  * indicates PASID directory with 2^(X + 7) entries.
1996  */
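/*
 * Example: with max_pde = 0x4000, find_first_bit() below returns 14, so
 * this helper returns 7, i.e. a PASID directory with 2^(7 + 7) = 16384
 * entries.
 */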
1997 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1998 {
1999 	int pds, max_pde;
2000 
2001 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2002 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2003 	if (pds < 7)
2004 		return 0;
2005 
2006 	return pds - 7;
2007 }
2008 
2009 /*
2010  * Set the RID_PASID field of a scalable mode context entry. The
2011  * IOMMU hardware will use the PASID value set in this field to
2012  * translate DMA requests without a PASID.
2013  */
2014 static inline void
2015 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2016 {
2017 	context->hi |= pasid & ((1 << 20) - 1);
2018 }
2019 
2020 /*
2021  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2022  * entry.
2023  */
2024 static inline void context_set_sm_dte(struct context_entry *context)
2025 {
2026 	context->lo |= (1 << 2);
2027 }
2028 
2029 /*
2030  * Set the PRE(Page Request Enable) field of a scalable mode context
2031  * entry.
2032  */
2033 static inline void context_set_sm_pre(struct context_entry *context)
2034 {
2035 	context->lo |= (1 << 4);
2036 }
2037 
2038 /* Convert value to context PASID directory size field coding. */
2039 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2040 
2041 static int domain_context_mapping_one(struct dmar_domain *domain,
2042 				      struct intel_iommu *iommu,
2043 				      struct pasid_table *table,
2044 				      u8 bus, u8 devfn)
2045 {
2046 	u16 did = domain->iommu_did[iommu->seq_id];
2047 	int translation = CONTEXT_TT_MULTI_LEVEL;
2048 	struct device_domain_info *info = NULL;
2049 	struct context_entry *context;
2050 	unsigned long flags;
2051 	int ret;
2052 
2053 	WARN_ON(did == 0);
2054 
2055 	if (hw_pass_through && domain_type_is_si(domain))
2056 		translation = CONTEXT_TT_PASS_THROUGH;
2057 
2058 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2059 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2060 
2061 	BUG_ON(!domain->pgd);
2062 
2063 	spin_lock_irqsave(&device_domain_lock, flags);
2064 	spin_lock(&iommu->lock);
2065 
2066 	ret = -ENOMEM;
2067 	context = iommu_context_addr(iommu, bus, devfn, 1);
2068 	if (!context)
2069 		goto out_unlock;
2070 
2071 	ret = 0;
2072 	if (context_present(context))
2073 		goto out_unlock;
2074 
2075 	/*
2076 	 * For kdump cases, old valid entries may still be cached because of
2077 	 * in-flight DMA against the copied page tables, and nothing ever
2078 	 * unmaps them, so we need an explicit cache flush for the
2079 	 * newly-mapped device. For kdump, by this point the device is
2080 	 * supposed to have finished reset at its driver probe stage, so no
2081 	 * in-flight DMA will exist, and we don't need to worry about it
2082 	 * hereafter.
2083 	 */
2084 	if (context_copied(context)) {
2085 		u16 did_old = context_domain_id(context);
2086 
2087 		if (did_old < cap_ndoms(iommu->cap)) {
2088 			iommu->flush.flush_context(iommu, did_old,
2089 						   (((u16)bus) << 8) | devfn,
2090 						   DMA_CCMD_MASK_NOBIT,
2091 						   DMA_CCMD_DEVICE_INVL);
2092 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2093 						 DMA_TLB_DSI_FLUSH);
2094 		}
2095 	}
2096 
2097 	context_clear_entry(context);
2098 
2099 	if (sm_supported(iommu)) {
2100 		unsigned long pds;
2101 
2102 		WARN_ON(!table);
2103 
2104 		/* Setup the PASID DIR pointer: */
2105 		pds = context_get_sm_pds(table);
2106 		context->lo = (u64)virt_to_phys(table->table) |
2107 				context_pdts(pds);
2108 
2109 		/* Setup the RID_PASID field: */
2110 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2111 
2112 		/*
2113 		 * Setup the Device-TLB enable bit and Page request
2114 		 * Enable bit:
2115 		 */
2116 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2117 		if (info && info->ats_supported)
2118 			context_set_sm_dte(context);
2119 		if (info && info->pri_supported)
2120 			context_set_sm_pre(context);
2121 	} else {
2122 		struct dma_pte *pgd = domain->pgd;
2123 		int agaw;
2124 
2125 		context_set_domain_id(context, did);
2126 
2127 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2128 			/*
2129 			 * Skip the top levels of the page tables for an IOMMU
2130 			 * with a smaller agaw than the default. Unnecessary for PT mode.
2131 			 */
2132 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2133 				ret = -ENOMEM;
2134 				pgd = phys_to_virt(dma_pte_addr(pgd));
2135 				if (!dma_pte_present(pgd))
2136 					goto out_unlock;
2137 			}
2138 
2139 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2140 			if (info && info->ats_supported)
2141 				translation = CONTEXT_TT_DEV_IOTLB;
2142 			else
2143 				translation = CONTEXT_TT_MULTI_LEVEL;
2144 
2145 			context_set_address_root(context, virt_to_phys(pgd));
2146 			context_set_address_width(context, agaw);
2147 		} else {
2148 			/*
2149 			 * In pass-through mode, AW must be programmed to
2150 			 * indicate the largest AGAW value supported by the
2151 			 * hardware; the ASR field is ignored by hardware.
2152 			 */
2153 			context_set_address_width(context, iommu->msagaw);
2154 		}
2155 
2156 		context_set_translation_type(context, translation);
2157 	}
2158 
2159 	context_set_fault_enable(context);
2160 	context_set_present(context);
2161 	if (!ecap_coherent(iommu->ecap))
2162 		clflush_cache_range(context, sizeof(*context));
2163 
2164 	/*
2165 	 * It's a non-present to present mapping. If the hardware doesn't
2166 	 * cache non-present entries, we only need to flush the write-buffer.
2167 	 * If it _does_ cache non-present entries, then it does so in the
2168 	 * special domain #0, which we have to flush:
2169 	 */
2170 	if (cap_caching_mode(iommu->cap)) {
2171 		iommu->flush.flush_context(iommu, 0,
2172 					   (((u16)bus) << 8) | devfn,
2173 					   DMA_CCMD_MASK_NOBIT,
2174 					   DMA_CCMD_DEVICE_INVL);
2175 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2176 	} else {
2177 		iommu_flush_write_buffer(iommu);
2178 	}
2179 	iommu_enable_dev_iotlb(info);
2180 
2181 	ret = 0;
2182 
2183 out_unlock:
2184 	spin_unlock(&iommu->lock);
2185 	spin_unlock_irqrestore(&device_domain_lock, flags);
2186 
2187 	return ret;
2188 }
2189 
2190 struct domain_context_mapping_data {
2191 	struct dmar_domain *domain;
2192 	struct intel_iommu *iommu;
2193 	struct pasid_table *table;
2194 };
2195 
2196 static int domain_context_mapping_cb(struct pci_dev *pdev,
2197 				     u16 alias, void *opaque)
2198 {
2199 	struct domain_context_mapping_data *data = opaque;
2200 
2201 	return domain_context_mapping_one(data->domain, data->iommu,
2202 					  data->table, PCI_BUS_NUM(alias),
2203 					  alias & 0xff);
2204 }
2205 
2206 static int
2207 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2208 {
2209 	struct domain_context_mapping_data data;
2210 	struct pasid_table *table;
2211 	struct intel_iommu *iommu;
2212 	u8 bus, devfn;
2213 
2214 	iommu = device_to_iommu(dev, &bus, &devfn);
2215 	if (!iommu)
2216 		return -ENODEV;
2217 
2218 	table = intel_pasid_get_table(dev);
2219 
2220 	if (!dev_is_pci(dev))
2221 		return domain_context_mapping_one(domain, iommu, table,
2222 						  bus, devfn);
2223 
2224 	data.domain = domain;
2225 	data.iommu = iommu;
2226 	data.table = table;
2227 
2228 	return pci_for_each_dma_alias(to_pci_dev(dev),
2229 				      &domain_context_mapping_cb, &data);
2230 }
2231 
2232 static int domain_context_mapped_cb(struct pci_dev *pdev,
2233 				    u16 alias, void *opaque)
2234 {
2235 	struct intel_iommu *iommu = opaque;
2236 
2237 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2238 }
2239 
2240 static int domain_context_mapped(struct device *dev)
2241 {
2242 	struct intel_iommu *iommu;
2243 	u8 bus, devfn;
2244 
2245 	iommu = device_to_iommu(dev, &bus, &devfn);
2246 	if (!iommu)
2247 		return -ENODEV;
2248 
2249 	if (!dev_is_pci(dev))
2250 		return device_context_mapped(iommu, bus, devfn);
2251 
2252 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2253 				       domain_context_mapped_cb, iommu);
2254 }
2255 
2256 /* Returns a number of VTD pages, but aligned to MM page size */
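/*
 * For example, with 4KiB MM pages, host_addr = 0x1234 and size = 0x100
 * stay within one page, so this returns 1; if the range crossed a page
 * boundary, the PAGE_ALIGN() below would round it up to 2.
 */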
2257 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2258 					    size_t size)
2259 {
2260 	host_addr &= ~PAGE_MASK;
2261 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2262 }
2263 
2264 /* Return largest possible superpage level for a given mapping */
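/*
 * For example, with one level of superpage support, a 2MiB-aligned
 * iov_pfn and phy_pfn (both multiples of 512 4KiB pages) and at least
 * 512 pages to map yield level 2, i.e. a 2MiB superpage.
 */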
2265 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2266 					  unsigned long iov_pfn,
2267 					  unsigned long phy_pfn,
2268 					  unsigned long pages)
2269 {
2270 	int support, level = 1;
2271 	unsigned long pfnmerge;
2272 
2273 	support = domain->iommu_superpage;
2274 
2275 	/* To use a large page, the virtual *and* physical addresses
2276 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2277 	   of them will mean we have to use smaller pages. So just
2278 	   merge them and check both at once. */
2279 	pfnmerge = iov_pfn | phy_pfn;
2280 
2281 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2282 		pages >>= VTD_STRIDE_SHIFT;
2283 		if (!pages)
2284 			break;
2285 		pfnmerge >>= VTD_STRIDE_SHIFT;
2286 		level++;
2287 		support--;
2288 	}
2289 	return level;
2290 }
2291 
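/*
 * Install page table entries mapping nr_pages VT-d pages starting at
 * iov_pfn to the physical range starting at phys_pfn, using superpages
 * whenever alignment, remaining size and hardware support allow.  Cache
 * flushing of the new leaf PTEs is left to the iotlb_sync_map() callback.
 */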
2292 static int
2293 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2294 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2295 {
2296 	unsigned int largepage_lvl = 0;
2297 	unsigned long lvl_pages = 0;
2298 	struct dma_pte *pte = NULL;
2299 	phys_addr_t pteval;
2300 	u64 attr;
2301 
2302 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2303 
2304 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2305 		return -EINVAL;
2306 
2307 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2308 	if (domain_use_first_level(domain)) {
2309 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2310 
2311 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2312 			attr |= DMA_FL_PTE_ACCESS;
2313 			if (prot & DMA_PTE_WRITE)
2314 				attr |= DMA_FL_PTE_DIRTY;
2315 		}
2316 	}
2317 
2318 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2319 
2320 	while (nr_pages > 0) {
2321 		uint64_t tmp;
2322 
2323 		if (!pte) {
2324 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2325 					phys_pfn, nr_pages);
2326 
2327 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328 			if (!pte)
2329 				return -ENOMEM;
2330 			/* It is a large page */
2331 			if (largepage_lvl > 1) {
2332 				unsigned long nr_superpages, end_pfn;
2333 
2334 				pteval |= DMA_PTE_LARGE_PAGE;
2335 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336 
2337 				nr_superpages = nr_pages / lvl_pages;
2338 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339 
2340 				/*
2341 				 * Ensure that old small page tables are
2342 				 * removed to make room for superpage(s).
2343 				 * We're adding new large pages, so make sure
2344 				 * we don't remove their parent tables.
2345 				 */
2346 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347 						       largepage_lvl + 1);
2348 			} else {
2349 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350 			}
2351 
2352 		}
2353 		/* We don't need a lock here; nobody else
2354 		 * touches this IOVA range.
2355 		 */
2356 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357 		if (tmp) {
2358 			static int dumps = 5;
2359 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360 				iov_pfn, tmp, (unsigned long long)pteval);
2361 			if (dumps) {
2362 				dumps--;
2363 				debug_dma_dump_mappings(NULL);
2364 			}
2365 			WARN_ON(1);
2366 		}
2367 
2368 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369 
2370 		BUG_ON(nr_pages < lvl_pages);
2371 
2372 		nr_pages -= lvl_pages;
2373 		iov_pfn += lvl_pages;
2374 		phys_pfn += lvl_pages;
2375 		pteval += lvl_pages * VTD_PAGE_SIZE;
2376 
2377 		/* If the next PTE would be the first in a new page, then we
2378 		 * need to flush the cache on the entries we've just written.
2379 		 * And then we'll need to recalculate 'pte', so clear it and
2380 		 * let it get set again in the if (!pte) block above.
2381 		 *
2382 		 * If we're done (!nr_pages) we need to flush the cache too.
2383 		 *
2384 		 * Also if we've been setting superpages, we may need to
2385 		 * recalculate 'pte' and switch back to smaller pages for the
2386 		 * end of the mapping, if the trailing size is not enough to
2387 		 * use another superpage (i.e. nr_pages < lvl_pages).
2388 		 *
2389 		 * We leave clflush for the leaf pte changes to iotlb_sync_map()
2390 		 * callback.
2391 		 */
2392 		pte++;
2393 		if (!nr_pages || first_pte_in_page(pte) ||
2394 		    (largepage_lvl > 1 && nr_pages < lvl_pages))
2395 			pte = NULL;
2396 	}
2397 
2398 	return 0;
2399 }
2400 
2401 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2402 {
2403 	unsigned long flags;
2404 	struct context_entry *context;
2405 	u16 did_old;
2406 
2407 	if (!iommu)
2408 		return;
2409 
2410 	spin_lock_irqsave(&iommu->lock, flags);
2411 	context = iommu_context_addr(iommu, bus, devfn, 0);
2412 	if (!context) {
2413 		spin_unlock_irqrestore(&iommu->lock, flags);
2414 		return;
2415 	}
2416 	did_old = context_domain_id(context);
2417 	context_clear_entry(context);
2418 	__iommu_flush_cache(iommu, context, sizeof(*context));
2419 	spin_unlock_irqrestore(&iommu->lock, flags);
2420 	iommu->flush.flush_context(iommu,
2421 				   did_old,
2422 				   (((u16)bus) << 8) | devfn,
2423 				   DMA_CCMD_MASK_NOBIT,
2424 				   DMA_CCMD_DEVICE_INVL);
2425 	iommu->flush.flush_iotlb(iommu,
2426 				 did_old,
2427 				 0,
2428 				 0,
2429 				 DMA_TLB_DSI_FLUSH);
2430 }
2431 
2432 static inline void unlink_domain_info(struct device_domain_info *info)
2433 {
2434 	assert_spin_locked(&device_domain_lock);
2435 	list_del(&info->link);
2436 	list_del(&info->global);
2437 	if (info->dev)
2438 		dev_iommu_priv_set(info->dev, NULL);
2439 }
2440 
2441 static void domain_remove_dev_info(struct dmar_domain *domain)
2442 {
2443 	struct device_domain_info *info, *tmp;
2444 	unsigned long flags;
2445 
2446 	spin_lock_irqsave(&device_domain_lock, flags);
2447 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2448 		__dmar_remove_one_dev_info(info);
2449 	spin_unlock_irqrestore(&device_domain_lock, flags);
2450 }
2451 
2452 struct dmar_domain *find_domain(struct device *dev)
2453 {
2454 	struct device_domain_info *info;
2455 
2456 	if (unlikely(!dev || !dev->iommu))
2457 		return NULL;
2458 
2459 	if (unlikely(attach_deferred(dev)))
2460 		return NULL;
2461 
2462 	/* No lock here, assumes no domain exit in normal case */
2463 	info = get_domain_info(dev);
2464 	if (likely(info))
2465 		return info->domain;
2466 
2467 	return NULL;
2468 }
2469 
2470 static inline struct device_domain_info *
2471 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2472 {
2473 	struct device_domain_info *info;
2474 
2475 	list_for_each_entry(info, &device_domain_list, global)
2476 		if (info->segment == segment && info->bus == bus &&
2477 		    info->devfn == devfn)
2478 			return info;
2479 
2480 	return NULL;
2481 }
2482 
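/*
 * Set up a first-level (scalable mode) PASID table entry for @pasid that
 * points at @domain's page table, skipping any top levels the IOMMU
 * cannot walk.  Only 4- and 5-level page tables are valid here.
 */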
2483 static int domain_setup_first_level(struct intel_iommu *iommu,
2484 				    struct dmar_domain *domain,
2485 				    struct device *dev,
2486 				    u32 pasid)
2487 {
2488 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2489 	struct dma_pte *pgd = domain->pgd;
2490 	int agaw, level;
2491 
2492 	/*
2493 	 * Skip the top levels of the page tables for an IOMMU with a
2494 	 * smaller agaw than the default. Unnecessary for PT mode.
2495 	 */
2496 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2497 		pgd = phys_to_virt(dma_pte_addr(pgd));
2498 		if (!dma_pte_present(pgd))
2499 			return -ENOMEM;
2500 	}
2501 
2502 	level = agaw_to_level(agaw);
2503 	if (level != 4 && level != 5)
2504 		return -EINVAL;
2505 
2506 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2507 
2508 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2509 					     domain->iommu_did[iommu->seq_id],
2510 					     flags);
2511 }
2512 
2513 static bool dev_is_real_dma_subdevice(struct device *dev)
2514 {
2515 	return dev && dev_is_pci(dev) &&
2516 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2517 }
2518 
2519 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2520 						    int bus, int devfn,
2521 						    struct device *dev,
2522 						    struct dmar_domain *domain)
2523 {
2524 	struct dmar_domain *found = NULL;
2525 	struct device_domain_info *info;
2526 	unsigned long flags;
2527 	int ret;
2528 
2529 	info = alloc_devinfo_mem();
2530 	if (!info)
2531 		return NULL;
2532 
2533 	if (!dev_is_real_dma_subdevice(dev)) {
2534 		info->bus = bus;
2535 		info->devfn = devfn;
2536 		info->segment = iommu->segment;
2537 	} else {
2538 		struct pci_dev *pdev = to_pci_dev(dev);
2539 
2540 		info->bus = pdev->bus->number;
2541 		info->devfn = pdev->devfn;
2542 		info->segment = pci_domain_nr(pdev->bus);
2543 	}
2544 
2545 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2546 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2547 	info->ats_qdep = 0;
2548 	info->dev = dev;
2549 	info->domain = domain;
2550 	info->iommu = iommu;
2551 	info->pasid_table = NULL;
2552 	info->auxd_enabled = 0;
2553 	INIT_LIST_HEAD(&info->subdevices);
2554 
2555 	if (dev && dev_is_pci(dev)) {
2556 		struct pci_dev *pdev = to_pci_dev(info->dev);
2557 
2558 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2559 		    pci_ats_supported(pdev) &&
2560 		    dmar_find_matched_atsr_unit(pdev))
2561 			info->ats_supported = 1;
2562 
2563 		if (sm_supported(iommu)) {
2564 			if (pasid_supported(iommu)) {
2565 				int features = pci_pasid_features(pdev);
2566 				if (features >= 0)
2567 					info->pasid_supported = features | 1;
2568 			}
2569 
2570 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2571 			    pci_pri_supported(pdev))
2572 				info->pri_supported = 1;
2573 		}
2574 	}
2575 
2576 	spin_lock_irqsave(&device_domain_lock, flags);
2577 	if (dev)
2578 		found = find_domain(dev);
2579 
2580 	if (!found) {
2581 		struct device_domain_info *info2;
2582 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2583 						       info->devfn);
2584 		if (info2) {
2585 			found      = info2->domain;
2586 			info2->dev = dev;
2587 		}
2588 	}
2589 
2590 	if (found) {
2591 		spin_unlock_irqrestore(&device_domain_lock, flags);
2592 		free_devinfo_mem(info);
2593 		/* Caller must free the original domain */
2594 		return found;
2595 	}
2596 
2597 	spin_lock(&iommu->lock);
2598 	ret = domain_attach_iommu(domain, iommu);
2599 	spin_unlock(&iommu->lock);
2600 
2601 	if (ret) {
2602 		spin_unlock_irqrestore(&device_domain_lock, flags);
2603 		free_devinfo_mem(info);
2604 		return NULL;
2605 	}
2606 
2607 	list_add(&info->link, &domain->devices);
2608 	list_add(&info->global, &device_domain_list);
2609 	if (dev)
2610 		dev_iommu_priv_set(dev, info);
2611 	spin_unlock_irqrestore(&device_domain_lock, flags);
2612 
2613 	/* PASID table is mandatory for a PCI device in scalable mode. */
2614 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2615 		ret = intel_pasid_alloc_table(dev);
2616 		if (ret) {
2617 			dev_err(dev, "PASID table allocation failed\n");
2618 			dmar_remove_one_dev_info(dev);
2619 			return NULL;
2620 		}
2621 
2622 		/* Setup the PASID entry for requests without PASID: */
2623 		spin_lock_irqsave(&iommu->lock, flags);
2624 		if (hw_pass_through && domain_type_is_si(domain))
2625 			ret = intel_pasid_setup_pass_through(iommu, domain,
2626 					dev, PASID_RID2PASID);
2627 		else if (domain_use_first_level(domain))
2628 			ret = domain_setup_first_level(iommu, domain, dev,
2629 					PASID_RID2PASID);
2630 		else
2631 			ret = intel_pasid_setup_second_level(iommu, domain,
2632 					dev, PASID_RID2PASID);
2633 		spin_unlock_irqrestore(&iommu->lock, flags);
2634 		if (ret) {
2635 			dev_err(dev, "Setup RID2PASID failed\n");
2636 			dmar_remove_one_dev_info(dev);
2637 			return NULL;
2638 		}
2639 	}
2640 
2641 	if (dev && domain_context_mapping(domain, dev)) {
2642 		dev_err(dev, "Domain context map failed\n");
2643 		dmar_remove_one_dev_info(dev);
2644 		return NULL;
2645 	}
2646 
2647 	return domain;
2648 }
2649 
2650 static int iommu_domain_identity_map(struct dmar_domain *domain,
2651 				     unsigned long first_vpfn,
2652 				     unsigned long last_vpfn)
2653 {
2654 	/*
2655 	 * RMRR range might have overlap with physical memory range,
2656 	 * clear it first
2657 	 */
2658 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2659 
2660 	return __domain_mapping(domain, first_vpfn,
2661 				first_vpfn, last_vpfn - first_vpfn + 1,
2662 				DMA_PTE_READ|DMA_PTE_WRITE);
2663 }
2664 
2665 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2666 
2667 static int __init si_domain_init(int hw)
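/*
 * Build the static identity (si) domain: all usable physical memory and
 * all RMRR ranges are mapped 1:1 (IOVA == physical address).  With
 * hardware pass-through (hw != 0) no page table entries are needed, so
 * only the domain itself is set up.
 */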
2668 {
2669 	struct dmar_rmrr_unit *rmrr;
2670 	struct device *dev;
2671 	int i, nid, ret;
2672 
2673 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2674 	if (!si_domain)
2675 		return -EFAULT;
2676 
2677 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2678 		domain_exit(si_domain);
2679 		return -EFAULT;
2680 	}
2681 
2682 	if (hw)
2683 		return 0;
2684 
2685 	for_each_online_node(nid) {
2686 		unsigned long start_pfn, end_pfn;
2687 		int i;
2688 
2689 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2690 			ret = iommu_domain_identity_map(si_domain,
2691 					mm_to_dma_pfn(start_pfn),
2692 					mm_to_dma_pfn(end_pfn));
2693 			if (ret)
2694 				return ret;
2695 		}
2696 	}
2697 
2698 	/*
2699 	 * Identity map the RMRRs so that devices with RMRRs can also use
2700 	 * the si_domain.
2701 	 */
2702 	for_each_rmrr_units(rmrr) {
2703 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2704 					  i, dev) {
2705 			unsigned long long start = rmrr->base_address;
2706 			unsigned long long end = rmrr->end_address;
2707 
2708 			if (WARN_ON(end < start ||
2709 				    end >> agaw_to_width(si_domain->agaw)))
2710 				continue;
2711 
2712 			ret = iommu_domain_identity_map(si_domain,
2713 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2714 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2715 			if (ret)
2716 				return ret;
2717 		}
2718 	}
2719 
2720 	return 0;
2721 }
2722 
2723 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2724 {
2725 	struct dmar_domain *ndomain;
2726 	struct intel_iommu *iommu;
2727 	u8 bus, devfn;
2728 
2729 	iommu = device_to_iommu(dev, &bus, &devfn);
2730 	if (!iommu)
2731 		return -ENODEV;
2732 
2733 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2734 	if (ndomain != domain)
2735 		return -EBUSY;
2736 
2737 	return 0;
2738 }
2739 
2740 static bool device_has_rmrr(struct device *dev)
2741 {
2742 	struct dmar_rmrr_unit *rmrr;
2743 	struct device *tmp;
2744 	int i;
2745 
2746 	rcu_read_lock();
2747 	for_each_rmrr_units(rmrr) {
2748 		/*
2749 		 * Return TRUE if this RMRR contains the device that
2750 		 * is passed in.
2751 		 */
2752 		for_each_active_dev_scope(rmrr->devices,
2753 					  rmrr->devices_cnt, i, tmp)
2754 			if (tmp == dev ||
2755 			    is_downstream_to_pci_bridge(dev, tmp)) {
2756 				rcu_read_unlock();
2757 				return true;
2758 			}
2759 	}
2760 	rcu_read_unlock();
2761 	return false;
2762 }
2763 
2764 /**
2765  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2766  * is relaxable (ie. is allowed to be not enforced under some conditions)
2767  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2768  *
2769  * We assume that PCI USB devices with RMRRs have them largely
2770  * for historical reasons and that the RMRR space is not actively used post
2771  * boot.  This exclusion may change if vendors begin to abuse it.
2772  *
2773  * The same exception is made for graphics devices, with the requirement that
2774  * any use of the RMRR regions will be torn down before assigning the device
2775  * to a guest.
2776  *
2777  * Return: true if the RMRR is relaxable, false otherwise
2778  */
2779 static bool device_rmrr_is_relaxable(struct device *dev)
2780 {
2781 	struct pci_dev *pdev;
2782 
2783 	if (!dev_is_pci(dev))
2784 		return false;
2785 
2786 	pdev = to_pci_dev(dev);
2787 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2788 		return true;
2789 	else
2790 		return false;
2791 }
2792 
2793 /*
2794  * There are a couple cases where we need to restrict the functionality of
2795  * devices associated with RMRRs.  The first is when evaluating a device for
2796  * identity mapping because problems exist when devices are moved in and out
2797  * of domains and their respective RMRR information is lost.  This means that
2798  * a device with associated RMRRs will never be in a "passthrough" domain.
2799  * The second is use of the device through the IOMMU API.  This interface
2800  * expects to have full control of the IOVA space for the device.  We cannot
2801  * satisfy both the requirement that RMRR access is maintained and have an
2802  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2803  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2804  * We therefore prevent devices associated with an RMRR from participating in
2805  * the IOMMU API, which eliminates them from device assignment.
2806  *
2807  * In both cases, devices which have relaxable RMRRs are not concerned by this
2808  * restriction. See device_rmrr_is_relaxable comment.
2809  */
2810 static bool device_is_rmrr_locked(struct device *dev)
2811 {
2812 	if (!device_has_rmrr(dev))
2813 		return false;
2814 
2815 	if (device_rmrr_is_relaxable(dev))
2816 		return false;
2817 
2818 	return true;
2819 }
2820 
2821 /*
2822  * Return the required default domain type for a specific device.
2823  *
2824  * @dev: the device in query
2826  *
2827  * Returns:
2828  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2829  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2830  *  - 0: both identity and dynamic domains work for this device
2831  */
2832 static int device_def_domain_type(struct device *dev)
2833 {
2834 	if (dev_is_pci(dev)) {
2835 		struct pci_dev *pdev = to_pci_dev(dev);
2836 
2837 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2838 			return IOMMU_DOMAIN_IDENTITY;
2839 
2840 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2841 			return IOMMU_DOMAIN_IDENTITY;
2842 	}
2843 
2844 	return 0;
2845 }
2846 
2847 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2848 {
2849 	/*
2850 	 * Start from a sane IOMMU hardware state.
2851 	 * If queued invalidation was already initialized by us
2852 	 * (for example, while enabling interrupt remapping), then
2853 	 * things are already rolling from a sane state.
2854 	 */
2855 	if (!iommu->qi) {
2856 		/*
2857 		 * Clear any previous faults.
2858 		 */
2859 		dmar_fault(-1, iommu);
2860 		/*
2861 		 * Disable queued invalidation if supported and already enabled
2862 		 * before OS handover.
2863 		 */
2864 		dmar_disable_qi(iommu);
2865 	}
2866 
2867 	if (dmar_enable_qi(iommu)) {
2868 		/*
2869 		 * Queued invalidation is not enabled; use register-based invalidation
2870 		 */
2871 		iommu->flush.flush_context = __iommu_flush_context;
2872 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2873 		pr_info("%s: Using Register based invalidation\n",
2874 			iommu->name);
2875 	} else {
2876 		iommu->flush.flush_context = qi_flush_context;
2877 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2878 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2879 	}
2880 }
2881 
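/*
 * Copy the context entries of one bus (two context tables per bus in
 * extended mode) from the old kernel's tables into newly allocated
 * pages, reserving the domain ids found there and marking each copied
 * entry so that later context programming can detect it.
 */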
2882 static int copy_context_table(struct intel_iommu *iommu,
2883 			      struct root_entry *old_re,
2884 			      struct context_entry **tbl,
2885 			      int bus, bool ext)
2886 {
2887 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2888 	struct context_entry *new_ce = NULL, ce;
2889 	struct context_entry *old_ce = NULL;
2890 	struct root_entry re;
2891 	phys_addr_t old_ce_phys;
2892 
2893 	tbl_idx = ext ? bus * 2 : bus;
2894 	memcpy(&re, old_re, sizeof(re));
2895 
2896 	for (devfn = 0; devfn < 256; devfn++) {
2897 		/* First calculate the correct index */
2898 		idx = (ext ? devfn * 2 : devfn) % 256;
2899 
2900 		if (idx == 0) {
2901 			/* First save what we may have and clean up */
2902 			if (new_ce) {
2903 				tbl[tbl_idx] = new_ce;
2904 				__iommu_flush_cache(iommu, new_ce,
2905 						    VTD_PAGE_SIZE);
2906 				pos = 1;
2907 			}
2908 
2909 			if (old_ce)
2910 				memunmap(old_ce);
2911 
2912 			ret = 0;
2913 			if (devfn < 0x80)
2914 				old_ce_phys = root_entry_lctp(&re);
2915 			else
2916 				old_ce_phys = root_entry_uctp(&re);
2917 
2918 			if (!old_ce_phys) {
2919 				if (ext && devfn == 0) {
2920 					/* No LCTP, try UCTP */
2921 					devfn = 0x7f;
2922 					continue;
2923 				} else {
2924 					goto out;
2925 				}
2926 			}
2927 
2928 			ret = -ENOMEM;
2929 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2930 					MEMREMAP_WB);
2931 			if (!old_ce)
2932 				goto out;
2933 
2934 			new_ce = alloc_pgtable_page(iommu->node);
2935 			if (!new_ce)
2936 				goto out_unmap;
2937 
2938 			ret = 0;
2939 		}
2940 
2941 		/* Now copy the context entry */
2942 		memcpy(&ce, old_ce + idx, sizeof(ce));
2943 
2944 		if (!__context_present(&ce))
2945 			continue;
2946 
2947 		did = context_domain_id(&ce);
2948 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2949 			set_bit(did, iommu->domain_ids);
2950 
2951 		/*
2952 		 * We need a marker for copied context entries. This
2953 		 * marker needs to work for the old format as well as
2954 		 * for extended context entries.
2955 		 *
2956 		 * Bit 67 of the context entry is used. In the old
2957 		 * format this bit is available to software; in the
2958 		 * extended format it is the PGE bit, but PGE is ignored
2959 		 * by HW if PASIDs are disabled (and thus still
2960 		 * available).
2961 		 *
2962 		 * So disable PASIDs first and then mark the entry
2963 		 * copied. This means that we don't copy PASID
2964 		 * translations from the old kernel, but this is fine as
2965 		 * faults there are not fatal.
2966 		 */
2967 		context_clear_pasid_enable(&ce);
2968 		context_set_copied(&ce);
2969 
2970 		new_ce[idx] = ce;
2971 	}
2972 
2973 	tbl[tbl_idx + pos] = new_ce;
2974 
2975 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2976 
2977 out_unmap:
2978 	memunmap(old_ce);
2979 
2980 out:
2981 	return ret;
2982 }
2983 
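/*
 * Called when translation is found pre-enabled (typically in a kdump
 * kernel): read the old root table address from DMAR_RTADDR_REG, bail
 * out if the root table format would have to change, then copy every
 * per-bus context table and hook the copies into the new root table.
 */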
2984 static int copy_translation_tables(struct intel_iommu *iommu)
2985 {
2986 	struct context_entry **ctxt_tbls;
2987 	struct root_entry *old_rt;
2988 	phys_addr_t old_rt_phys;
2989 	int ctxt_table_entries;
2990 	unsigned long flags;
2991 	u64 rtaddr_reg;
2992 	int bus, ret;
2993 	bool new_ext, ext;
2994 
2995 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2996 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2997 	new_ext    = !!ecap_ecs(iommu->ecap);
2998 
2999 	/*
3000 	 * The RTT bit can only be changed when translation is disabled,
3001 	 * but disabling translation would open a window for data
3002 	 * corruption. So bail out and don't copy anything if we would
3003 	 * have to change the bit.
3004 	 */
3005 	if (new_ext != ext)
3006 		return -EINVAL;
3007 
3008 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3009 	if (!old_rt_phys)
3010 		return -EINVAL;
3011 
3012 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3013 	if (!old_rt)
3014 		return -ENOMEM;
3015 
3016 	/* This is too big for the stack - allocate it from slab */
3017 	ctxt_table_entries = ext ? 512 : 256;
3018 	ret = -ENOMEM;
3019 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3020 	if (!ctxt_tbls)
3021 		goto out_unmap;
3022 
3023 	for (bus = 0; bus < 256; bus++) {
3024 		ret = copy_context_table(iommu, &old_rt[bus],
3025 					 ctxt_tbls, bus, ext);
3026 		if (ret) {
3027 			pr_err("%s: Failed to copy context table for bus %d\n",
3028 				iommu->name, bus);
3029 			continue;
3030 		}
3031 	}
3032 
3033 	spin_lock_irqsave(&iommu->lock, flags);
3034 
3035 	/* Context tables are copied, now write them to the root_entry table */
3036 	for (bus = 0; bus < 256; bus++) {
3037 		int idx = ext ? bus * 2 : bus;
3038 		u64 val;
3039 
3040 		if (ctxt_tbls[idx]) {
3041 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3042 			iommu->root_entry[bus].lo = val;
3043 		}
3044 
3045 		if (!ext || !ctxt_tbls[idx + 1])
3046 			continue;
3047 
3048 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3049 		iommu->root_entry[bus].hi = val;
3050 	}
3051 
3052 	spin_unlock_irqrestore(&iommu->lock, flags);
3053 
3054 	kfree(ctxt_tbls);
3055 
3056 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3057 
3058 	ret = 0;
3059 
3060 out_unmap:
3061 	memunmap(old_rt);
3062 
3063 	return ret;
3064 }
3065 
3066 #ifdef CONFIG_INTEL_IOMMU_SVM
3067 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3068 {
3069 	struct intel_iommu *iommu = data;
3070 	ioasid_t ioasid;
3071 
3072 	if (!iommu)
3073 		return INVALID_IOASID;
3074 	/*
3075 	 * The VT-d virtual command interface always uses the full 20-bit
3076 	 * PASID range. The host can partition the guest PASID range based
3077 	 * on policy, but that is out of the guest's control.
3078 	 */
3079 	if (min < PASID_MIN || max > intel_pasid_max_id)
3080 		return INVALID_IOASID;
3081 
3082 	if (vcmd_alloc_pasid(iommu, &ioasid))
3083 		return INVALID_IOASID;
3084 
3085 	return ioasid;
3086 }
3087 
3088 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3089 {
3090 	struct intel_iommu *iommu = data;
3091 
3092 	if (!iommu)
3093 		return;
3094 	/*
3095 	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3096 	 * We can only free the PASID when all the devices are unbound.
3097 	 */
3098 	if (ioasid_find(NULL, ioasid, NULL)) {
3099 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3100 		return;
3101 	}
3102 	vcmd_free_pasid(iommu, ioasid);
3103 }
3104 
3105 static void register_pasid_allocator(struct intel_iommu *iommu)
3106 {
3107 	/*
3108 	 * If we are running in the host, there is no need for a custom
3109 	 * allocator because PASIDs are allocated from the host system-wide.
3110 	 */
3111 	if (!cap_caching_mode(iommu->cap))
3112 		return;
3113 
3114 	if (!sm_supported(iommu)) {
3115 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3116 		return;
3117 	}
3118 
3119 	/*
3120 	 * Register a custom PASID allocator if we are running in a guest,
3121 	 * where guest PASIDs must be obtained via the virtual command interface.
3122 	 * There can be multiple vIOMMUs in each guest but only one allocator
3123 	 * is active. All vIOMMU allocators will eventually be calling the same
3124 	 * host allocator.
3125 	 */
3126 	if (!vccap_pasid(iommu->vccap))
3127 		return;
3128 
3129 	pr_info("Register custom PASID allocator\n");
3130 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3131 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3132 	iommu->pasid_allocator.pdata = (void *)iommu;
3133 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3134 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3135 		/*
3136 		 * Disable scalable mode on this IOMMU if there
3137 		 * is no custom allocator. Mixing SM-capable and
3138 		 * non-SM vIOMMUs is not supported.
3139 		 */
3140 		intel_iommu_sm = 0;
3141 	}
3142 }
3143 #endif
3144 
3145 static int __init init_dmars(void)
3146 {
3147 	struct dmar_drhd_unit *drhd;
3148 	struct intel_iommu *iommu;
3149 	int ret;
3150 
3151 	/*
3152 	 * for each drhd
3153 	 *    allocate root
3154 	 *    initialize and program root entry to not present
3155 	 * endfor
3156 	 */
3157 	for_each_drhd_unit(drhd) {
3158 		/*
3159 		 * lock not needed as this is only incremented in the
3160 		 * single-threaded kernel __init code path; all other
3161 		 * accesses are read-only
3162 		 */
3163 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3164 			g_num_of_iommus++;
3165 			continue;
3166 		}
3167 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3168 	}
3169 
3170 	/* Preallocate enough resources for IOMMU hot-addition */
3171 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3172 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3173 
3174 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3175 			GFP_KERNEL);
3176 	if (!g_iommus) {
3177 		pr_err("Allocating global iommu array failed\n");
3178 		ret = -ENOMEM;
3179 		goto error;
3180 	}
3181 
3182 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3183 	if (ret)
3184 		goto free_iommu;
3185 
3186 	for_each_iommu(iommu, drhd) {
3187 		if (drhd->ignored) {
3188 			iommu_disable_translation(iommu);
3189 			continue;
3190 		}
3191 
3192 		/*
3193 		 * Find the max pasid size of all IOMMUs in the system.
3194 		 * We need to ensure the system pasid table is no bigger
3195 		 * than the smallest supported size.
3196 		 */
3197 		if (pasid_supported(iommu)) {
3198 			u32 temp = 2 << ecap_pss(iommu->ecap);
3199 
3200 			intel_pasid_max_id = min_t(u32, temp,
3201 						   intel_pasid_max_id);
3202 		}
3203 
3204 		g_iommus[iommu->seq_id] = iommu;
3205 
3206 		intel_iommu_init_qi(iommu);
3207 
3208 		ret = iommu_init_domains(iommu);
3209 		if (ret)
3210 			goto free_iommu;
3211 
3212 		init_translation_status(iommu);
3213 
3214 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3215 			iommu_disable_translation(iommu);
3216 			clear_translation_pre_enabled(iommu);
3217 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3218 				iommu->name);
3219 		}
3220 
3221 		/*
3222 		 * TBD:
3223 		 * we could share the same root & context tables
3224 		 * among all IOMMUs. Need to split it later.
3225 		 */
3226 		ret = iommu_alloc_root_entry(iommu);
3227 		if (ret)
3228 			goto free_iommu;
3229 
3230 		if (translation_pre_enabled(iommu)) {
3231 			pr_info("Translation already enabled - trying to copy translation structures\n");
3232 
3233 			ret = copy_translation_tables(iommu);
3234 			if (ret) {
3235 				/*
3236 				 * We found the IOMMU with translation
3237 				 * enabled - but failed to copy over the
3238 				 * old root-entry table. Try to proceed
3239 				 * by disabling translation now and
3240 				 * allocating a clean root-entry table.
3241 				 * This might cause DMAR faults, but
3242 				 * probably the dump will still succeed.
3243 				 */
3244 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3245 				       iommu->name);
3246 				iommu_disable_translation(iommu);
3247 				clear_translation_pre_enabled(iommu);
3248 			} else {
3249 				pr_info("Copied translation tables from previous kernel for %s\n",
3250 					iommu->name);
3251 			}
3252 		}
3253 
3254 		if (!ecap_pass_through(iommu->ecap))
3255 			hw_pass_through = 0;
3256 		intel_svm_check(iommu);
3257 	}
3258 
3259 	/*
3260 	 * Now that qi is enabled on all iommus, set the root entry and flush
3261 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3262 	 * flush_context function will loop forever and the boot hangs.
3263 	 */
3264 	for_each_active_iommu(iommu, drhd) {
3265 		iommu_flush_write_buffer(iommu);
3266 #ifdef CONFIG_INTEL_IOMMU_SVM
3267 		register_pasid_allocator(iommu);
3268 #endif
3269 		iommu_set_root_entry(iommu);
3270 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3271 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3272 	}
3273 
3274 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3275 	dmar_map_gfx = 0;
3276 #endif
3277 
3278 	if (!dmar_map_gfx)
3279 		iommu_identity_mapping |= IDENTMAP_GFX;
3280 
3281 	check_tylersburg_isoch();
3282 
3283 	ret = si_domain_init(hw_pass_through);
3284 	if (ret)
3285 		goto free_iommu;
3286 
3287 	/*
3288 	 * for each drhd
3289 	 *   enable fault log
3290 	 *   global invalidate context cache
3291 	 *   global invalidate iotlb
3292 	 *   enable translation
3293 	 */
3294 	for_each_iommu(iommu, drhd) {
3295 		if (drhd->ignored) {
3296 			/*
3297 			 * we always have to disable PMRs or DMA may fail on
3298 			 * this device
3299 			 */
3300 			if (force_on)
3301 				iommu_disable_protect_mem_regions(iommu);
3302 			continue;
3303 		}
3304 
3305 		iommu_flush_write_buffer(iommu);
3306 
3307 #ifdef CONFIG_INTEL_IOMMU_SVM
3308 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3309 			/*
3310 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3311 			 * could cause a lock race, so drop the lock around it.
3312 			 */
3313 			up_write(&dmar_global_lock);
3314 			ret = intel_svm_enable_prq(iommu);
3315 			down_write(&dmar_global_lock);
3316 			if (ret)
3317 				goto free_iommu;
3318 		}
3319 #endif
3320 		ret = dmar_set_interrupt(iommu);
3321 		if (ret)
3322 			goto free_iommu;
3323 	}
3324 
3325 	return 0;
3326 
3327 free_iommu:
3328 	for_each_active_iommu(iommu, drhd) {
3329 		disable_dmar_iommu(iommu);
3330 		free_dmar_iommu(iommu);
3331 	}
3332 
3333 	kfree(g_iommus);
3334 
3335 error:
3336 	return ret;
3337 }
3338 
3339 static inline int iommu_domain_cache_init(void)
3340 {
3341 	int ret = 0;
3342 
3343 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3344 					 sizeof(struct dmar_domain),
3345 					 0,
3346 					 SLAB_HWCACHE_ALIGN,
3348 					 NULL);
3349 	if (!iommu_domain_cache) {
3350 		pr_err("Couldn't create iommu_domain cache\n");
3351 		ret = -ENOMEM;
3352 	}
3353 
3354 	return ret;
3355 }
3356 
3357 static inline int iommu_devinfo_cache_init(void)
3358 {
3359 	int ret = 0;
3360 
3361 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3362 					 sizeof(struct device_domain_info),
3363 					 0,
3364 					 SLAB_HWCACHE_ALIGN,
3365 					 NULL);
3366 	if (!iommu_devinfo_cache) {
3367 		pr_err("Couldn't create devinfo cache\n");
3368 		ret = -ENOMEM;
3369 	}
3370 
3371 	return ret;
3372 }
3373 
3374 static int __init iommu_init_mempool(void)
3375 {
3376 	int ret;
3377 	ret = iova_cache_get();
3378 	if (ret)
3379 		return ret;
3380 
3381 	ret = iommu_domain_cache_init();
3382 	if (ret)
3383 		goto domain_error;
3384 
3385 	ret = iommu_devinfo_cache_init();
3386 	if (!ret)
3387 		return ret;
3388 
3389 	kmem_cache_destroy(iommu_domain_cache);
3390 domain_error:
3391 	iova_cache_put();
3392 
3393 	return -ENOMEM;
3394 }
3395 
3396 static void __init iommu_exit_mempool(void)
3397 {
3398 	kmem_cache_destroy(iommu_devinfo_cache);
3399 	kmem_cache_destroy(iommu_domain_cache);
3400 	iova_cache_put();
3401 }
3402 
3403 static void __init init_no_remapping_devices(void)
3404 {
3405 	struct dmar_drhd_unit *drhd;
3406 	struct device *dev;
3407 	int i;
3408 
3409 	for_each_drhd_unit(drhd) {
3410 		if (!drhd->include_all) {
3411 			for_each_active_dev_scope(drhd->devices,
3412 						  drhd->devices_cnt, i, dev)
3413 				break;
3414 			/* ignore DMAR unit if no devices exist */
3415 			if (i == drhd->devices_cnt)
3416 				drhd->ignored = 1;
3417 		}
3418 	}
3419 
3420 	for_each_active_drhd_unit(drhd) {
3421 		if (drhd->include_all)
3422 			continue;
3423 
3424 		for_each_active_dev_scope(drhd->devices,
3425 					  drhd->devices_cnt, i, dev)
3426 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3427 				break;
3428 		if (i < drhd->devices_cnt)
3429 			continue;
3430 
3431 		/* This IOMMU has *only* gfx devices. Either bypass it or
3432 		   set the gfx_dedicated flag, as appropriate */
3433 		drhd->gfx_dedicated = 1;
3434 		if (!dmar_map_gfx)
3435 			drhd->ignored = 1;
3436 	}
3437 }
3438 
3439 #ifdef CONFIG_SUSPEND
3440 static int init_iommu_hw(void)
3441 {
3442 	struct dmar_drhd_unit *drhd;
3443 	struct intel_iommu *iommu = NULL;
3444 
3445 	for_each_active_iommu(iommu, drhd)
3446 		if (iommu->qi)
3447 			dmar_reenable_qi(iommu);
3448 
3449 	for_each_iommu(iommu, drhd) {
3450 		if (drhd->ignored) {
3451 			/*
3452 			 * we always have to disable PMRs or DMA may fail on
3453 			 * this device
3454 			 */
3455 			if (force_on)
3456 				iommu_disable_protect_mem_regions(iommu);
3457 			continue;
3458 		}
3459 
3460 		iommu_flush_write_buffer(iommu);
3461 
3462 		iommu_set_root_entry(iommu);
3463 
3464 		iommu->flush.flush_context(iommu, 0, 0, 0,
3465 					   DMA_CCMD_GLOBAL_INVL);
3466 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3467 		iommu_enable_translation(iommu);
3468 		iommu_disable_protect_mem_regions(iommu);
3469 	}
3470 
3471 	return 0;
3472 }
3473 
3474 static void iommu_flush_all(void)
3475 {
3476 	struct dmar_drhd_unit *drhd;
3477 	struct intel_iommu *iommu;
3478 
3479 	for_each_active_iommu(iommu, drhd) {
3480 		iommu->flush.flush_context(iommu, 0, 0, 0,
3481 					   DMA_CCMD_GLOBAL_INVL);
3482 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3483 					 DMA_TLB_GLOBAL_FLUSH);
3484 	}
3485 }
3486 
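/*
 * Across suspend, save the fault-event registers of each active IOMMU
 * and disable translation; iommu_resume() restores the registers after
 * init_iommu_hw() has re-enabled translation.
 */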
3487 static int iommu_suspend(void)
3488 {
3489 	struct dmar_drhd_unit *drhd;
3490 	struct intel_iommu *iommu = NULL;
3491 	unsigned long flag;
3492 
3493 	for_each_active_iommu(iommu, drhd) {
3494 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3495 					     GFP_KERNEL);
3496 		if (!iommu->iommu_state)
3497 			goto nomem;
3498 	}
3499 
3500 	iommu_flush_all();
3501 
3502 	for_each_active_iommu(iommu, drhd) {
3503 		iommu_disable_translation(iommu);
3504 
3505 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3506 
3507 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3508 			readl(iommu->reg + DMAR_FECTL_REG);
3509 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3510 			readl(iommu->reg + DMAR_FEDATA_REG);
3511 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3512 			readl(iommu->reg + DMAR_FEADDR_REG);
3513 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3514 			readl(iommu->reg + DMAR_FEUADDR_REG);
3515 
3516 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3517 	}
3518 	return 0;
3519 
3520 nomem:
3521 	for_each_active_iommu(iommu, drhd)
3522 		kfree(iommu->iommu_state);
3523 
3524 	return -ENOMEM;
3525 }
3526 
3527 static void iommu_resume(void)
3528 {
3529 	struct dmar_drhd_unit *drhd;
3530 	struct intel_iommu *iommu = NULL;
3531 	unsigned long flag;
3532 
3533 	if (init_iommu_hw()) {
3534 		if (force_on)
3535 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3536 		else
3537 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3538 		return;
3539 	}
3540 
3541 	for_each_active_iommu(iommu, drhd) {
3542 
3543 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3544 
3545 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3546 			iommu->reg + DMAR_FECTL_REG);
3547 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3548 			iommu->reg + DMAR_FEDATA_REG);
3549 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3550 			iommu->reg + DMAR_FEADDR_REG);
3551 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3552 			iommu->reg + DMAR_FEUADDR_REG);
3553 
3554 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3555 	}
3556 
3557 	for_each_active_iommu(iommu, drhd)
3558 		kfree(iommu->iommu_state);
3559 }
3560 
3561 static struct syscore_ops iommu_syscore_ops = {
3562 	.resume		= iommu_resume,
3563 	.suspend	= iommu_suspend,
3564 };
3565 
3566 static void __init init_iommu_pm_ops(void)
3567 {
3568 	register_syscore_ops(&iommu_syscore_ops);
3569 }
3570 
3571 #else
3572 static inline void init_iommu_pm_ops(void) {}
3573 #endif	/* CONFIG_SUSPEND */
3574 
3575 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3576 {
3577 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3578 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3579 	    rmrr->end_address <= rmrr->base_address ||
3580 	    arch_rmrr_sanity_check(rmrr))
3581 		return -EINVAL;
3582 
3583 	return 0;
3584 }
3585 
3586 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3587 {
3588 	struct acpi_dmar_reserved_memory *rmrr;
3589 	struct dmar_rmrr_unit *rmrru;
3590 
3591 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3592 	if (rmrr_sanity_check(rmrr)) {
3593 		pr_warn(FW_BUG
3594 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3595 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3596 			   rmrr->base_address, rmrr->end_address,
3597 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3598 			   dmi_get_system_info(DMI_BIOS_VERSION),
3599 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3600 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3601 	}
3602 
3603 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3604 	if (!rmrru)
3605 		goto out;
3606 
3607 	rmrru->hdr = header;
3608 
3609 	rmrru->base_address = rmrr->base_address;
3610 	rmrru->end_address = rmrr->end_address;
3611 
3612 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3613 				((void *)rmrr) + rmrr->header.length,
3614 				&rmrru->devices_cnt);
3615 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3616 		goto free_rmrru;
3617 
3618 	list_add(&rmrru->list, &dmar_rmrr_units);
3619 
3620 	return 0;
3621 free_rmrru:
3622 	kfree(rmrru);
3623 out:
3624 	return -ENOMEM;
3625 }
3626 
3627 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3628 {
3629 	struct dmar_atsr_unit *atsru;
3630 	struct acpi_dmar_atsr *tmp;
3631 
3632 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3633 				dmar_rcu_check()) {
3634 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3635 		if (atsr->segment != tmp->segment)
3636 			continue;
3637 		if (atsr->header.length != tmp->header.length)
3638 			continue;
3639 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3640 			return atsru;
3641 	}
3642 
3643 	return NULL;
3644 }
3645 
3646 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3647 {
3648 	struct acpi_dmar_atsr *atsr;
3649 	struct dmar_atsr_unit *atsru;
3650 
3651 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3652 		return 0;
3653 
3654 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3655 	atsru = dmar_find_atsr(atsr);
3656 	if (atsru)
3657 		return 0;
3658 
3659 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3660 	if (!atsru)
3661 		return -ENOMEM;
3662 
3663 	/*
3664 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3665 	 * copy the memory content because the memory buffer will be freed
3666 	 * on return.
3667 	 */
3668 	atsru->hdr = (void *)(atsru + 1);
3669 	memcpy(atsru->hdr, hdr, hdr->length);
3670 	atsru->include_all = atsr->flags & 0x1;
3671 	if (!atsru->include_all) {
3672 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3673 				(void *)atsr + atsr->header.length,
3674 				&atsru->devices_cnt);
3675 		if (atsru->devices_cnt && atsru->devices == NULL) {
3676 			kfree(atsru);
3677 			return -ENOMEM;
3678 		}
3679 	}
3680 
3681 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3682 
3683 	return 0;
3684 }
3685 
3686 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3687 {
3688 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3689 	kfree(atsru);
3690 }
3691 
3692 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3693 {
3694 	struct acpi_dmar_atsr *atsr;
3695 	struct dmar_atsr_unit *atsru;
3696 
3697 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3698 	atsru = dmar_find_atsr(atsr);
3699 	if (atsru) {
3700 		list_del_rcu(&atsru->list);
3701 		synchronize_rcu();
3702 		intel_iommu_free_atsr(atsru);
3703 	}
3704 
3705 	return 0;
3706 }
3707 
3708 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3709 {
3710 	int i;
3711 	struct device *dev;
3712 	struct acpi_dmar_atsr *atsr;
3713 	struct dmar_atsr_unit *atsru;
3714 
3715 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3716 	atsru = dmar_find_atsr(atsr);
3717 	if (!atsru)
3718 		return 0;
3719 
3720 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3721 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3722 					  i, dev)
3723 			return -EBUSY;
3724 	}
3725 
3726 	return 0;
3727 }
3728 
3729 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3730 {
3731 	struct dmar_satc_unit *satcu;
3732 	struct acpi_dmar_satc *tmp;
3733 
3734 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3735 				dmar_rcu_check()) {
3736 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3737 		if (satc->segment != tmp->segment)
3738 			continue;
3739 		if (satc->header.length != tmp->header.length)
3740 			continue;
3741 		if (memcmp(satc, tmp, satc->header.length) == 0)
3742 			return satcu;
3743 	}
3744 
3745 	return NULL;
3746 }
3747 
3748 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3749 {
3750 	struct acpi_dmar_satc *satc;
3751 	struct dmar_satc_unit *satcu;
3752 
3753 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3754 		return 0;
3755 
3756 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3757 	satcu = dmar_find_satc(satc);
3758 	if (satcu)
3759 		return 0;
3760 
3761 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3762 	if (!satcu)
3763 		return -ENOMEM;
3764 
3765 	satcu->hdr = (void *)(satcu + 1);
3766 	memcpy(satcu->hdr, hdr, hdr->length);
3767 	satcu->atc_required = satc->flags & 0x1;
3768 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3769 					      (void *)satc + satc->header.length,
3770 					      &satcu->devices_cnt);
3771 	if (satcu->devices_cnt && !satcu->devices) {
3772 		kfree(satcu);
3773 		return -ENOMEM;
3774 	}
3775 	list_add_rcu(&satcu->list, &dmar_satc_units);
3776 
3777 	return 0;
3778 }
3779 
3780 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3781 {
3782 	int sp, ret;
3783 	struct intel_iommu *iommu = dmaru->iommu;
3784 
3785 	if (g_iommus[iommu->seq_id])
3786 		return 0;
3787 
3788 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3789 	if (ret)
3790 		goto out;
3791 
3792 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3793 		pr_warn("%s: Doesn't support hardware pass through.\n",
3794 			iommu->name);
3795 		return -ENXIO;
3796 	}
3797 	if (!ecap_sc_support(iommu->ecap) &&
3798 	    domain_update_iommu_snooping(iommu)) {
3799 		pr_warn("%s: Doesn't support snooping.\n",
3800 			iommu->name);
3801 		return -ENXIO;
3802 	}
3803 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3804 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3805 		pr_warn("%s: Doesn't support large page.\n",
3806 			iommu->name);
3807 		return -ENXIO;
3808 	}
3809 
3810 	/*
3811 	 * Disable translation if already enabled prior to OS handover.
3812 	 */
3813 	if (iommu->gcmd & DMA_GCMD_TE)
3814 		iommu_disable_translation(iommu);
3815 
3816 	g_iommus[iommu->seq_id] = iommu;
3817 	ret = iommu_init_domains(iommu);
3818 	if (ret == 0)
3819 		ret = iommu_alloc_root_entry(iommu);
3820 	if (ret)
3821 		goto out;
3822 
3823 	intel_svm_check(iommu);
3824 
3825 	if (dmaru->ignored) {
3826 		/*
		 * We always have to disable PMRs or DMA may fail on this device
3828 		 */
3829 		if (force_on)
3830 			iommu_disable_protect_mem_regions(iommu);
3831 		return 0;
3832 	}
3833 
3834 	intel_iommu_init_qi(iommu);
3835 	iommu_flush_write_buffer(iommu);
3836 
3837 #ifdef CONFIG_INTEL_IOMMU_SVM
3838 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3839 		ret = intel_svm_enable_prq(iommu);
3840 		if (ret)
3841 			goto disable_iommu;
3842 	}
3843 #endif
3844 	ret = dmar_set_interrupt(iommu);
3845 	if (ret)
3846 		goto disable_iommu;
3847 
3848 	iommu_set_root_entry(iommu);
3849 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3850 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3851 	iommu_enable_translation(iommu);
3852 
3853 	iommu_disable_protect_mem_regions(iommu);
3854 	return 0;
3855 
3856 disable_iommu:
3857 	disable_dmar_iommu(iommu);
3858 out:
3859 	free_dmar_iommu(iommu);
3860 	return ret;
3861 }
3862 
3863 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3864 {
3865 	int ret = 0;
3866 	struct intel_iommu *iommu = dmaru->iommu;
3867 
3868 	if (!intel_iommu_enabled)
3869 		return 0;
3870 	if (iommu == NULL)
3871 		return -EINVAL;
3872 
3873 	if (insert) {
3874 		ret = intel_iommu_add(dmaru);
3875 	} else {
3876 		disable_dmar_iommu(iommu);
3877 		free_dmar_iommu(iommu);
3878 	}
3879 
3880 	return ret;
3881 }
3882 
3883 static void intel_iommu_free_dmars(void)
3884 {
3885 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3886 	struct dmar_atsr_unit *atsru, *atsr_n;
3887 	struct dmar_satc_unit *satcu, *satc_n;
3888 
3889 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3890 		list_del(&rmrru->list);
3891 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3892 		kfree(rmrru);
3893 	}
3894 
3895 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3896 		list_del(&atsru->list);
3897 		intel_iommu_free_atsr(atsru);
3898 	}
3899 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3900 		list_del(&satcu->list);
3901 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3902 		kfree(satcu);
3903 	}
3904 }
3905 
3906 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3907 {
3908 	int i, ret = 1;
3909 	struct pci_bus *bus;
3910 	struct pci_dev *bridge = NULL;
3911 	struct device *tmp;
3912 	struct acpi_dmar_atsr *atsr;
3913 	struct dmar_atsr_unit *atsru;
3914 
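	/*
	 * Walk up from the device to its root port: integrated endpoints with
	 * no upstream bridge may always use ATS, anything reached over a
	 * non-PCIe link may not, and devices below a root port may use ATS
	 * only if that root port is covered by an ATSR (explicitly or via
	 * include_all) for the same PCI segment.
	 */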
3915 	dev = pci_physfn(dev);
3916 	for (bus = dev->bus; bus; bus = bus->parent) {
3917 		bridge = bus->self;
3918 		/* If it's an integrated device, allow ATS */
3919 		if (!bridge)
3920 			return 1;
3921 		/* Connected via non-PCIe: no ATS */
3922 		if (!pci_is_pcie(bridge) ||
3923 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3924 			return 0;
3925 		/* If we found the root port, look it up in the ATSR */
3926 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3927 			break;
3928 	}
3929 
3930 	rcu_read_lock();
3931 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3932 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3933 		if (atsr->segment != pci_domain_nr(dev->bus))
3934 			continue;
3935 
3936 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3937 			if (tmp == &bridge->dev)
3938 				goto out;
3939 
3940 		if (atsru->include_all)
3941 			goto out;
3942 	}
3943 	ret = 0;
3944 out:
3945 	rcu_read_unlock();
3946 
3947 	return ret;
3948 }
3949 
3950 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3951 {
3952 	int ret;
3953 	struct dmar_rmrr_unit *rmrru;
3954 	struct dmar_atsr_unit *atsru;
3955 	struct dmar_satc_unit *satcu;
3956 	struct acpi_dmar_atsr *atsr;
3957 	struct acpi_dmar_reserved_memory *rmrr;
3958 	struct acpi_dmar_satc *satc;
3959 
3960 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3961 		return 0;
3962 
3963 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3964 		rmrr = container_of(rmrru->hdr,
3965 				    struct acpi_dmar_reserved_memory, header);
3966 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3967 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3968 				((void *)rmrr) + rmrr->header.length,
3969 				rmrr->segment, rmrru->devices,
3970 				rmrru->devices_cnt);
3971 			if (ret < 0)
3972 				return ret;
3973 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3974 			dmar_remove_dev_scope(info, rmrr->segment,
3975 				rmrru->devices, rmrru->devices_cnt);
3976 		}
3977 	}
3978 
3979 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3980 		if (atsru->include_all)
3981 			continue;
3982 
3983 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3984 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3985 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3986 					(void *)atsr + atsr->header.length,
3987 					atsr->segment, atsru->devices,
3988 					atsru->devices_cnt);
3989 			if (ret > 0)
3990 				break;
3991 			else if (ret < 0)
3992 				return ret;
3993 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3994 			if (dmar_remove_dev_scope(info, atsr->segment,
3995 					atsru->devices, atsru->devices_cnt))
3996 				break;
3997 		}
3998 	}
3999 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4000 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4001 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4002 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4003 					(void *)satc + satc->header.length,
4004 					satc->segment, satcu->devices,
4005 					satcu->devices_cnt);
4006 			if (ret > 0)
4007 				break;
4008 			else if (ret < 0)
4009 				return ret;
4010 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4011 			if (dmar_remove_dev_scope(info, satc->segment,
4012 					satcu->devices, satcu->devices_cnt))
4013 				break;
4014 		}
4015 	}
4016 
4017 	return 0;
4018 }
4019 
4020 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4021 				       unsigned long val, void *v)
4022 {
4023 	struct memory_notify *mhp = v;
4024 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4025 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4026 			mhp->nr_pages - 1);
4027 
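	/*
	 * Keep the static identity map (si_domain) in sync with memory
	 * hotplug: memory going online is added to the identity map, and
	 * memory going offline is unmapped again with the IOTLBs flushed.
	 * mm_to_dma_pfn() converts the notifier's page frame numbers to the
	 * VT-d page granularity. This notifier is only registered when
	 * si_domain is in use without hardware pass-through.
	 */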
4028 	switch (val) {
4029 	case MEM_GOING_ONLINE:
4030 		if (iommu_domain_identity_map(si_domain,
4031 					      start_vpfn, last_vpfn)) {
4032 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4033 				start_vpfn, last_vpfn);
4034 			return NOTIFY_BAD;
4035 		}
4036 		break;
4037 
4038 	case MEM_OFFLINE:
4039 	case MEM_CANCEL_ONLINE:
4040 		{
4041 			struct dmar_drhd_unit *drhd;
4042 			struct intel_iommu *iommu;
4043 			struct page *freelist;
4044 
4045 			freelist = domain_unmap(si_domain,
4046 						start_vpfn, last_vpfn,
4047 						NULL);
4048 
4049 			rcu_read_lock();
4050 			for_each_active_iommu(iommu, drhd)
4051 				iommu_flush_iotlb_psi(iommu, si_domain,
4052 					start_vpfn, mhp->nr_pages,
4053 					!freelist, 0);
4054 			rcu_read_unlock();
4055 			dma_free_pagelist(freelist);
4056 		}
4057 		break;
4058 	}
4059 
4060 	return NOTIFY_OK;
4061 }
4062 
4063 static struct notifier_block intel_iommu_memory_nb = {
4064 	.notifier_call = intel_iommu_memory_notifier,
4065 	.priority = 0
4066 };
4067 
4068 static void free_all_cpu_cached_iovas(unsigned int cpu)
4069 {
4070 	int i;
4071 
4072 	for (i = 0; i < g_num_of_iommus; i++) {
4073 		struct intel_iommu *iommu = g_iommus[i];
4074 		struct dmar_domain *domain;
4075 		int did;
4076 
4077 		if (!iommu)
4078 			continue;
4079 
4080 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4081 			domain = get_iommu_domain(iommu, (u16)did);
4082 
4083 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4084 				continue;
4085 
4086 			iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4087 		}
4088 	}
4089 }
4090 
4091 static int intel_iommu_cpu_dead(unsigned int cpu)
4092 {
4093 	free_all_cpu_cached_iovas(cpu);
4094 	return 0;
4095 }
4096 
4097 static void intel_disable_iommus(void)
4098 {
4099 	struct intel_iommu *iommu = NULL;
4100 	struct dmar_drhd_unit *drhd;
4101 
4102 	for_each_iommu(iommu, drhd)
4103 		iommu_disable_translation(iommu);
4104 }
4105 
4106 void intel_iommu_shutdown(void)
4107 {
4108 	struct dmar_drhd_unit *drhd;
4109 	struct intel_iommu *iommu = NULL;
4110 
4111 	if (no_iommu || dmar_disabled)
4112 		return;
4113 
4114 	down_write(&dmar_global_lock);
4115 
4116 	/* Disable PMRs explicitly here. */
4117 	for_each_iommu(iommu, drhd)
4118 		iommu_disable_protect_mem_regions(iommu);
4119 
4120 	/* Make sure the IOMMUs are switched off */
4121 	intel_disable_iommus();
4122 
4123 	up_write(&dmar_global_lock);
4124 }
4125 
4126 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4127 {
4128 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4129 
4130 	return container_of(iommu_dev, struct intel_iommu, iommu);
4131 }
4132 
4133 static ssize_t intel_iommu_show_version(struct device *dev,
4134 					struct device_attribute *attr,
4135 					char *buf)
4136 {
4137 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4138 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4139 	return sprintf(buf, "%d:%d\n",
4140 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4141 }
4142 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4143 
4144 static ssize_t intel_iommu_show_address(struct device *dev,
4145 					struct device_attribute *attr,
4146 					char *buf)
4147 {
4148 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4149 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4150 }
4151 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4152 
4153 static ssize_t intel_iommu_show_cap(struct device *dev,
4154 				    struct device_attribute *attr,
4155 				    char *buf)
4156 {
4157 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4158 	return sprintf(buf, "%llx\n", iommu->cap);
4159 }
4160 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4161 
4162 static ssize_t intel_iommu_show_ecap(struct device *dev,
4163 				    struct device_attribute *attr,
4164 				    char *buf)
4165 {
4166 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4167 	return sprintf(buf, "%llx\n", iommu->ecap);
4168 }
4169 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4170 
4171 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4172 				      struct device_attribute *attr,
4173 				      char *buf)
4174 {
4175 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4176 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4177 }
4178 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4179 
4180 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4181 					   struct device_attribute *attr,
4182 					   char *buf)
4183 {
4184 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4185 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4186 						  cap_ndoms(iommu->cap)));
4187 }
4188 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4189 
4190 static struct attribute *intel_iommu_attrs[] = {
4191 	&dev_attr_version.attr,
4192 	&dev_attr_address.attr,
4193 	&dev_attr_cap.attr,
4194 	&dev_attr_ecap.attr,
4195 	&dev_attr_domains_supported.attr,
4196 	&dev_attr_domains_used.attr,
4197 	NULL,
4198 };
4199 
4200 static struct attribute_group intel_iommu_group = {
4201 	.name = "intel-iommu",
4202 	.attrs = intel_iommu_attrs,
4203 };
4204 
4205 const struct attribute_group *intel_iommu_groups[] = {
4206 	&intel_iommu_group,
4207 	NULL,
4208 };
4209 
4210 static inline bool has_external_pci(void)
4211 {
4212 	struct pci_dev *pdev = NULL;
4213 
4214 	for_each_pci_dev(pdev)
4215 		if (pdev->external_facing)
4216 			return true;
4217 
4218 	return false;
4219 }
4220 
4221 static int __init platform_optin_force_iommu(void)
4222 {
4223 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4224 		return 0;
4225 
4226 	if (no_iommu || dmar_disabled)
4227 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4228 
4229 	/*
4230 	 * If Intel-IOMMU is disabled by default, we will apply identity
4231 	 * map for all devices except those marked as being untrusted.
4232 	 */
4233 	if (dmar_disabled)
4234 		iommu_set_default_passthrough(false);
4235 
4236 	dmar_disabled = 0;
4237 	no_iommu = 0;
4238 
4239 	return 1;
4240 }
4241 
4242 static int __init probe_acpi_namespace_devices(void)
4243 {
4244 	struct dmar_drhd_unit *drhd;
4245 	/* To avoid a -Wunused-but-set-variable warning. */
4246 	struct intel_iommu *iommu __maybe_unused;
4247 	struct device *dev;
4248 	int i, ret = 0;
4249 
4250 	for_each_active_iommu(iommu, drhd) {
4251 		for_each_active_dev_scope(drhd->devices,
4252 					  drhd->devices_cnt, i, dev) {
4253 			struct acpi_device_physical_node *pn;
4254 			struct iommu_group *group;
4255 			struct acpi_device *adev;
4256 
4257 			if (dev->bus != &acpi_bus_type)
4258 				continue;
4259 
4260 			adev = to_acpi_device(dev);
4261 			mutex_lock(&adev->physical_node_lock);
4262 			list_for_each_entry(pn,
4263 					    &adev->physical_node_list, node) {
4264 				group = iommu_group_get(pn->dev);
4265 				if (group) {
4266 					iommu_group_put(group);
4267 					continue;
4268 				}
4269 
4270 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4271 				ret = iommu_probe_device(pn->dev);
4272 				if (ret)
4273 					break;
4274 			}
4275 			mutex_unlock(&adev->physical_node_lock);
4276 
4277 			if (ret)
4278 				return ret;
4279 		}
4280 	}
4281 
4282 	return 0;
4283 }
4284 
4285 int __init intel_iommu_init(void)
4286 {
4287 	int ret = -ENODEV;
4288 	struct dmar_drhd_unit *drhd;
4289 	struct intel_iommu *iommu;
4290 
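	/*
	 * Rough flow: parse the ACPI DMAR table and device scopes; if the
	 * IOMMU is disabled, make sure translation and PMRs are off and bail
	 * out; otherwise initialize the DMAR units via init_dmars(), register
	 * each IOMMU with sysfs and the IOMMU core, hook up the memory-hotplug
	 * and CPU-dead notifiers, probe ACPI namespace devices, and finally
	 * enable translation with PMRs disabled.
	 */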
4291 	/*
4292 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4293 	 * opt in, so enforce that.
4294 	 */
4295 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4296 		    platform_optin_force_iommu();
4297 
4298 	if (iommu_init_mempool()) {
4299 		if (force_on)
4300 			panic("tboot: Failed to initialize iommu memory\n");
4301 		return -ENOMEM;
4302 	}
4303 
4304 	down_write(&dmar_global_lock);
4305 	if (dmar_table_init()) {
4306 		if (force_on)
4307 			panic("tboot: Failed to initialize DMAR table\n");
4308 		goto out_free_dmar;
4309 	}
4310 
4311 	if (dmar_dev_scope_init() < 0) {
4312 		if (force_on)
4313 			panic("tboot: Failed to initialize DMAR device scope\n");
4314 		goto out_free_dmar;
4315 	}
4316 
4317 	up_write(&dmar_global_lock);
4318 
4319 	/*
4320 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4321 	 * complain later when we register it under the lock.
4322 	 */
4323 	dmar_register_bus_notifier();
4324 
4325 	down_write(&dmar_global_lock);
4326 
4327 	if (!no_iommu)
4328 		intel_iommu_debugfs_init();
4329 
4330 	if (no_iommu || dmar_disabled) {
4331 		/*
4332 		 * We exit the function here to ensure IOMMU's remapping and
4333 		 * mempool aren't setup, which means that the IOMMU's PMRs
4334 		 * won't be disabled via the call to init_dmars(). So disable
4335 		 * it explicitly here. The PMRs were setup by tboot prior to
4336 		 * calling SENTER, but the kernel is expected to reset/tear
4337 		 * down the PMRs.
4338 		 */
4339 		if (intel_iommu_tboot_noforce) {
4340 			for_each_iommu(iommu, drhd)
4341 				iommu_disable_protect_mem_regions(iommu);
4342 		}
4343 
4344 		/*
4345 		 * Make sure the IOMMUs are switched off, even when we
4346 		 * boot into a kexec kernel and the previous kernel left
4347 		 * them enabled
4348 		 */
4349 		intel_disable_iommus();
4350 		goto out_free_dmar;
4351 	}
4352 
4353 	if (list_empty(&dmar_rmrr_units))
4354 		pr_info("No RMRR found\n");
4355 
4356 	if (list_empty(&dmar_atsr_units))
4357 		pr_info("No ATSR found\n");
4358 
4359 	if (list_empty(&dmar_satc_units))
4360 		pr_info("No SATC found\n");
4361 
4362 	if (dmar_map_gfx)
4363 		intel_iommu_gfx_mapped = 1;
4364 
4365 	init_no_remapping_devices();
4366 
4367 	ret = init_dmars();
4368 	if (ret) {
4369 		if (force_on)
4370 			panic("tboot: Failed to initialize DMARs\n");
4371 		pr_err("Initialization failed\n");
4372 		goto out_free_dmar;
4373 	}
4374 	up_write(&dmar_global_lock);
4375 
4376 	init_iommu_pm_ops();
4377 
4378 	down_read(&dmar_global_lock);
4379 	for_each_active_iommu(iommu, drhd) {
4380 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4381 				       intel_iommu_groups,
4382 				       "%s", iommu->name);
4383 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4384 		iommu_device_register(&iommu->iommu);
4385 	}
4386 	up_read(&dmar_global_lock);
4387 
4388 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4389 	if (si_domain && !hw_pass_through)
4390 		register_memory_notifier(&intel_iommu_memory_nb);
4391 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4392 			  intel_iommu_cpu_dead);
4393 
4394 	down_read(&dmar_global_lock);
4395 	if (probe_acpi_namespace_devices())
4396 		pr_warn("ACPI name space devices didn't probe correctly\n");
4397 
4398 	/* Finally, we enable the DMA remapping hardware. */
4399 	for_each_iommu(iommu, drhd) {
4400 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4401 			iommu_enable_translation(iommu);
4402 
4403 		iommu_disable_protect_mem_regions(iommu);
4404 	}
4405 	up_read(&dmar_global_lock);
4406 
4407 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4408 
4409 	intel_iommu_enabled = 1;
4410 
4411 	return 0;
4412 
4413 out_free_dmar:
4414 	intel_iommu_free_dmars();
4415 	up_write(&dmar_global_lock);
4416 	iommu_exit_mempool();
4417 	return ret;
4418 }
4419 
4420 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4421 {
4422 	struct intel_iommu *iommu = opaque;
4423 
4424 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4425 	return 0;
4426 }
4427 
4428 /*
4429  * NB - intel-iommu lacks any sort of reference counting for the users of
4430  * dependent devices.  If multiple endpoints have intersecting dependent
4431  * devices, unbinding the driver from any one of them will possibly leave
4432  * the others unable to operate.
4433  */
4434 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4435 {
4436 	if (!iommu || !dev || !dev_is_pci(dev))
4437 		return;
4438 
4439 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4440 }
4441 
4442 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4443 {
4444 	struct dmar_domain *domain;
4445 	struct intel_iommu *iommu;
4446 	unsigned long flags;
4447 
4448 	assert_spin_locked(&device_domain_lock);
4449 
4450 	if (WARN_ON(!info))
4451 		return;
4452 
4453 	iommu = info->iommu;
4454 	domain = info->domain;
4455 
4456 	if (info->dev) {
4457 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4458 			intel_pasid_tear_down_entry(iommu, info->dev,
4459 					PASID_RID2PASID, false);
4460 
4461 		iommu_disable_dev_iotlb(info);
4462 		if (!dev_is_real_dma_subdevice(info->dev))
4463 			domain_context_clear(iommu, info->dev);
4464 		intel_pasid_free_table(info->dev);
4465 	}
4466 
4467 	unlink_domain_info(info);
4468 
4469 	spin_lock_irqsave(&iommu->lock, flags);
4470 	domain_detach_iommu(domain, iommu);
4471 	spin_unlock_irqrestore(&iommu->lock, flags);
4472 
4473 	free_devinfo_mem(info);
4474 }
4475 
4476 static void dmar_remove_one_dev_info(struct device *dev)
4477 {
4478 	struct device_domain_info *info;
4479 	unsigned long flags;
4480 
4481 	spin_lock_irqsave(&device_domain_lock, flags);
4482 	info = get_domain_info(dev);
4483 	if (info)
4484 		__dmar_remove_one_dev_info(info);
4485 	spin_unlock_irqrestore(&device_domain_lock, flags);
4486 }
4487 
4488 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4489 {
4490 	int adjust_width;
4491 
4492 	/* calculate AGAW */
4493 	domain->gaw = guest_width;
4494 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4495 	domain->agaw = width_to_agaw(adjust_width);
4496 
4497 	domain->iommu_coherency = 0;
4498 	domain->iommu_snooping = 0;
4499 	domain->iommu_superpage = 0;
4500 	domain->max_addr = 0;
4501 
4502 	/* always allocate the top pgd */
4503 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4504 	if (!domain->pgd)
4505 		return -ENOMEM;
4506 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4507 	return 0;
4508 }
4509 
4510 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4511 {
4512 	struct dmar_domain *dmar_domain;
4513 	struct iommu_domain *domain;
4514 
4515 	switch (type) {
4516 	case IOMMU_DOMAIN_DMA:
4517 	case IOMMU_DOMAIN_UNMANAGED:
4518 		dmar_domain = alloc_domain(0);
4519 		if (!dmar_domain) {
4520 			pr_err("Can't allocate dmar_domain\n");
4521 			return NULL;
4522 		}
4523 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4524 			pr_err("Domain initialization failed\n");
4525 			domain_exit(dmar_domain);
4526 			return NULL;
4527 		}
4528 
4529 		if (type == IOMMU_DOMAIN_DMA &&
4530 		    iommu_get_dma_cookie(&dmar_domain->domain))
4531 			return NULL;
4532 
4533 		domain = &dmar_domain->domain;
4534 		domain->geometry.aperture_start = 0;
4535 		domain->geometry.aperture_end   =
4536 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4537 		domain->geometry.force_aperture = true;
4538 
4539 		return domain;
4540 	case IOMMU_DOMAIN_IDENTITY:
4541 		return &si_domain->domain;
4542 	default:
4543 		return NULL;
4544 	}
4545 
4546 	return NULL;
4547 }
4548 
4549 static void intel_iommu_domain_free(struct iommu_domain *domain)
4550 {
4551 	if (domain != &si_domain->domain)
4552 		domain_exit(to_dmar_domain(domain));
4553 }
4554 
4555 /*
4556  * Check whether a @domain could be attached to the @dev through the
4557  * aux-domain attach/detach APIs.
4558  */
4559 static inline bool
4560 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4561 {
4562 	struct device_domain_info *info = get_domain_info(dev);
4563 
4564 	return info && info->auxd_enabled &&
4565 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4566 }
4567 
4568 static inline struct subdev_domain_info *
4569 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4570 {
4571 	struct subdev_domain_info *sinfo;
4572 
4573 	if (!list_empty(&domain->subdevices)) {
4574 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4575 			if (sinfo->pdev == dev)
4576 				return sinfo;
4577 		}
4578 	}
4579 
4580 	return NULL;
4581 }
4582 
4583 static int auxiliary_link_device(struct dmar_domain *domain,
4584 				 struct device *dev)
4585 {
4586 	struct device_domain_info *info = get_domain_info(dev);
4587 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4588 
4589 	assert_spin_locked(&device_domain_lock);
4590 	if (WARN_ON(!info))
4591 		return -EINVAL;
4592 
	if (!sinfo) {
		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
		/* Don't dereference sinfo if the atomic allocation failed. */
		if (!sinfo)
			return -ENOMEM;
		sinfo->domain = domain;
		sinfo->pdev = dev;
		list_add(&sinfo->link_phys, &info->subdevices);
		list_add(&sinfo->link_domain, &domain->subdevices);
	}
4600 
4601 	return ++sinfo->users;
4602 }
4603 
4604 static int auxiliary_unlink_device(struct dmar_domain *domain,
4605 				   struct device *dev)
4606 {
4607 	struct device_domain_info *info = get_domain_info(dev);
4608 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4609 	int ret;
4610 
4611 	assert_spin_locked(&device_domain_lock);
4612 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4613 		return -EINVAL;
4614 
4615 	ret = --sinfo->users;
4616 	if (!ret) {
4617 		list_del(&sinfo->link_phys);
4618 		list_del(&sinfo->link_domain);
4619 		kfree(sinfo);
4620 	}
4621 
4622 	return ret;
4623 }
4624 
4625 static int aux_domain_add_dev(struct dmar_domain *domain,
4626 			      struct device *dev)
4627 {
4628 	int ret;
4629 	unsigned long flags;
4630 	struct intel_iommu *iommu;
4631 
4632 	iommu = device_to_iommu(dev, NULL, NULL);
4633 	if (!iommu)
4634 		return -ENODEV;
4635 
4636 	if (domain->default_pasid <= 0) {
4637 		u32 pasid;
4638 
4639 		/* No private data needed for the default pasid */
4640 		pasid = ioasid_alloc(NULL, PASID_MIN,
4641 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4642 				     NULL);
4643 		if (pasid == INVALID_IOASID) {
4644 			pr_err("Can't allocate default pasid\n");
4645 			return -ENODEV;
4646 		}
4647 		domain->default_pasid = pasid;
4648 	}
4649 
4650 	spin_lock_irqsave(&device_domain_lock, flags);
4651 	ret = auxiliary_link_device(domain, dev);
4652 	if (ret <= 0)
4653 		goto link_failed;
4654 
4655 	/*
4656 	 * Subdevices from the same physical device can be attached to the
4657 	 * same domain. For such cases, only the first subdevice attachment
4658 	 * needs to go through the full steps in this function. So if ret >
4659 	 * 1, just goto out.
4660 	 */
4661 	if (ret > 1)
4662 		goto out;
4663 
4664 	/*
4665 	 * iommu->lock must be held to attach domain to iommu and setup the
4666 	 * pasid entry for second level translation.
4667 	 */
4668 	spin_lock(&iommu->lock);
4669 	ret = domain_attach_iommu(domain, iommu);
4670 	if (ret)
4671 		goto attach_failed;
4672 
4673 	/* Setup the PASID entry for mediated devices: */
4674 	if (domain_use_first_level(domain))
4675 		ret = domain_setup_first_level(iommu, domain, dev,
4676 					       domain->default_pasid);
4677 	else
4678 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4679 						     domain->default_pasid);
4680 	if (ret)
4681 		goto table_failed;
4682 
4683 	spin_unlock(&iommu->lock);
4684 out:
4685 	spin_unlock_irqrestore(&device_domain_lock, flags);
4686 
4687 	return 0;
4688 
4689 table_failed:
4690 	domain_detach_iommu(domain, iommu);
4691 attach_failed:
4692 	spin_unlock(&iommu->lock);
4693 	auxiliary_unlink_device(domain, dev);
4694 link_failed:
4695 	spin_unlock_irqrestore(&device_domain_lock, flags);
4696 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4697 		ioasid_put(domain->default_pasid);
4698 
4699 	return ret;
4700 }
4701 
4702 static void aux_domain_remove_dev(struct dmar_domain *domain,
4703 				  struct device *dev)
4704 {
4705 	struct device_domain_info *info;
4706 	struct intel_iommu *iommu;
4707 	unsigned long flags;
4708 
4709 	if (!is_aux_domain(dev, &domain->domain))
4710 		return;
4711 
4712 	spin_lock_irqsave(&device_domain_lock, flags);
4713 	info = get_domain_info(dev);
4714 	iommu = info->iommu;
4715 
4716 	if (!auxiliary_unlink_device(domain, dev)) {
4717 		spin_lock(&iommu->lock);
4718 		intel_pasid_tear_down_entry(iommu, dev,
4719 					    domain->default_pasid, false);
4720 		domain_detach_iommu(domain, iommu);
4721 		spin_unlock(&iommu->lock);
4722 	}
4723 
4724 	spin_unlock_irqrestore(&device_domain_lock, flags);
4725 
4726 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4727 		ioasid_put(domain->default_pasid);
4728 }
4729 
4730 static int prepare_domain_attach_device(struct iommu_domain *domain,
4731 					struct device *dev)
4732 {
4733 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4734 	struct intel_iommu *iommu;
4735 	int addr_width;
4736 
4737 	iommu = device_to_iommu(dev, NULL, NULL);
4738 	if (!iommu)
4739 		return -ENODEV;
4740 
4741 	/* check if this iommu agaw is sufficient for max mapped address */
4742 	addr_width = agaw_to_width(iommu->agaw);
4743 	if (addr_width > cap_mgaw(iommu->cap))
4744 		addr_width = cap_mgaw(iommu->cap);
4745 
4746 	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
4750 		return -EFAULT;
4751 	}
4752 	dmar_domain->gaw = addr_width;
4753 
	/*
	 * Knock out extra levels of page tables if necessary: each pass drops
	 * the current top level and keeps only the subtree referenced by its
	 * first entry, until the domain's AGAW fits what this IOMMU supports.
	 */
4757 	while (iommu->agaw < dmar_domain->agaw) {
4758 		struct dma_pte *pte;
4759 
4760 		pte = dmar_domain->pgd;
4761 		if (dma_pte_present(pte)) {
4762 			dmar_domain->pgd = (struct dma_pte *)
4763 				phys_to_virt(dma_pte_addr(pte));
4764 			free_pgtable_page(pte);
4765 		}
4766 		dmar_domain->agaw--;
4767 	}
4768 
4769 	return 0;
4770 }
4771 
4772 static int intel_iommu_attach_device(struct iommu_domain *domain,
4773 				     struct device *dev)
4774 {
4775 	int ret;
4776 
4777 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4778 	    device_is_rmrr_locked(dev)) {
4779 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4780 		return -EPERM;
4781 	}
4782 
4783 	if (is_aux_domain(dev, domain))
4784 		return -EPERM;
4785 
4786 	/* normally dev is not mapped */
4787 	if (unlikely(domain_context_mapped(dev))) {
4788 		struct dmar_domain *old_domain;
4789 
4790 		old_domain = find_domain(dev);
4791 		if (old_domain)
4792 			dmar_remove_one_dev_info(dev);
4793 	}
4794 
4795 	ret = prepare_domain_attach_device(domain, dev);
4796 	if (ret)
4797 		return ret;
4798 
4799 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4800 }
4801 
4802 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4803 					 struct device *dev)
4804 {
4805 	int ret;
4806 
4807 	if (!is_aux_domain(dev, domain))
4808 		return -EPERM;
4809 
4810 	ret = prepare_domain_attach_device(domain, dev);
4811 	if (ret)
4812 		return ret;
4813 
4814 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4815 }
4816 
4817 static void intel_iommu_detach_device(struct iommu_domain *domain,
4818 				      struct device *dev)
4819 {
4820 	dmar_remove_one_dev_info(dev);
4821 }
4822 
4823 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4824 					  struct device *dev)
4825 {
4826 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4827 }
4828 
4829 #ifdef CONFIG_INTEL_IOMMU_SVM
4830 /*
4831  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4832  * VT-d granularity. Invalidation is typically included in the unmap operation
4833  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4834  * owns the first level page tables. Invalidations of translation caches in the
4835  * guest are trapped and passed down to the host.
4836  *
4837  * vIOMMU in the guest will only expose first level page tables, therefore
 * we do not support IOTLB granularity for requests without PASID (second level).
4839  *
4840  * For example, to find the VT-d granularity encoding for IOTLB
4841  * type and page selective granularity within PASID:
4842  * X: indexed by iommu cache type
4843  * Y: indexed by enum iommu_inv_granularity
4844  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4845  */
4846 
4847 static const int
4848 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4849 	/*
4850 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4851 	 * page selective (address granularity)
4852 	 */
4853 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4854 	/* PASID based dev TLBs */
4855 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4856 	/* PASID cache */
4857 	{-EINVAL, -EINVAL, -EINVAL}
4858 };
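/*
 * Example read of the table above: an IOTLB invalidation with
 * IOMMU_INV_GRANU_ADDR granularity maps to QI_GRAN_PSI_PASID (page selective
 * within a PASID), while any granularity for the PASID cache row yields
 * -EINVAL and is rejected by the caller of to_vtd_granularity().
 */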
4859 
4860 static inline int to_vtd_granularity(int type, int granu)
4861 {
4862 	return inv_type_granu_table[type][granu];
4863 }
4864 
4865 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4866 {
4867 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4868 
	/*
	 * VT-d size is encoded as 2^size of 4K pages, 0 for 4K, 9 for 2MB, etc.
	 * The IOMMU cache invalidate API passes granu_size in bytes and the
	 * number of granules of that size that are contiguous in memory.
	 */
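	/*
	 * Worked example: granu_size = 4KiB with nr_granules = 512 describes
	 * 2MB of contiguous space, i.e. nr_pages = 512 and the function
	 * returns order_base_2(512) = 9.
	 */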
4873 	return order_base_2(nr_pages);
4874 }
4875 
4876 static int
4877 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4878 			   struct iommu_cache_invalidate_info *inv_info)
4879 {
4880 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4881 	struct device_domain_info *info;
4882 	struct intel_iommu *iommu;
4883 	unsigned long flags;
4884 	int cache_type;
4885 	u8 bus, devfn;
4886 	u16 did, sid;
4887 	int ret = 0;
4888 	u64 size = 0;
4889 
4890 	if (!inv_info || !dmar_domain)
4891 		return -EINVAL;
4892 
4893 	if (!dev || !dev_is_pci(dev))
4894 		return -ENODEV;
4895 
4896 	iommu = device_to_iommu(dev, &bus, &devfn);
4897 	if (!iommu)
4898 		return -ENODEV;
4899 
4900 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4901 		return -EINVAL;
4902 
4903 	spin_lock_irqsave(&device_domain_lock, flags);
4904 	spin_lock(&iommu->lock);
4905 	info = get_domain_info(dev);
4906 	if (!info) {
4907 		ret = -EINVAL;
4908 		goto out_unlock;
4909 	}
4910 	did = dmar_domain->iommu_did[iommu->seq_id];
4911 	sid = PCI_DEVID(bus, devfn);
4912 
4913 	/* Size is only valid in address selective invalidation */
4914 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4915 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4916 				   inv_info->granu.addr_info.nb_granules);
4917 
4918 	for_each_set_bit(cache_type,
4919 			 (unsigned long *)&inv_info->cache,
4920 			 IOMMU_CACHE_INV_TYPE_NR) {
4921 		int granu = 0;
4922 		u64 pasid = 0;
4923 		u64 addr = 0;
4924 
4925 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4926 		if (granu == -EINVAL) {
4927 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4928 					   cache_type, inv_info->granularity);
4929 			break;
4930 		}
4931 
4932 		/*
4933 		 * PASID is stored in different locations based on the
4934 		 * granularity.
4935 		 */
4936 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4937 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4938 			pasid = inv_info->granu.pasid_info.pasid;
4939 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4940 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4941 			pasid = inv_info->granu.addr_info.pasid;
4942 
4943 		switch (BIT(cache_type)) {
4944 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4945 			/* HW will ignore LSB bits based on address mask */
4946 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4947 			    size &&
4948 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4949 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4950 						   inv_info->granu.addr_info.addr, size);
4951 			}
4952 
4953 			/*
4954 			 * If granu is PASID-selective, address is ignored.
4955 			 * We use npages = -1 to indicate that.
4956 			 */
4957 			qi_flush_piotlb(iommu, did, pasid,
4958 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4959 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4960 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4961 
4962 			if (!info->ats_enabled)
4963 				break;
4964 			/*
4965 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4966 			 * in the guest may assume IOTLB flush is inclusive,
4967 			 * which is more efficient.
4968 			 */
4969 			fallthrough;
4970 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4971 			/*
4972 			 * PASID based device TLB invalidation does not support
4973 			 * IOMMU_INV_GRANU_PASID granularity but only supports
4974 			 * IOMMU_INV_GRANU_ADDR.
			 * The equivalent is to set the size to cover the
			 * entire 64-bit range. The user only provides PASID
			 * info without address info, so we set addr to 0.
4978 			 */
4979 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4980 				size = 64 - VTD_PAGE_SHIFT;
4981 				addr = 0;
4982 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4983 				addr = inv_info->granu.addr_info.addr;
4984 			}
4985 
4986 			if (info->ats_enabled)
4987 				qi_flush_dev_iotlb_pasid(iommu, sid,
4988 						info->pfsid, pasid,
4989 						info->ats_qdep, addr,
4990 						size);
4991 			else
4992 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4993 			break;
4994 		default:
4995 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4996 					    cache_type);
4997 			ret = -EINVAL;
4998 		}
4999 	}
5000 out_unlock:
5001 	spin_unlock(&iommu->lock);
5002 	spin_unlock_irqrestore(&device_domain_lock, flags);
5003 
5004 	return ret;
5005 }
5006 #endif
5007 
5008 static int intel_iommu_map(struct iommu_domain *domain,
5009 			   unsigned long iova, phys_addr_t hpa,
5010 			   size_t size, int iommu_prot, gfp_t gfp)
5011 {
5012 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5013 	u64 max_addr;
5014 	int prot = 0;
5015 
5016 	if (iommu_prot & IOMMU_READ)
5017 		prot |= DMA_PTE_READ;
5018 	if (iommu_prot & IOMMU_WRITE)
5019 		prot |= DMA_PTE_WRITE;
5020 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5021 		prot |= DMA_PTE_SNP;
5022 
5023 	max_addr = iova + size;
5024 	if (dmar_domain->max_addr < max_addr) {
5025 		u64 end;
5026 
5027 		/* check if minimum agaw is sufficient for mapped address */
5028 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5029 		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
5033 			return -EFAULT;
5034 		}
5035 		dmar_domain->max_addr = max_addr;
5036 	}
	/*
	 * Round up size to the next multiple of PAGE_SIZE if it and the low
	 * bits of hpa would take us onto the next page.
	 */
5039 	size = aligned_nrpages(hpa, size);
5040 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5041 				hpa >> VTD_PAGE_SHIFT, size, prot);
5042 }
5043 
5044 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5045 				unsigned long iova, size_t size,
5046 				struct iommu_iotlb_gather *gather)
5047 {
5048 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5049 	unsigned long start_pfn, last_pfn;
5050 	int level = 0;
5051 
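	/*
	 * Note that only the page-table entries are torn down here; page-table
	 * pages that become unused are queued on gather->freelist, and the
	 * IOTLB flush plus the actual freeing are deferred to
	 * intel_iommu_tlb_sync().
	 */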
	/*
	 * Cope with the horrid API, which requires us to unmap more than the
	 * size argument if it happens to be a large-page mapping.
	 */
5054 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5055 
5056 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5057 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5058 
5059 	start_pfn = iova >> VTD_PAGE_SHIFT;
5060 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5061 
5062 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5063 					last_pfn, gather->freelist);
5064 
5065 	if (dmar_domain->max_addr == iova + size)
5066 		dmar_domain->max_addr = iova;
5067 
5068 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5069 
5070 	return size;
5071 }
5072 
5073 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5074 				 struct iommu_iotlb_gather *gather)
5075 {
5076 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5077 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5078 	size_t size = gather->end - gather->start;
5079 	unsigned long start_pfn;
5080 	unsigned long nrpages;
5081 	int iommu_id;
5082 
5083 	nrpages = aligned_nrpages(gather->start, size);
5084 	start_pfn = mm_to_dma_pfn(iova_pfn);
5085 
5086 	for_each_domain_iommu(iommu_id, dmar_domain)
5087 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5088 				      start_pfn, nrpages, !gather->freelist, 0);
5089 
5090 	dma_free_pagelist(gather->freelist);
5091 }
5092 
5093 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5094 					    dma_addr_t iova)
5095 {
5096 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5097 	struct dma_pte *pte;
5098 	int level = 0;
5099 	u64 phys = 0;
5100 
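	/*
	 * pfn_to_dma_pte() reports the level at which the walk stopped, so a
	 * superpage mapping keeps the in-page offset for that level. As a
	 * sketch (assuming the usual 9-bit stride per level and a 4KiB
	 * VTD_PAGE_SHIFT), a 2MB mapping found at level 2 keeps the low
	 * 21 bits of the IOVA as the offset added to dma_pte_addr().
	 */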
5101 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5102 	if (pte && dma_pte_present(pte))
5103 		phys = dma_pte_addr(pte) +
5104 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5105 						VTD_PAGE_SHIFT) - 1));
5106 
5107 	return phys;
5108 }
5109 
5110 static bool intel_iommu_capable(enum iommu_cap cap)
5111 {
5112 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5113 		return domain_update_iommu_snooping(NULL) == 1;
5114 	if (cap == IOMMU_CAP_INTR_REMAP)
5115 		return irq_remapping_enabled == 1;
5116 
5117 	return false;
5118 }
5119 
5120 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5121 {
5122 	struct intel_iommu *iommu;
5123 
5124 	iommu = device_to_iommu(dev, NULL, NULL);
5125 	if (!iommu)
5126 		return ERR_PTR(-ENODEV);
5127 
5128 	if (translation_pre_enabled(iommu))
5129 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5130 
5131 	return &iommu->iommu;
5132 }
5133 
5134 static void intel_iommu_release_device(struct device *dev)
5135 {
5136 	struct intel_iommu *iommu;
5137 
5138 	iommu = device_to_iommu(dev, NULL, NULL);
5139 	if (!iommu)
5140 		return;
5141 
5142 	dmar_remove_one_dev_info(dev);
5143 
5144 	set_dma_ops(dev, NULL);
5145 }
5146 
5147 static void intel_iommu_probe_finalize(struct device *dev)
5148 {
5149 	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5150 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5151 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5152 
5153 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5154 		iommu_setup_dma_ops(dev, base,
5155 				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5156 	else
5157 		set_dma_ops(dev, NULL);
5158 }
5159 
5160 static void intel_iommu_get_resv_regions(struct device *device,
5161 					 struct list_head *head)
5162 {
5163 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5164 	struct iommu_resv_region *reg;
5165 	struct dmar_rmrr_unit *rmrr;
5166 	struct device *i_dev;
5167 	int i;
5168 
5169 	down_read(&dmar_global_lock);
5170 	for_each_rmrr_units(rmrr) {
5171 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5172 					  i, i_dev) {
5173 			struct iommu_resv_region *resv;
5174 			enum iommu_resv_type type;
5175 			size_t length;
5176 
5177 			if (i_dev != device &&
5178 			    !is_downstream_to_pci_bridge(device, i_dev))
5179 				continue;
5180 
5181 			length = rmrr->end_address - rmrr->base_address + 1;
5182 
5183 			type = device_rmrr_is_relaxable(device) ?
5184 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5185 
5186 			resv = iommu_alloc_resv_region(rmrr->base_address,
5187 						       length, prot, type);
5188 			if (!resv)
5189 				break;
5190 
5191 			list_add_tail(&resv->list, head);
5192 		}
5193 	}
5194 	up_read(&dmar_global_lock);
5195 
5196 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5197 	if (dev_is_pci(device)) {
5198 		struct pci_dev *pdev = to_pci_dev(device);
5199 
5200 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5201 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5202 						   IOMMU_RESV_DIRECT_RELAXABLE);
5203 			if (reg)
5204 				list_add_tail(&reg->list, head);
5205 		}
5206 	}
5207 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5208 
5209 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5210 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5211 				      0, IOMMU_RESV_MSI);
5212 	if (!reg)
5213 		return;
5214 	list_add_tail(&reg->list, head);
5215 }
5216 
5217 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5218 {
5219 	struct device_domain_info *info;
5220 	struct context_entry *context;
5221 	struct dmar_domain *domain;
5222 	unsigned long flags;
5223 	u64 ctx_lo;
5224 	int ret;
5225 
5226 	domain = find_domain(dev);
5227 	if (!domain)
5228 		return -EINVAL;
5229 
5230 	spin_lock_irqsave(&device_domain_lock, flags);
5231 	spin_lock(&iommu->lock);
5232 
5233 	ret = -EINVAL;
5234 	info = get_domain_info(dev);
5235 	if (!info || !info->pasid_supported)
5236 		goto out;
5237 
5238 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5239 	if (WARN_ON(!context))
5240 		goto out;
5241 
5242 	ctx_lo = context[0].lo;
5243 
5244 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5245 		ctx_lo |= CONTEXT_PASIDE;
5246 		context[0].lo = ctx_lo;
5247 		wmb();
5248 		iommu->flush.flush_context(iommu,
5249 					   domain->iommu_did[iommu->seq_id],
5250 					   PCI_DEVID(info->bus, info->devfn),
5251 					   DMA_CCMD_MASK_NOBIT,
5252 					   DMA_CCMD_DEVICE_INVL);
5253 	}
5254 
5255 	/* Enable PASID support in the device, if it wasn't already */
5256 	if (!info->pasid_enabled)
5257 		iommu_enable_dev_iotlb(info);
5258 
5259 	ret = 0;
5260 
5261  out:
5262 	spin_unlock(&iommu->lock);
5263 	spin_unlock_irqrestore(&device_domain_lock, flags);
5264 
5265 	return ret;
5266 }
5267 
5268 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5269 {
5270 	if (dev_is_pci(dev))
5271 		return pci_device_group(dev);
5272 	return generic_device_group(dev);
5273 }
5274 
5275 static int intel_iommu_enable_auxd(struct device *dev)
5276 {
5277 	struct device_domain_info *info;
5278 	struct intel_iommu *iommu;
5279 	unsigned long flags;
5280 	int ret;
5281 
5282 	iommu = device_to_iommu(dev, NULL, NULL);
5283 	if (!iommu || dmar_disabled)
5284 		return -EINVAL;
5285 
5286 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5287 		return -EINVAL;
5288 
5289 	ret = intel_iommu_enable_pasid(iommu, dev);
5290 	if (ret)
5291 		return -ENODEV;
5292 
5293 	spin_lock_irqsave(&device_domain_lock, flags);
5294 	info = get_domain_info(dev);
5295 	info->auxd_enabled = 1;
5296 	spin_unlock_irqrestore(&device_domain_lock, flags);
5297 
5298 	return 0;
5299 }
5300 
5301 static int intel_iommu_disable_auxd(struct device *dev)
5302 {
5303 	struct device_domain_info *info;
5304 	unsigned long flags;
5305 
5306 	spin_lock_irqsave(&device_domain_lock, flags);
5307 	info = get_domain_info(dev);
5308 	if (!WARN_ON(!info))
5309 		info->auxd_enabled = 0;
5310 	spin_unlock_irqrestore(&device_domain_lock, flags);
5311 
5312 	return 0;
5313 }
5314 
5315 /*
 * A PCI Express Designated Vendor-Specific Extended Capability is defined
 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
 * so that system software and tools can detect endpoint devices supporting
 * Intel scalable IOV without any host driver dependency.
 *
 * Returns the config space offset of the matching extended capability
 * structure, or 0 if the device does not support it.
5324  */
5325 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5326 {
5327 	int pos;
5328 	u16 vendor, id;
5329 
5330 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
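	/*
	 * Layout assumed from the PCIe DVSEC definition: 0x23 is the DVSEC
	 * extended capability ID, the DVSEC vendor ID lives at offset 4 of
	 * the capability and the DVSEC ID at offset 8; DVSEC ID 5 is the
	 * value the SIOV spec uses for scalable IOV capable endpoints.
	 */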
5331 	while (pos) {
5332 		pci_read_config_word(pdev, pos + 4, &vendor);
5333 		pci_read_config_word(pdev, pos + 8, &id);
5334 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5335 			return pos;
5336 
5337 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5338 	}
5339 
5340 	return 0;
5341 }
5342 
5343 static bool
5344 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5345 {
5346 	if (feat == IOMMU_DEV_FEAT_AUX) {
5347 		int ret;
5348 
5349 		if (!dev_is_pci(dev) || dmar_disabled ||
5350 		    !scalable_mode_support() || !pasid_mode_support())
5351 			return false;
5352 
5353 		ret = pci_pasid_features(to_pci_dev(dev));
5354 		if (ret < 0)
5355 			return false;
5356 
5357 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5358 	}
5359 
5360 	if (feat == IOMMU_DEV_FEAT_SVA) {
5361 		struct device_domain_info *info = get_domain_info(dev);
5362 
5363 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5364 			info->pasid_supported && info->pri_supported &&
5365 			info->ats_supported;
5366 	}
5367 
5368 	return false;
5369 }
5370 
5371 static int
5372 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5373 {
5374 	if (feat == IOMMU_DEV_FEAT_AUX)
5375 		return intel_iommu_enable_auxd(dev);
5376 
5377 	if (feat == IOMMU_DEV_FEAT_SVA) {
5378 		struct device_domain_info *info = get_domain_info(dev);
5379 
5380 		if (!info)
5381 			return -EINVAL;
5382 
5383 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5384 			return 0;
5385 	}
5386 
5387 	return -ENODEV;
5388 }
5389 
5390 static int
5391 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5392 {
5393 	if (feat == IOMMU_DEV_FEAT_AUX)
5394 		return intel_iommu_disable_auxd(dev);
5395 
5396 	return -ENODEV;
5397 }
5398 
5399 static bool
5400 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5401 {
5402 	struct device_domain_info *info = get_domain_info(dev);
5403 
5404 	if (feat == IOMMU_DEV_FEAT_AUX)
5405 		return scalable_mode_support() && info && info->auxd_enabled;
5406 
5407 	return false;
5408 }
5409 
5410 static int
5411 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5412 {
5413 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5414 
5415 	return dmar_domain->default_pasid > 0 ?
5416 			dmar_domain->default_pasid : -EINVAL;
5417 }
5418 
5419 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5420 					   struct device *dev)
5421 {
5422 	return attach_deferred(dev);
5423 }
5424 
5425 static int
5426 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5427 			    enum iommu_attr attr, void *data)
5428 {
5429 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5430 	unsigned long flags;
5431 	int ret = 0;
5432 
5433 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5434 		return -EINVAL;
5435 
5436 	switch (attr) {
5437 	case DOMAIN_ATTR_NESTING:
5438 		spin_lock_irqsave(&device_domain_lock, flags);
5439 		if (nested_mode_support() &&
5440 		    list_empty(&dmar_domain->devices)) {
5441 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5442 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5443 		} else {
5444 			ret = -ENODEV;
5445 		}
5446 		spin_unlock_irqrestore(&device_domain_lock, flags);
5447 		break;
5448 	default:
5449 		ret = -EINVAL;
5450 		break;
5451 	}
5452 
5453 	return ret;
5454 }
5455 
5456 static bool domain_use_flush_queue(void)
5457 {
5458 	struct dmar_drhd_unit *drhd;
5459 	struct intel_iommu *iommu;
5460 	bool r = true;
5461 
5462 	if (intel_iommu_strict)
5463 		return false;
5464 
5465 	/*
5466 	 * The flush queue implementation does not perform page-selective
5467 	 * invalidations that are required for efficient TLB flushes in virtual
5468 	 * environments. The benefit of batching is likely to be much lower than
5469 	 * the overhead of synchronizing the virtual and physical IOMMU
5470 	 * page-tables.
5471 	 */
5472 	rcu_read_lock();
5473 	for_each_active_iommu(iommu, drhd) {
5474 		if (!cap_caching_mode(iommu->cap))
5475 			continue;
5476 
5477 		pr_warn_once("IOMMU batching is disabled due to virtualization");
5478 		r = false;
5479 		break;
5480 	}
5481 	rcu_read_unlock();
5482 
5483 	return r;
5484 }
5485 
5486 static int
5487 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5488 			    enum iommu_attr attr, void *data)
5489 {
5490 	switch (domain->type) {
5491 	case IOMMU_DOMAIN_UNMANAGED:
5492 		return -ENODEV;
5493 	case IOMMU_DOMAIN_DMA:
5494 		switch (attr) {
5495 		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5496 			*(int *)data = domain_use_flush_queue();
5497 			return 0;
5498 		default:
5499 			return -ENODEV;
5500 		}
5501 		break;
5502 	default:
5503 		return -EINVAL;
5504 	}
5505 }
5506 
5507 /*
5508  * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be allowed to apply quirks,
 * so that they cannot use them to bypass the IOMMU restrictions.
5511  */
5512 static bool risky_device(struct pci_dev *pdev)
5513 {
5514 	if (pdev->untrusted) {
5515 		pci_info(pdev,
5516 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5517 			 pdev->vendor, pdev->device);
5518 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5519 		return true;
5520 	}
5521 	return false;
5522 }
5523 
5524 static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
5525 			     unsigned long clf_pages)
5526 {
5527 	struct dma_pte *first_pte = NULL, *pte = NULL;
5528 	unsigned long lvl_pages = 0;
5529 	int level = 0;
5530 
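	/*
	 * Walk the leaf PTEs covering [clf_pfn, clf_pfn + clf_pages) and flush
	 * the CPU cache lines holding them, batching one domain_flush_cache()
	 * call per contiguous run of PTEs inside a page-table page. This is
	 * only needed when the IOMMU cannot snoop the CPU cache (see the
	 * caller below).
	 */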
5531 	while (clf_pages > 0) {
5532 		if (!pte) {
5533 			level = 0;
5534 			pte = pfn_to_dma_pte(domain, clf_pfn, &level);
5535 			if (WARN_ON(!pte))
5536 				return;
5537 			first_pte = pte;
5538 			lvl_pages = lvl_to_nr_pages(level);
5539 		}
5540 
5541 		if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
5542 			return;
5543 
5544 		clf_pages -= lvl_pages;
5545 		clf_pfn += lvl_pages;
5546 		pte++;
5547 
5548 		if (!clf_pages || first_pte_in_page(pte) ||
5549 		    (level > 1 && clf_pages < lvl_pages)) {
5550 			domain_flush_cache(domain, first_pte,
5551 					   (void *)pte - (void *)first_pte);
5552 			pte = NULL;
5553 		}
5554 	}
5555 }
5556 
5557 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5558 				       unsigned long iova, size_t size)
5559 {
5560 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5561 	unsigned long pages = aligned_nrpages(iova, size);
5562 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5563 	struct intel_iommu *iommu;
5564 	int iommu_id;
5565 
5566 	if (!dmar_domain->iommu_coherency)
5567 		clflush_sync_map(dmar_domain, pfn, pages);
5568 
5569 	for_each_domain_iommu(iommu_id, dmar_domain) {
5570 		iommu = g_iommus[iommu_id];
5571 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5572 	}
5573 }
5574 
5575 const struct iommu_ops intel_iommu_ops = {
5576 	.capable		= intel_iommu_capable,
5577 	.domain_alloc		= intel_iommu_domain_alloc,
5578 	.domain_free		= intel_iommu_domain_free,
5579 	.domain_get_attr        = intel_iommu_domain_get_attr,
5580 	.domain_set_attr	= intel_iommu_domain_set_attr,
5581 	.attach_dev		= intel_iommu_attach_device,
5582 	.detach_dev		= intel_iommu_detach_device,
5583 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5584 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5585 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5586 	.map			= intel_iommu_map,
5587 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5588 	.unmap			= intel_iommu_unmap,
5589 	.flush_iotlb_all        = intel_flush_iotlb_all,
5590 	.iotlb_sync		= intel_iommu_tlb_sync,
5591 	.iova_to_phys		= intel_iommu_iova_to_phys,
5592 	.probe_device		= intel_iommu_probe_device,
5593 	.probe_finalize		= intel_iommu_probe_finalize,
5594 	.release_device		= intel_iommu_release_device,
5595 	.get_resv_regions	= intel_iommu_get_resv_regions,
5596 	.put_resv_regions	= generic_iommu_put_resv_regions,
5597 	.device_group		= intel_iommu_device_group,
5598 	.dev_has_feat		= intel_iommu_dev_has_feat,
5599 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5600 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5601 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5602 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5603 	.def_domain_type	= device_def_domain_type,
5604 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5605 #ifdef CONFIG_INTEL_IOMMU_SVM
5606 	.cache_invalidate	= intel_iommu_sva_invalidate,
5607 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5608 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5609 	.sva_bind		= intel_svm_bind,
5610 	.sva_unbind		= intel_svm_unbind,
5611 	.sva_get_pasid		= intel_svm_get_pasid,
5612 	.page_response		= intel_svm_page_response,
5613 #endif
5614 };
5615 
5616 static void quirk_iommu_igfx(struct pci_dev *dev)
5617 {
5618 	if (risky_device(dev))
5619 		return;
5620 
5621 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5622 	dmar_map_gfx = 0;
5623 }
5624 
5625 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5633 
5634 /* Broadwell igfx malfunctions with dmar */
5635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5659 
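/*
 * Setting rwbf_quirk makes iommu_flush_write_buffer() perform the flush
 * even though the capability register of these chipsets does not
 * advertise RWBF (required write-buffer flushing).
 */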
5660 static void quirk_iommu_rwbf(struct pci_dev *dev)
5661 {
5662 	if (risky_device(dev))
5663 		return;
5664 
5665 	/*
5666 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5667 	 * but needs it. Same seems to hold for the desktop versions.
5668 	 */
5669 	pci_info(dev, "Forcing write-buffer flush capability\n");
5670 	rwbf_quirk = 1;
5671 }
5672 
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5680 
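/*
 * GGC is the Graphics Control register at offset 0x52 in the config space
 * of the devices matched below.  As used here, bits 11:8 describe how much
 * memory the BIOS set aside for the GTT and whether a VT (shadow GTT)
 * layout was selected.
 */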
5681 #define GGC 0x52
5682 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5683 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5684 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5685 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5686 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5687 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5688 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5689 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5690 
5691 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5692 {
5693 	unsigned short ggc;
5694 
5695 	if (risky_device(dev))
5696 		return;
5697 
5698 	if (pci_read_config_word(dev, GGC, &ggc))
5699 		return;
5700 
5701 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5702 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5703 		dmar_map_gfx = 0;
5704 	} else if (dmar_map_gfx) {
5705 		/* we have to ensure the gfx device is idle before we flush */
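		/*
		 * intel_iommu_strict forces synchronous IOTLB invalidation
		 * on every unmap, the same behaviour as booting with
		 * intel_iommu=strict.
		 */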
5706 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5707 		intel_iommu_strict = 1;
5708 	}
5709 }
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5714 
5715 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5716 {
5717 	unsigned short ver;
5718 
5719 	if (!IS_GFX_DEVICE(dev))
5720 		return;
5721 
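	/*
	 * The upper byte of the PCI device ID selects the graphics
	 * generation.  For the platforms listed below, setting
	 * iommu_skip_te_disable makes iommu_disable_translation() leave
	 * the translation enable bit set for the DMAR unit dedicated to
	 * the graphics device.
	 */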
5722 	ver = (dev->device >> 8) & 0xff;
5723 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5724 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5725 	    ver != 0x9a)
5726 		return;
5727 
5728 	if (risky_device(dev))
5729 		return;
5730 
5731 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5732 	iommu_skip_te_disable = 1;
5733 }
5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5735 
5736 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5737    ISOCH DMAR unit for the Azalia sound device, but not give it any
5738    TLB entries, which causes it to deadlock. Check for that.  We do
5739    this in a function called from init_dmars(), instead of in a PCI
5740    quirk, because we don't want to print the obnoxious "BIOS broken"
5741    message if VT-d is actually disabled.
5742 */
5743 static void __init check_tylersburg_isoch(void)
5744 {
5745 	struct pci_dev *pdev;
5746 	uint32_t vtisochctrl;
5747 
5748 	/* If there's no Azalia in the system anyway, forget it. */
5749 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5750 	if (!pdev)
5751 		return;
5752 
5753 	if (risky_device(pdev)) {
5754 		pci_dev_put(pdev);
5755 		return;
5756 	}
5757 
5758 	pci_dev_put(pdev);
5759 
5760 	/* System Management Registers. Might be hidden, in which case
5761 	   we can't do the sanity check. But that's OK, because the
5762 	   known-broken BIOSes _don't_ actually hide it, so far. */
5763 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5764 	if (!pdev)
5765 		return;
5766 
5767 	if (risky_device(pdev)) {
5768 		pci_dev_put(pdev);
5769 		return;
5770 	}
5771 
5772 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5773 		pci_dev_put(pdev);
5774 		return;
5775 	}
5776 
5777 	pci_dev_put(pdev);
5778 
5779 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5780 	if (vtisochctrl & 1)
5781 		return;
5782 
5783 	/* Drop all bits other than the number of TLB entries */
5784 	vtisochctrl &= 0x1c;
5785 
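	/*
	 * The masked field is the TLB entry count itself, which is why it
	 * can be compared against 0x10 (16) and printed as a plain decimal
	 * below.
	 */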
5786 	/* If we have the recommended number of TLB entries (16), fine. */
5787 	if (vtisochctrl == 0x10)
5788 		return;
5789 
5790 	/* Zero TLB entries? The BIOS is clearly broken; warn and work around it. */
5791 	if (!vtisochctrl) {
5792 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5793 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5794 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5795 		     dmi_get_system_info(DMI_BIOS_VERSION),
5796 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5797 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5798 		return;
5799 	}
5800 
5801 	pr_warn("Recommended number of TLB entries for the ISOCH DMAR unit is 16; your BIOS set %d\n",
5802 		vtisochctrl);
5803 }
5804