xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 9dbbc3b9)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 
48 #include "../irq_remapping.h"
49 #include "../iommu-sva-lib.h"
50 #include "pasid.h"
51 #include "cap_audit.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
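/*
 * Worked example, assuming VTD_PAGE_SHIFT == 12 on a 64-bit kernel: for
 * gaw == 48, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xfffffffff, which
 * fits in an unsigned long, so DOMAIN_MAX_PFN(48) is the same value and
 * DOMAIN_MAX_ADDR(48) == 0xfffffffff000, the base of the last 4KiB page in
 * a 48-bit address space.
 */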
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are power-of-two multiples of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
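/*
 * ~0xFFFUL leaves every bit from 12 upwards set, i.e. 4KiB, 8KiB, 16KiB, ...
 * are all advertised. As an illustration, the IOMMU core would then map a
 * 12KiB region starting on an 8KiB boundary as one 8KiB chunk followed by
 * one 4KiB chunk.
 */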
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
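/*
 * The helpers above encode the fixed relationship between the adjusted
 * guest address width (agaw), the address width in bits and the number of
 * page-table levels: each agaw step adds one level and LEVEL_STRIDE == 9
 * address bits, e.g. agaw 1 <-> 39 bits <-> 3 levels, agaw 2 <-> 48 bits
 * <-> 4 levels, agaw 3 <-> 57 bits <-> 5 levels.
 */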
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
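/*
 * For example, with 4KiB VT-d pages a level-1 entry covers a single page,
 * a level-2 entry covers 1 << 9 == 512 pages (2MiB) and a level-3 entry
 * covers 1 << 18 pages (1GiB), capped at MAX_AGAW_PFN_WIDTH.
 */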
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
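/*
 * On x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the conversions
 * above are identity operations; they only become real shifts in the
 * hypothetical case of MM pages larger than 4KiB, where e.g. a 16KiB MM
 * page would span four DMA pfns.
 */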
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
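/*
 * With a 4KiB root table and 16-byte root entries (a lo/hi u64 pair) this
 * works out to 256 entries, one root entry per PCI bus number.
 */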
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return (c->hi >> 8) & 0xffff;
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
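/*
 * Summary of the context-entry layout as encoded by the helpers above:
 * lo bit 0 is the present bit, lo bit 1 (cleared by
 * context_set_fault_enable()) disables fault processing, lo bits 3:2 hold
 * the translation type and lo bits 63:12 the page-table or pass-through
 * root address; hi bits 2:0 hold the address width and hi bits 23:8 the
 * domain id. Bit 11 of lo and bit 3 of hi are used as the PASID-enable and
 * software "copied" markers respectively.
 */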
289 
290 /*
291  * This domain is a statically identity mapping domain.
292  *	1. This domain creates a static 1:1 mapping to all usable memory.
293  *	2. It maps to each iommu if successful.
294  *	3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 struct dmar_satc_unit {
321 	struct list_head list;		/* list of SATC units */
322 	struct acpi_dmar_header *hdr;	/* ACPI header */
323 	struct dmar_dev_scope *devices;	/* target devices */
324 	struct intel_iommu *iommu;	/* the corresponding iommu */
325 	int devices_cnt;		/* target device count */
326 	u8 atc_required:1;		/* ATS is required */
327 };
328 
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331 static LIST_HEAD(dmar_satc_units);
332 
333 #define for_each_rmrr_units(rmrr) \
334 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 
336 /* number of registered IOMMUs; used to bound indexing into g_iommus */
337 static int g_num_of_iommus;
338 
339 static void domain_exit(struct dmar_domain *domain);
340 static void domain_remove_dev_info(struct dmar_domain *domain);
341 static void dmar_remove_one_dev_info(struct device *dev);
342 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
343 static int intel_iommu_attach_device(struct iommu_domain *domain,
344 				     struct device *dev);
345 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
346 					    dma_addr_t iova);
347 
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
350 #else
351 int dmar_disabled = 1;
352 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
353 
354 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
355 int intel_iommu_sm = 1;
356 #else
357 int intel_iommu_sm;
358 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
359 
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
362 
363 static int dmar_map_gfx = 1;
364 static int intel_iommu_strict;
365 static int intel_iommu_superpage = 1;
366 static int iommu_identity_mapping;
367 static int iommu_skip_te_disable;
368 
369 #define IDENTMAP_GFX		2
370 #define IDENTMAP_AZALIA		4
371 
372 int intel_iommu_gfx_mapped;
373 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
374 
375 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
376 struct device_domain_info *get_domain_info(struct device *dev)
377 {
378 	struct device_domain_info *info;
379 
380 	if (!dev)
381 		return NULL;
382 
383 	info = dev_iommu_priv_get(dev);
384 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
385 		return NULL;
386 
387 	return info;
388 }
389 
390 DEFINE_SPINLOCK(device_domain_lock);
391 static LIST_HEAD(device_domain_list);
392 
393 /*
394  * Iterate over elements in device_domain_list and call the specified
395  * callback @fn against each element.
396  */
397 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
398 				     void *data), void *data)
399 {
400 	int ret = 0;
401 	unsigned long flags;
402 	struct device_domain_info *info;
403 
404 	spin_lock_irqsave(&device_domain_lock, flags);
405 	list_for_each_entry(info, &device_domain_list, global) {
406 		ret = fn(info, data);
407 		if (ret) {
408 			spin_unlock_irqrestore(&device_domain_lock, flags);
409 			return ret;
410 		}
411 	}
412 	spin_unlock_irqrestore(&device_domain_lock, flags);
413 
414 	return 0;
415 }
416 
417 const struct iommu_ops intel_iommu_ops;
418 
419 static bool translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
422 }
423 
424 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
425 {
426 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
427 }
428 
429 static void init_translation_status(struct intel_iommu *iommu)
430 {
431 	u32 gsts;
432 
433 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
434 	if (gsts & DMA_GSTS_TES)
435 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
436 }
437 
438 static int __init intel_iommu_setup(char *str)
439 {
440 	if (!str)
441 		return -EINVAL;
442 	while (*str) {
443 		if (!strncmp(str, "on", 2)) {
444 			dmar_disabled = 0;
445 			pr_info("IOMMU enabled\n");
446 		} else if (!strncmp(str, "off", 3)) {
447 			dmar_disabled = 1;
448 			no_platform_optin = 1;
449 			pr_info("IOMMU disabled\n");
450 		} else if (!strncmp(str, "igfx_off", 8)) {
451 			dmar_map_gfx = 0;
452 			pr_info("Disable GFX device mapping\n");
453 		} else if (!strncmp(str, "forcedac", 8)) {
454 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
455 			iommu_dma_forcedac = true;
456 		} else if (!strncmp(str, "strict", 6)) {
457 			pr_info("Disable batched IOTLB flush\n");
458 			intel_iommu_strict = 1;
459 		} else if (!strncmp(str, "sp_off", 6)) {
460 			pr_info("Disable supported super page\n");
461 			intel_iommu_superpage = 0;
462 		} else if (!strncmp(str, "sm_on", 5)) {
463 			pr_info("Intel-IOMMU: scalable mode supported\n");
464 			intel_iommu_sm = 1;
465 		} else if (!strncmp(str, "tboot_noforce", 13)) {
466 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose a security risk for tboot\n");
467 			intel_iommu_tboot_noforce = 1;
468 		}
469 
470 		str += strcspn(str, ",");
471 		while (*str == ',')
472 			str++;
473 	}
474 	return 0;
475 }
476 __setup("intel_iommu=", intel_iommu_setup);
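/*
 * Example: booting with "intel_iommu=on,sm_on" enables both DMA remapping
 * and scalable mode. Options are comma separated and unrecognized tokens
 * are silently skipped by the loop above.
 */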
477 
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480 
481 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 {
483 	struct dmar_domain **domains;
484 	int idx = did >> 8;
485 
486 	domains = iommu->domains[idx];
487 	if (!domains)
488 		return NULL;
489 
490 	return domains[did & 0xff];
491 }
492 
493 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494 			     struct dmar_domain *domain)
495 {
496 	struct dmar_domain **domains;
497 	int idx = did >> 8;
498 
499 	if (!iommu->domains[idx]) {
500 		size_t size = 256 * sizeof(struct dmar_domain *);
501 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 	}
503 
504 	domains = iommu->domains[idx];
505 	if (WARN_ON(!domains))
506 		return;
507 	else
508 		domains[did & 0xff] = domain;
509 }
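/*
 * iommu->domains is a lazily populated two-level table: the high byte of
 * the domain id selects one of up to 256 pointer pages and the low byte
 * selects the slot within it, so e.g. did 0x1234 lives at
 * iommu->domains[0x12][0x34].
 */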
510 
511 void *alloc_pgtable_page(int node)
512 {
513 	struct page *page;
514 	void *vaddr = NULL;
515 
516 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 	if (page)
518 		vaddr = page_address(page);
519 	return vaddr;
520 }
521 
522 void free_pgtable_page(void *vaddr)
523 {
524 	free_page((unsigned long)vaddr);
525 }
526 
527 static inline void *alloc_domain_mem(void)
528 {
529 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 }
531 
532 static void free_domain_mem(void *vaddr)
533 {
534 	kmem_cache_free(iommu_domain_cache, vaddr);
535 }
536 
537 static inline void *alloc_devinfo_mem(void)
538 {
539 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 }
541 
542 static inline void free_devinfo_mem(void *vaddr)
543 {
544 	kmem_cache_free(iommu_devinfo_cache, vaddr);
545 }
546 
547 static inline int domain_type_is_si(struct dmar_domain *domain)
548 {
549 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 }
551 
552 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 {
554 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 }
556 
557 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 				       unsigned long pfn)
559 {
560 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561 
562 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 }
564 
565 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566 {
567 	unsigned long sagaw;
568 	int agaw;
569 
570 	sagaw = cap_sagaw(iommu->cap);
571 	for (agaw = width_to_agaw(max_gaw);
572 	     agaw >= 0; agaw--) {
573 		if (test_bit(agaw, &sagaw))
574 			break;
575 	}
576 
577 	return agaw;
578 }
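/*
 * Illustration: if cap_sagaw() reports only bit 2 set (48-bit, 4-level
 * second-stage paging per the VT-d SAGAW encoding) and max_gaw is 57, the
 * loop starts at agaw 3 and walks down until it finds the supported
 * agaw 2.
 */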
579 
580 /*
581  * Calculate max SAGAW for each iommu.
582  */
583 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 {
585 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586 }
587 
588 /*
589  * Calculate the agaw for each iommu.
590  * "SAGAW" may be different across iommus; use a default agaw and fall
591  * back to a smaller supported agaw for iommus that don't support it.
592  */
593 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 {
595 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 }
597 
598 /* This function only returns a single iommu in a domain */
599 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600 {
601 	int iommu_id;
602 
603 	/* si_domain and vm domain should not get here. */
604 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 		return NULL;
606 
607 	for_each_domain_iommu(iommu_id, domain)
608 		break;
609 
610 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 		return NULL;
612 
613 	return g_iommus[iommu_id];
614 }
615 
616 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617 {
618 	return sm_supported(iommu) ?
619 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 }
621 
622 static void domain_update_iommu_coherency(struct dmar_domain *domain)
623 {
624 	struct dmar_drhd_unit *drhd;
625 	struct intel_iommu *iommu;
626 	bool found = false;
627 	int i;
628 
629 	domain->iommu_coherency = true;
630 
631 	for_each_domain_iommu(i, domain) {
632 		found = true;
633 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
634 			domain->iommu_coherency = false;
635 			break;
636 		}
637 	}
638 	if (found)
639 		return;
640 
641 	/* No hardware attached; use lowest common denominator */
642 	rcu_read_lock();
643 	for_each_active_iommu(iommu, drhd) {
644 		if (!iommu_paging_structure_coherency(iommu)) {
645 			domain->iommu_coherency = false;
646 			break;
647 		}
648 	}
649 	rcu_read_unlock();
650 }
651 
652 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
653 {
654 	struct dmar_drhd_unit *drhd;
655 	struct intel_iommu *iommu;
656 	bool ret = true;
657 
658 	rcu_read_lock();
659 	for_each_active_iommu(iommu, drhd) {
660 		if (iommu != skip) {
661 			/*
662 			 * If the hardware is operating in the scalable mode,
663 			 * the snooping control is always supported since we
664 			 * always set PASID-table-entry.PGSNP bit if the domain
665 			 * is managed outside (UNMANAGED).
666 			 */
667 			if (!sm_supported(iommu) &&
668 			    !ecap_sc_support(iommu->ecap)) {
669 				ret = false;
670 				break;
671 			}
672 		}
673 	}
674 	rcu_read_unlock();
675 
676 	return ret;
677 }
678 
679 static int domain_update_iommu_superpage(struct dmar_domain *domain,
680 					 struct intel_iommu *skip)
681 {
682 	struct dmar_drhd_unit *drhd;
683 	struct intel_iommu *iommu;
684 	int mask = 0x3;
685 
686 	if (!intel_iommu_superpage)
687 		return 0;
688 
689 	/* set iommu_superpage to the smallest common denominator */
690 	rcu_read_lock();
691 	for_each_active_iommu(iommu, drhd) {
692 		if (iommu != skip) {
693 			if (domain && domain_use_first_level(domain)) {
694 				if (!cap_fl1gp_support(iommu->cap))
695 					mask = 0x1;
696 			} else {
697 				mask &= cap_super_page_val(iommu->cap);
698 			}
699 
700 			if (!mask)
701 				break;
702 		}
703 	}
704 	rcu_read_unlock();
705 
706 	return fls(mask);
707 }
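/*
 * The value returned above is the number of supported superpage levels:
 * 0 means 4KiB pages only, 1 adds 2MiB superpages and 2 adds 1GiB, with
 * cap_super_page_val() bit 0 advertising 2MiB and bit 1 advertising 1GiB
 * per the VT-d capability encoding.
 */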
708 
709 static int domain_update_device_node(struct dmar_domain *domain)
710 {
711 	struct device_domain_info *info;
712 	int nid = NUMA_NO_NODE;
713 
714 	assert_spin_locked(&device_domain_lock);
715 
716 	if (list_empty(&domain->devices))
717 		return NUMA_NO_NODE;
718 
719 	list_for_each_entry(info, &domain->devices, link) {
720 		if (!info->dev)
721 			continue;
722 
723 		/*
724 		 * There could possibly be multiple device numa nodes as devices
725 		 * within the same domain may sit behind different IOMMUs. There
726 		 * isn't a perfect answer in such a situation, so we select a
727 		 * first-come-first-served policy.
728 		 */
729 		nid = dev_to_node(info->dev);
730 		if (nid != NUMA_NO_NODE)
731 			break;
732 	}
733 
734 	return nid;
735 }
736 
737 static void domain_update_iotlb(struct dmar_domain *domain);
738 
739 /* Some capabilities may be different across iommus */
740 static void domain_update_iommu_cap(struct dmar_domain *domain)
741 {
742 	domain_update_iommu_coherency(domain);
743 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
744 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
745 
746 	/*
747 	 * If RHSA is missing, we should default to the device numa node
748 	 * as a fallback.
749 	 */
750 	if (domain->nid == NUMA_NO_NODE)
751 		domain->nid = domain_update_device_node(domain);
752 
753 	/*
754 	 * First-level translation restricts the input-address to a
755 	 * canonical address (i.e., address bits 63:N have the same
756 	 * value as address bit [N-1], where N is 48-bits with 4-level
757 	 * paging and 57-bits with 5-level paging). Hence, skip bit
758 	 * [N-1].
759 	 */
760 	if (domain_use_first_level(domain))
761 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
762 	else
763 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
764 
765 	domain_update_iotlb(domain);
766 }
767 
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 					 u8 devfn, int alloc)
770 {
771 	struct root_entry *root = &iommu->root_entry[bus];
772 	struct context_entry *context;
773 	u64 *entry;
774 
775 	entry = &root->lo;
776 	if (sm_supported(iommu)) {
777 		if (devfn >= 0x80) {
778 			devfn -= 0x80;
779 			entry = &root->hi;
780 		}
781 		devfn *= 2;
782 	}
783 	if (*entry & 1)
784 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 	else {
786 		unsigned long phy_addr;
787 		if (!alloc)
788 			return NULL;
789 
790 		context = alloc_pgtable_page(iommu->node);
791 		if (!context)
792 			return NULL;
793 
794 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 		phy_addr = virt_to_phys((void *)context);
796 		*entry = phy_addr | 1;
797 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
798 	}
799 	return &context[devfn];
800 }
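/*
 * Example in scalable mode: devfn 0x85 selects the upper half of the root
 * entry (root->hi), is folded to 0x05 and then doubled to 0x0a because a
 * scalable-mode context entry occupies two legacy-sized slots, so the
 * caller is handed &context[0x0a] within that table.
 */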
801 
802 static bool attach_deferred(struct device *dev)
803 {
804 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805 }
806 
807 /**
808  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809  *				 sub-hierarchy of a candidate PCI-PCI bridge
810  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811  * @bridge: the candidate PCI-PCI bridge
812  *
813  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814  */
815 static bool
816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817 {
818 	struct pci_dev *pdev, *pbridge;
819 
820 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 		return false;
822 
823 	pdev = to_pci_dev(dev);
824 	pbridge = to_pci_dev(bridge);
825 
826 	if (pbridge->subordinate &&
827 	    pbridge->subordinate->number <= pdev->bus->number &&
828 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 		return true;
830 
831 	return false;
832 }
833 
834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835 {
836 	struct dmar_drhd_unit *drhd;
837 	u32 vtbar;
838 	int rc;
839 
840 	/* We know that this device on this chipset has its own IOMMU.
841 	 * If we find it under a different IOMMU, then the BIOS is lying
842 	 * to us. Hope that the IOMMU for this device is actually
843 	 * disabled, and it needs no translation...
844 	 */
845 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 	if (rc) {
847 		/* "can't" happen */
848 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 		return false;
850 	}
851 	vtbar &= 0xffff0000;
852 
853 	/* we know that this iommu should be at offset 0xa000 from vtbar */
854 	drhd = dmar_find_matched_drhd_unit(pdev);
855 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 		return true;
859 	}
860 
861 	return false;
862 }
863 
864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865 {
866 	if (!iommu || iommu->drhd->ignored)
867 		return true;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pdev = to_pci_dev(dev);
871 
872 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 		    quirk_ioat_snb_local_iommu(pdev))
875 			return true;
876 	}
877 
878 	return false;
879 }
880 
881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882 {
883 	struct dmar_drhd_unit *drhd = NULL;
884 	struct pci_dev *pdev = NULL;
885 	struct intel_iommu *iommu;
886 	struct device *tmp;
887 	u16 segment = 0;
888 	int i;
889 
890 	if (!dev)
891 		return NULL;
892 
893 	if (dev_is_pci(dev)) {
894 		struct pci_dev *pf_pdev;
895 
896 		pdev = pci_real_dma_dev(to_pci_dev(dev));
897 
898 		/* VFs aren't listed in scope tables; we need to look up
899 		 * the PF instead to find the IOMMU. */
900 		pf_pdev = pci_physfn(pdev);
901 		dev = &pf_pdev->dev;
902 		segment = pci_domain_nr(pdev->bus);
903 	} else if (has_acpi_companion(dev))
904 		dev = &ACPI_COMPANION(dev)->dev;
905 
906 	rcu_read_lock();
907 	for_each_iommu(iommu, drhd) {
908 		if (pdev && segment != drhd->segment)
909 			continue;
910 
911 		for_each_active_dev_scope(drhd->devices,
912 					  drhd->devices_cnt, i, tmp) {
913 			if (tmp == dev) {
914 				/* For a VF use its original BDF# not that of the PF
915 				 * which we used for the IOMMU lookup. Strictly speaking
916 				 * we could do this for all PCI devices; we only need to
917 				 * get the BDF# from the scope table for ACPI matches. */
918 				if (pdev && pdev->is_virtfn)
919 					goto got_pdev;
920 
921 				if (bus && devfn) {
922 					*bus = drhd->devices[i].bus;
923 					*devfn = drhd->devices[i].devfn;
924 				}
925 				goto out;
926 			}
927 
928 			if (is_downstream_to_pci_bridge(dev, tmp))
929 				goto got_pdev;
930 		}
931 
932 		if (pdev && drhd->include_all) {
933 		got_pdev:
934 			if (bus && devfn) {
935 				*bus = pdev->bus->number;
936 				*devfn = pdev->devfn;
937 			}
938 			goto out;
939 		}
940 	}
941 	iommu = NULL;
942  out:
943 	if (iommu_is_dummy(iommu, dev))
944 		iommu = NULL;
945 
946 	rcu_read_unlock();
947 
948 	return iommu;
949 }
950 
951 static void domain_flush_cache(struct dmar_domain *domain,
952 			       void *addr, int size)
953 {
954 	if (!domain->iommu_coherency)
955 		clflush_cache_range(addr, size);
956 }
957 
958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959 {
960 	struct context_entry *context;
961 	int ret = 0;
962 	unsigned long flags;
963 
964 	spin_lock_irqsave(&iommu->lock, flags);
965 	context = iommu_context_addr(iommu, bus, devfn, 0);
966 	if (context)
967 		ret = context_present(context);
968 	spin_unlock_irqrestore(&iommu->lock, flags);
969 	return ret;
970 }
971 
972 static void free_context_table(struct intel_iommu *iommu)
973 {
974 	int i;
975 	unsigned long flags;
976 	struct context_entry *context;
977 
978 	spin_lock_irqsave(&iommu->lock, flags);
979 	if (!iommu->root_entry) {
980 		goto out;
981 	}
982 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 		context = iommu_context_addr(iommu, i, 0, 0);
984 		if (context)
985 			free_pgtable_page(context);
986 
987 		if (!sm_supported(iommu))
988 			continue;
989 
990 		context = iommu_context_addr(iommu, i, 0x80, 0);
991 		if (context)
992 			free_pgtable_page(context);
993 
994 	}
995 	free_pgtable_page(iommu->root_entry);
996 	iommu->root_entry = NULL;
997 out:
998 	spin_unlock_irqrestore(&iommu->lock, flags);
999 }
1000 
1001 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002 				      unsigned long pfn, int *target_level)
1003 {
1004 	struct dma_pte *parent, *pte;
1005 	int level = agaw_to_level(domain->agaw);
1006 	int offset;
1007 
1008 	BUG_ON(!domain->pgd);
1009 
1010 	if (!domain_pfn_supported(domain, pfn))
1011 		/* Address beyond IOMMU's addressing capabilities. */
1012 		return NULL;
1013 
1014 	parent = domain->pgd;
1015 
1016 	while (1) {
1017 		void *tmp_page;
1018 
1019 		offset = pfn_level_offset(pfn, level);
1020 		pte = &parent[offset];
1021 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022 			break;
1023 		if (level == *target_level)
1024 			break;
1025 
1026 		if (!dma_pte_present(pte)) {
1027 			uint64_t pteval;
1028 
1029 			tmp_page = alloc_pgtable_page(domain->nid);
1030 
1031 			if (!tmp_page)
1032 				return NULL;
1033 
1034 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036 			if (domain_use_first_level(domain)) {
1037 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1038 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1039 					pteval |= DMA_FL_PTE_ACCESS;
1040 			}
1041 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1042 				/* Someone else set it while we were thinking; use theirs. */
1043 				free_pgtable_page(tmp_page);
1044 			else
1045 				domain_flush_cache(domain, pte, sizeof(*pte));
1046 		}
1047 		if (level == 1)
1048 			break;
1049 
1050 		parent = phys_to_virt(dma_pte_addr(pte));
1051 		level--;
1052 	}
1053 
1054 	if (!*target_level)
1055 		*target_level = level;
1056 
1057 	return pte;
1058 }
1059 
1060 /* return address's pte at specific level */
1061 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1062 					 unsigned long pfn,
1063 					 int level, int *large_page)
1064 {
1065 	struct dma_pte *parent, *pte;
1066 	int total = agaw_to_level(domain->agaw);
1067 	int offset;
1068 
1069 	parent = domain->pgd;
1070 	while (level <= total) {
1071 		offset = pfn_level_offset(pfn, total);
1072 		pte = &parent[offset];
1073 		if (level == total)
1074 			return pte;
1075 
1076 		if (!dma_pte_present(pte)) {
1077 			*large_page = total;
1078 			break;
1079 		}
1080 
1081 		if (dma_pte_superpage(pte)) {
1082 			*large_page = total;
1083 			return pte;
1084 		}
1085 
1086 		parent = phys_to_virt(dma_pte_addr(pte));
1087 		total--;
1088 	}
1089 	return NULL;
1090 }
1091 
1092 /* clear last level pte; a tlb flush should follow */
1093 static void dma_pte_clear_range(struct dmar_domain *domain,
1094 				unsigned long start_pfn,
1095 				unsigned long last_pfn)
1096 {
1097 	unsigned int large_page;
1098 	struct dma_pte *first_pte, *pte;
1099 
1100 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1101 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1102 	BUG_ON(start_pfn > last_pfn);
1103 
1104 	/* we don't need lock here; nobody else touches the iova range */
1105 	do {
1106 		large_page = 1;
1107 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1108 		if (!pte) {
1109 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1110 			continue;
1111 		}
1112 		do {
1113 			dma_clear_pte(pte);
1114 			start_pfn += lvl_to_nr_pages(large_page);
1115 			pte++;
1116 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1117 
1118 		domain_flush_cache(domain, first_pte,
1119 				   (void *)pte - (void *)first_pte);
1120 
1121 	} while (start_pfn && start_pfn <= last_pfn);
1122 }
1123 
1124 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1125 			       int retain_level, struct dma_pte *pte,
1126 			       unsigned long pfn, unsigned long start_pfn,
1127 			       unsigned long last_pfn)
1128 {
1129 	pfn = max(start_pfn, pfn);
1130 	pte = &pte[pfn_level_offset(pfn, level)];
1131 
1132 	do {
1133 		unsigned long level_pfn;
1134 		struct dma_pte *level_pte;
1135 
1136 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1137 			goto next;
1138 
1139 		level_pfn = pfn & level_mask(level);
1140 		level_pte = phys_to_virt(dma_pte_addr(pte));
1141 
1142 		if (level > 2) {
1143 			dma_pte_free_level(domain, level - 1, retain_level,
1144 					   level_pte, level_pfn, start_pfn,
1145 					   last_pfn);
1146 		}
1147 
1148 		/*
1149 		 * Free the page table if we're below the level we want to
1150 		 * retain and the range covers the entire table.
1151 		 */
1152 		if (level < retain_level && !(start_pfn > level_pfn ||
1153 		      last_pfn < level_pfn + level_size(level) - 1)) {
1154 			dma_clear_pte(pte);
1155 			domain_flush_cache(domain, pte, sizeof(*pte));
1156 			free_pgtable_page(level_pte);
1157 		}
1158 next:
1159 		pfn += level_size(level);
1160 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161 }
1162 
1163 /*
1164  * clear last level (leaf) ptes and free page table pages below the
1165  * level we wish to keep intact.
1166  */
1167 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168 				   unsigned long start_pfn,
1169 				   unsigned long last_pfn,
1170 				   int retain_level)
1171 {
1172 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 	BUG_ON(start_pfn > last_pfn);
1175 
1176 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1177 
1178 	/* We don't need lock here; nobody else touches the iova range */
1179 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1180 			   domain->pgd, 0, start_pfn, last_pfn);
1181 
1182 	/* free pgd */
1183 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184 		free_pgtable_page(domain->pgd);
1185 		domain->pgd = NULL;
1186 	}
1187 }
1188 
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196 					    int level, struct dma_pte *pte,
1197 					    struct page *freelist)
1198 {
1199 	struct page *pg;
1200 
1201 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202 	pg->freelist = freelist;
1203 	freelist = pg;
1204 
1205 	if (level == 1)
1206 		return freelist;
1207 
1208 	pte = page_address(pg);
1209 	do {
1210 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211 			freelist = dma_pte_list_pagetables(domain, level - 1,
1212 							   pte, freelist);
1213 		pte++;
1214 	} while (!first_pte_in_page(pte));
1215 
1216 	return freelist;
1217 }
1218 
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 					struct dma_pte *pte, unsigned long pfn,
1221 					unsigned long start_pfn,
1222 					unsigned long last_pfn,
1223 					struct page *freelist)
1224 {
1225 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226 
1227 	pfn = max(start_pfn, pfn);
1228 	pte = &pte[pfn_level_offset(pfn, level)];
1229 
1230 	do {
1231 		unsigned long level_pfn;
1232 
1233 		if (!dma_pte_present(pte))
1234 			goto next;
1235 
1236 		level_pfn = pfn & level_mask(level);
1237 
1238 		/* If range covers entire pagetable, free it */
1239 		if (start_pfn <= level_pfn &&
1240 		    last_pfn >= level_pfn + level_size(level) - 1) {
1241 			/* These subordinate page tables are going away entirely. Don't
1242 			   bother to clear them; we're just going to *free* them. */
1243 			if (level > 1 && !dma_pte_superpage(pte))
1244 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245 
1246 			dma_clear_pte(pte);
1247 			if (!first_pte)
1248 				first_pte = pte;
1249 			last_pte = pte;
1250 		} else if (level > 1) {
1251 			/* Recurse down into a level that isn't *entirely* obsolete */
1252 			freelist = dma_pte_clear_level(domain, level - 1,
1253 						       phys_to_virt(dma_pte_addr(pte)),
1254 						       level_pfn, start_pfn, last_pfn,
1255 						       freelist);
1256 		}
1257 next:
1258 		pfn += level_size(level);
1259 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260 
1261 	if (first_pte)
1262 		domain_flush_cache(domain, first_pte,
1263 				   (void *)++last_pte - (void *)first_pte);
1264 
1265 	return freelist;
1266 }
1267 
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272 				 unsigned long start_pfn,
1273 				 unsigned long last_pfn,
1274 				 struct page *freelist)
1275 {
1276 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278 	BUG_ON(start_pfn > last_pfn);
1279 
1280 	/* we don't need lock here; nobody else touches the iova range */
1281 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282 				       domain->pgd, 0, start_pfn, last_pfn,
1283 				       freelist);
1284 
1285 	/* free pgd */
1286 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287 		struct page *pgd_page = virt_to_page(domain->pgd);
1288 		pgd_page->freelist = freelist;
1289 		freelist = pgd_page;
1290 
1291 		domain->pgd = NULL;
1292 	}
1293 
1294 	return freelist;
1295 }
1296 
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299 	struct page *pg;
1300 
1301 	while ((pg = freelist)) {
1302 		freelist = pg->freelist;
1303 		free_pgtable_page(page_address(pg));
1304 	}
1305 }
1306 
1307 /* iommu handling */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310 	struct root_entry *root;
1311 	unsigned long flags;
1312 
1313 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314 	if (!root) {
1315 		pr_err("Allocating root entry for %s failed\n",
1316 			iommu->name);
1317 		return -ENOMEM;
1318 	}
1319 
1320 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1321 
1322 	spin_lock_irqsave(&iommu->lock, flags);
1323 	iommu->root_entry = root;
1324 	spin_unlock_irqrestore(&iommu->lock, flags);
1325 
1326 	return 0;
1327 }
1328 
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331 	u64 addr;
1332 	u32 sts;
1333 	unsigned long flag;
1334 
1335 	addr = virt_to_phys(iommu->root_entry);
1336 	if (sm_supported(iommu))
1337 		addr |= DMA_RTADDR_SMT;
1338 
1339 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341 
1342 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343 
1344 	/* Make sure hardware complete it */
1345 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 		      readl, (sts & DMA_GSTS_RTPS), sts);
1347 
1348 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 
1350 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1351 	if (sm_supported(iommu))
1352 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1353 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1354 }
1355 
1356 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1357 {
1358 	u32 val;
1359 	unsigned long flag;
1360 
1361 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1362 		return;
1363 
1364 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1365 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1366 
1367 	/* Make sure hardware complete it */
1368 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1369 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1370 
1371 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1372 }
1373 
1374 /* return value determines if we need a write buffer flush */
1375 static void __iommu_flush_context(struct intel_iommu *iommu,
1376 				  u16 did, u16 source_id, u8 function_mask,
1377 				  u64 type)
1378 {
1379 	u64 val = 0;
1380 	unsigned long flag;
1381 
1382 	switch (type) {
1383 	case DMA_CCMD_GLOBAL_INVL:
1384 		val = DMA_CCMD_GLOBAL_INVL;
1385 		break;
1386 	case DMA_CCMD_DOMAIN_INVL:
1387 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1388 		break;
1389 	case DMA_CCMD_DEVICE_INVL:
1390 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1391 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1392 		break;
1393 	default:
1394 		BUG();
1395 	}
1396 	val |= DMA_CCMD_ICC;
1397 
1398 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1399 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1400 
1401 	/* Make sure hardware complete it */
1402 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1403 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1404 
1405 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406 }
1407 
1408 /* return value determines if we need a write buffer flush */
1409 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1410 				u64 addr, unsigned int size_order, u64 type)
1411 {
1412 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1413 	u64 val = 0, val_iva = 0;
1414 	unsigned long flag;
1415 
1416 	switch (type) {
1417 	case DMA_TLB_GLOBAL_FLUSH:
1418 		/* global flush doesn't need to set IVA_REG */
1419 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1420 		break;
1421 	case DMA_TLB_DSI_FLUSH:
1422 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1423 		break;
1424 	case DMA_TLB_PSI_FLUSH:
1425 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1426 		/* IH bit is passed in as part of address */
1427 		val_iva = size_order | addr;
1428 		break;
1429 	default:
1430 		BUG();
1431 	}
1432 	/* Note: set drain read/write */
1433 #if 0
1434 	/*
1435 	 * This is probably only here to be extra safe. Looks like we can
1436 	 * ignore it without any impact.
1437 	 */
1438 	if (cap_read_drain(iommu->cap))
1439 		val |= DMA_TLB_READ_DRAIN;
1440 #endif
1441 	if (cap_write_drain(iommu->cap))
1442 		val |= DMA_TLB_WRITE_DRAIN;
1443 
1444 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1445 	/* Note: Only uses first TLB reg currently */
1446 	if (val_iva)
1447 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1448 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1449 
1450 	/* Make sure hardware complete it */
1451 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1452 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1453 
1454 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1455 
1456 	/* check IOTLB invalidation granularity */
1457 	if (DMA_TLB_IAIG(val) == 0)
1458 		pr_err("Flush IOTLB failed\n");
1459 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1460 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1461 			(unsigned long long)DMA_TLB_IIRG(type),
1462 			(unsigned long long)DMA_TLB_IAIG(val));
1463 }
1464 
1465 static struct device_domain_info *
1466 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1467 			 u8 bus, u8 devfn)
1468 {
1469 	struct device_domain_info *info;
1470 
1471 	assert_spin_locked(&device_domain_lock);
1472 
1473 	if (!iommu->qi)
1474 		return NULL;
1475 
1476 	list_for_each_entry(info, &domain->devices, link)
1477 		if (info->iommu == iommu && info->bus == bus &&
1478 		    info->devfn == devfn) {
1479 			if (info->ats_supported && info->dev)
1480 				return info;
1481 			break;
1482 		}
1483 
1484 	return NULL;
1485 }
1486 
1487 static void domain_update_iotlb(struct dmar_domain *domain)
1488 {
1489 	struct device_domain_info *info;
1490 	bool has_iotlb_device = false;
1491 
1492 	assert_spin_locked(&device_domain_lock);
1493 
1494 	list_for_each_entry(info, &domain->devices, link)
1495 		if (info->ats_enabled) {
1496 			has_iotlb_device = true;
1497 			break;
1498 		}
1499 
1500 	if (!has_iotlb_device) {
1501 		struct subdev_domain_info *sinfo;
1502 
1503 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1504 			info = get_domain_info(sinfo->pdev);
1505 			if (info && info->ats_enabled) {
1506 				has_iotlb_device = true;
1507 				break;
1508 			}
1509 		}
1510 	}
1511 
1512 	domain->has_iotlb_device = has_iotlb_device;
1513 }
1514 
1515 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1516 {
1517 	struct pci_dev *pdev;
1518 
1519 	assert_spin_locked(&device_domain_lock);
1520 
1521 	if (!info || !dev_is_pci(info->dev))
1522 		return;
1523 
1524 	pdev = to_pci_dev(info->dev);
1525 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1526 	 * a PFSID to the invalidation descriptor of a VF so that the IOMMU HW
1527 	 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1528 	 * treated as reserved and should be set to 0.
1529 	 */
1530 	if (!ecap_dit(info->iommu->ecap))
1531 		info->pfsid = 0;
1532 	else {
1533 		struct pci_dev *pf_pdev;
1534 
1535 		/* pdev will be returned if device is not a vf */
1536 		pf_pdev = pci_physfn(pdev);
1537 		info->pfsid = pci_dev_id(pf_pdev);
1538 	}
1539 
1540 #ifdef CONFIG_INTEL_IOMMU_SVM
1541 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1542 	   the device if you enable PASID support after ATS support is
1543 	   undefined. So always enable PASID support on devices which
1544 	   have it, even if we can't yet know if we're ever going to
1545 	   use it. */
1546 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1547 		info->pasid_enabled = 1;
1548 
1549 	if (info->pri_supported &&
1550 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1551 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1552 		info->pri_enabled = 1;
1553 #endif
1554 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1555 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1556 		info->ats_enabled = 1;
1557 		domain_update_iotlb(info->domain);
1558 		info->ats_qdep = pci_ats_queue_depth(pdev);
1559 	}
1560 }
1561 
1562 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1563 {
1564 	struct pci_dev *pdev;
1565 
1566 	assert_spin_locked(&device_domain_lock);
1567 
1568 	if (!dev_is_pci(info->dev))
1569 		return;
1570 
1571 	pdev = to_pci_dev(info->dev);
1572 
1573 	if (info->ats_enabled) {
1574 		pci_disable_ats(pdev);
1575 		info->ats_enabled = 0;
1576 		domain_update_iotlb(info->domain);
1577 	}
1578 #ifdef CONFIG_INTEL_IOMMU_SVM
1579 	if (info->pri_enabled) {
1580 		pci_disable_pri(pdev);
1581 		info->pri_enabled = 0;
1582 	}
1583 	if (info->pasid_enabled) {
1584 		pci_disable_pasid(pdev);
1585 		info->pasid_enabled = 0;
1586 	}
1587 #endif
1588 }
1589 
1590 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1591 				    u64 addr, unsigned int mask)
1592 {
1593 	u16 sid, qdep;
1594 
1595 	if (!info || !info->ats_enabled)
1596 		return;
1597 
1598 	sid = info->bus << 8 | info->devfn;
1599 	qdep = info->ats_qdep;
1600 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1601 			   qdep, addr, mask);
1602 }
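/*
 * The source-id above is the device's BDF packed as bus << 8 | devfn, so a
 * device at 3a:02.0 yields sid 0x3a10; qdep is the ATS invalidate queue
 * depth previously read via pci_ats_queue_depth().
 */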
1603 
1604 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1605 				  u64 addr, unsigned mask)
1606 {
1607 	unsigned long flags;
1608 	struct device_domain_info *info;
1609 	struct subdev_domain_info *sinfo;
1610 
1611 	if (!domain->has_iotlb_device)
1612 		return;
1613 
1614 	spin_lock_irqsave(&device_domain_lock, flags);
1615 	list_for_each_entry(info, &domain->devices, link)
1616 		__iommu_flush_dev_iotlb(info, addr, mask);
1617 
1618 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1619 		info = get_domain_info(sinfo->pdev);
1620 		__iommu_flush_dev_iotlb(info, addr, mask);
1621 	}
1622 	spin_unlock_irqrestore(&device_domain_lock, flags);
1623 }
1624 
1625 static void domain_flush_piotlb(struct intel_iommu *iommu,
1626 				struct dmar_domain *domain,
1627 				u64 addr, unsigned long npages, bool ih)
1628 {
1629 	u16 did = domain->iommu_did[iommu->seq_id];
1630 
1631 	if (domain->default_pasid)
1632 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1633 				addr, npages, ih);
1634 
1635 	if (!list_empty(&domain->devices))
1636 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1637 }
1638 
1639 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1640 				  struct dmar_domain *domain,
1641 				  unsigned long pfn, unsigned int pages,
1642 				  int ih, int map)
1643 {
1644 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1645 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1646 	u16 did = domain->iommu_did[iommu->seq_id];
1647 
1648 	BUG_ON(pages == 0);
1649 
1650 	if (ih)
1651 		ih = 1 << 6;
1652 
1653 	if (domain_use_first_level(domain)) {
1654 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1655 	} else {
1656 		/*
1657 		 * Fallback to domain selective flush if no PSI support or
1658 		 * the size is too big. PSI requires page size to be 2 ^ x,
1659 		 * and the base address is naturally aligned to the size.
1660 		 */
1661 		if (!cap_pgsel_inv(iommu->cap) ||
1662 		    mask > cap_max_amask_val(iommu->cap))
1663 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 							DMA_TLB_DSI_FLUSH);
1665 		else
1666 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1667 							DMA_TLB_PSI_FLUSH);
1668 	}
1669 
1670 	/*
1671 	 * In caching mode, changes of pages from non-present to present require
1672 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1673 	 */
1674 	if (!cap_caching_mode(iommu->cap) || !map)
1675 		iommu_flush_dev_iotlb(domain, addr, mask);
1676 }
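/*
 * Example of the PSI size encoding above: flushing 9 pages rounds up to
 * 16, giving mask == 4, so the hardware invalidates a naturally aligned
 * 16-page region. Over-invalidating like this is harmless; the fallback to
 * a domain-selective flush covers the cases PSI cannot express.
 */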
1677 
1678 /* Notification for newly created mappings */
1679 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1680 					struct dmar_domain *domain,
1681 					unsigned long pfn, unsigned int pages)
1682 {
1683 	/*
1684 	 * It's a non-present to present mapping. Only flush if caching mode
1685 	 * and second level.
1686 	 */
1687 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1688 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1689 	else
1690 		iommu_flush_write_buffer(iommu);
1691 }
1692 
1693 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1694 {
1695 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1696 	int idx;
1697 
1698 	for_each_domain_iommu(idx, dmar_domain) {
1699 		struct intel_iommu *iommu = g_iommus[idx];
1700 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1701 
1702 		if (domain_use_first_level(dmar_domain))
1703 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1704 		else
1705 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1706 						 DMA_TLB_DSI_FLUSH);
1707 
1708 		if (!cap_caching_mode(iommu->cap))
1709 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1710 					      0, MAX_AGAW_PFN_WIDTH);
1711 	}
1712 }
1713 
1714 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1715 {
1716 	u32 pmen;
1717 	unsigned long flags;
1718 
1719 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1720 		return;
1721 
1722 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1723 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1724 	pmen &= ~DMA_PMEN_EPM;
1725 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1726 
1727 	/* wait for the protected region status bit to clear */
1728 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1729 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1730 
1731 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1732 }
1733 
1734 static void iommu_enable_translation(struct intel_iommu *iommu)
1735 {
1736 	u32 sts;
1737 	unsigned long flags;
1738 
1739 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1740 	iommu->gcmd |= DMA_GCMD_TE;
1741 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1742 
1743 	/* Make sure hardware complete it */
1744 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1745 		      readl, (sts & DMA_GSTS_TES), sts);
1746 
1747 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1748 }
1749 
1750 static void iommu_disable_translation(struct intel_iommu *iommu)
1751 {
1752 	u32 sts;
1753 	unsigned long flag;
1754 
1755 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1756 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1757 		return;
1758 
1759 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1760 	iommu->gcmd &= ~DMA_GCMD_TE;
1761 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1762 
1763 	/* Make sure hardware complete it */
1764 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1765 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1766 
1767 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1768 }
1769 
1770 static int iommu_init_domains(struct intel_iommu *iommu)
1771 {
1772 	u32 ndomains, nlongs;
1773 	size_t size;
1774 
1775 	ndomains = cap_ndoms(iommu->cap);
1776 	pr_debug("%s: Number of Domains supported <%d>\n",
1777 		 iommu->name, ndomains);
1778 	nlongs = BITS_TO_LONGS(ndomains);
1779 
1780 	spin_lock_init(&iommu->lock);
1781 
1782 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1783 	if (!iommu->domain_ids) {
1784 		pr_err("%s: Allocating domain id array failed\n",
1785 		       iommu->name);
1786 		return -ENOMEM;
1787 	}
1788 
1789 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1790 	iommu->domains = kzalloc(size, GFP_KERNEL);
1791 
1792 	if (iommu->domains) {
1793 		size = 256 * sizeof(struct dmar_domain *);
1794 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1795 	}
1796 
1797 	if (!iommu->domains || !iommu->domains[0]) {
1798 		pr_err("%s: Allocating domain array failed\n",
1799 		       iommu->name);
1800 		kfree(iommu->domain_ids);
1801 		kfree(iommu->domains);
1802 		iommu->domain_ids = NULL;
1803 		iommu->domains    = NULL;
1804 		return -ENOMEM;
1805 	}
1806 
1807 	/*
1808 	 * If Caching mode is set, then invalid translations are tagged
1809 	 * with domain-id 0, hence we need to pre-allocate it. We also
1810 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1811 	 * make sure it is not used for a real domain.
1812 	 */
1813 	set_bit(0, iommu->domain_ids);
1814 
1815 	/*
1816 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1817 	 * entry for first-level or pass-through translation modes should
1818 	 * be programmed with a domain id different from those used for
1819 	 * second-level or nested translation. We reserve a domain id for
1820 	 * this purpose.
1821 	 */
1822 	if (sm_supported(iommu))
1823 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1824 
1825 	return 0;
1826 }
1827 
1828 static void disable_dmar_iommu(struct intel_iommu *iommu)
1829 {
1830 	struct device_domain_info *info, *tmp;
1831 	unsigned long flags;
1832 
1833 	if (!iommu->domains || !iommu->domain_ids)
1834 		return;
1835 
1836 	spin_lock_irqsave(&device_domain_lock, flags);
1837 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1838 		if (info->iommu != iommu)
1839 			continue;
1840 
1841 		if (!info->dev || !info->domain)
1842 			continue;
1843 
1844 		__dmar_remove_one_dev_info(info);
1845 	}
1846 	spin_unlock_irqrestore(&device_domain_lock, flags);
1847 
1848 	if (iommu->gcmd & DMA_GCMD_TE)
1849 		iommu_disable_translation(iommu);
1850 }
1851 
1852 static void free_dmar_iommu(struct intel_iommu *iommu)
1853 {
1854 	if (iommu->domains && iommu->domain_ids) {
1855 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1856 		int i;
1857 
1858 		for (i = 0; i < elems; i++)
1859 			kfree(iommu->domains[i]);
1860 		kfree(iommu->domains);
1861 		kfree(iommu->domain_ids);
1862 		iommu->domains = NULL;
1863 		iommu->domain_ids = NULL;
1864 	}
1865 
1866 	g_iommus[iommu->seq_id] = NULL;
1867 
1868 	/* free context mapping */
1869 	free_context_table(iommu);
1870 
1871 #ifdef CONFIG_INTEL_IOMMU_SVM
1872 	if (pasid_supported(iommu)) {
1873 		if (ecap_prs(iommu->ecap))
1874 			intel_svm_finish_prq(iommu);
1875 	}
1876 	if (vccap_pasid(iommu->vccap))
1877 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1878 
1879 #endif
1880 }
1881 
1882 /*
1883  * Check and return whether first level is used by default for
1884  * DMA translation.
1885  */
1886 static bool first_level_by_default(void)
1887 {
1888 	return scalable_mode_support() && intel_cap_flts_sanity();
1889 }
1890 
1891 static struct dmar_domain *alloc_domain(int flags)
1892 {
1893 	struct dmar_domain *domain;
1894 
1895 	domain = alloc_domain_mem();
1896 	if (!domain)
1897 		return NULL;
1898 
1899 	memset(domain, 0, sizeof(*domain));
1900 	domain->nid = NUMA_NO_NODE;
1901 	domain->flags = flags;
1902 	if (first_level_by_default())
1903 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1904 	domain->has_iotlb_device = false;
1905 	INIT_LIST_HEAD(&domain->devices);
1906 	INIT_LIST_HEAD(&domain->subdevices);
1907 
1908 	return domain;
1909 }
1910 
1911 /* Must be called with device_domain_lock and iommu->lock held */
1912 static int domain_attach_iommu(struct dmar_domain *domain,
1913 			       struct intel_iommu *iommu)
1914 {
1915 	unsigned long ndomains;
1916 	int num;
1917 
1918 	assert_spin_locked(&device_domain_lock);
1919 	assert_spin_locked(&iommu->lock);
1920 
1921 	domain->iommu_refcnt[iommu->seq_id] += 1;
1922 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1923 		ndomains = cap_ndoms(iommu->cap);
1924 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1925 
1926 		if (num >= ndomains) {
1927 			pr_err("%s: No free domain ids\n", iommu->name);
1928 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1929 			return -ENOSPC;
1930 		}
1931 
1932 		set_bit(num, iommu->domain_ids);
1933 		set_iommu_domain(iommu, num, domain);
1934 
1935 		domain->iommu_did[iommu->seq_id] = num;
1936 		domain->nid			 = iommu->node;
1937 
1938 		domain_update_iommu_cap(domain);
1939 	}
1940 
1941 	return 0;
1942 }
1943 
1944 static void domain_detach_iommu(struct dmar_domain *domain,
1945 				struct intel_iommu *iommu)
1946 {
1947 	int num;
1948 
1949 	assert_spin_locked(&device_domain_lock);
1950 	assert_spin_locked(&iommu->lock);
1951 
1952 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1953 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1954 		num = domain->iommu_did[iommu->seq_id];
1955 		clear_bit(num, iommu->domain_ids);
1956 		set_iommu_domain(iommu, num, NULL);
1957 
1958 		domain_update_iommu_cap(domain);
1959 		domain->iommu_did[iommu->seq_id] = 0;
1960 	}
1961 }
1962 
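/*
 * Round a guest address width up to the next adjusted guest address
 * width, i.e. 12 bits of page offset plus a whole number of 9-bit
 * page-table levels (e.g. a gaw of 40 is rounded up to 48), capped
 * at 64 bits.
 */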
1963 static inline int guestwidth_to_adjustwidth(int gaw)
1964 {
1965 	int agaw;
1966 	int r = (gaw - 12) % 9;
1967 
1968 	if (r == 0)
1969 		agaw = gaw;
1970 	else
1971 		agaw = gaw + 9 - r;
1972 	if (agaw > 64)
1973 		agaw = 64;
1974 	return agaw;
1975 }
1976 
1977 static void domain_exit(struct dmar_domain *domain)
1978 {
1979 
1980 	/* Remove associated devices and clear attached or cached domains */
1981 	domain_remove_dev_info(domain);
1982 
1983 	/* destroy iovas */
1984 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1985 		iommu_put_dma_cookie(&domain->domain);
1986 
1987 	if (domain->pgd) {
1988 		struct page *freelist;
1989 
1990 		freelist = domain_unmap(domain, 0,
1991 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1992 		dma_free_pagelist(freelist);
1993 	}
1994 
1995 	free_domain_mem(domain);
1996 }
1997 
1998 /*
1999  * Get the PASID directory size for a scalable mode context entry.
2000  * A value of X in the PDTS field of a scalable mode context entry
2001  * indicates a PASID directory with 2^(X + 7) entries.
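 * For example, a max_pde of 0x400 has its lowest set bit at position 10,
 * so pds is 10 and the function returns 3, i.e. a PASID directory of
 * 2^(3 + 7) = 1024 entries.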
2002  */
2003 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2004 {
2005 	int pds, max_pde;
2006 
2007 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2008 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2009 	if (pds < 7)
2010 		return 0;
2011 
2012 	return pds - 7;
2013 }
2014 
2015 /*
2016  * Set the RID_PASID field of a scalable mode context entry. The
2017  * IOMMU hardware will use the PASID value set in this field for
2018  * DMA translations of DMA requests without PASID.
2019  */
2020 static inline void
2021 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2022 {
2023 	context->hi |= pasid & ((1 << 20) - 1);
2024 }
2025 
2026 /*
2027  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2028  * entry.
2029  */
2030 static inline void context_set_sm_dte(struct context_entry *context)
2031 {
2032 	context->lo |= (1 << 2);
2033 }
2034 
2035 /*
2036  * Set the PRE(Page Request Enable) field of a scalable mode context
2037  * entry.
2038  */
2039 static inline void context_set_sm_pre(struct context_entry *context)
2040 {
2041 	context->lo |= (1 << 4);
2042 }
2043 
2044 /* Convert value to context PASID directory size field coding. */
2045 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2046 
2047 static int domain_context_mapping_one(struct dmar_domain *domain,
2048 				      struct intel_iommu *iommu,
2049 				      struct pasid_table *table,
2050 				      u8 bus, u8 devfn)
2051 {
2052 	u16 did = domain->iommu_did[iommu->seq_id];
2053 	int translation = CONTEXT_TT_MULTI_LEVEL;
2054 	struct device_domain_info *info = NULL;
2055 	struct context_entry *context;
2056 	unsigned long flags;
2057 	int ret;
2058 
2059 	WARN_ON(did == 0);
2060 
2061 	if (hw_pass_through && domain_type_is_si(domain))
2062 		translation = CONTEXT_TT_PASS_THROUGH;
2063 
2064 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2065 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2066 
2067 	BUG_ON(!domain->pgd);
2068 
2069 	spin_lock_irqsave(&device_domain_lock, flags);
2070 	spin_lock(&iommu->lock);
2071 
2072 	ret = -ENOMEM;
2073 	context = iommu_context_addr(iommu, bus, devfn, 1);
2074 	if (!context)
2075 		goto out_unlock;
2076 
2077 	ret = 0;
2078 	if (context_present(context))
2079 		goto out_unlock;
2080 
2081 	/*
2082 	 * For kdump cases, old valid entries may be cached due to the
2083 	 * in-flight DMA and copied pgtable, but there is no unmapping
2084 	 * behaviour for them, thus we need an explicit cache flush for
2085 	 * the newly-mapped device. For kdump, at this point, the device
2086 	 * is supposed to have finished reset at its driver probe stage, so
2087 	 * no in-flight DMA will exist and we don't need to worry about it
2088 	 * hereafter.
2089 	 */
2090 	if (context_copied(context)) {
2091 		u16 did_old = context_domain_id(context);
2092 
2093 		if (did_old < cap_ndoms(iommu->cap)) {
2094 			iommu->flush.flush_context(iommu, did_old,
2095 						   (((u16)bus) << 8) | devfn,
2096 						   DMA_CCMD_MASK_NOBIT,
2097 						   DMA_CCMD_DEVICE_INVL);
2098 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2099 						 DMA_TLB_DSI_FLUSH);
2100 		}
2101 	}
2102 
2103 	context_clear_entry(context);
2104 
2105 	if (sm_supported(iommu)) {
2106 		unsigned long pds;
2107 
2108 		WARN_ON(!table);
2109 
2110 		/* Setup the PASID DIR pointer: */
2111 		pds = context_get_sm_pds(table);
2112 		context->lo = (u64)virt_to_phys(table->table) |
2113 				context_pdts(pds);
2114 
2115 		/* Setup the RID_PASID field: */
2116 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2117 
2118 		/*
2119 		 * Setup the Device-TLB enable bit and Page request
2120 		 * Enable bit:
2121 		 */
2122 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2123 		if (info && info->ats_supported)
2124 			context_set_sm_dte(context);
2125 		if (info && info->pri_supported)
2126 			context_set_sm_pre(context);
2127 	} else {
2128 		struct dma_pte *pgd = domain->pgd;
2129 		int agaw;
2130 
2131 		context_set_domain_id(context, did);
2132 
2133 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2134 			/*
2135 			 * Skip top levels of page tables for an IOMMU with a
2136 			 * smaller agaw than the domain's. Unnecessary for PT mode.
2137 			 */
2138 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2139 				ret = -ENOMEM;
2140 				pgd = phys_to_virt(dma_pte_addr(pgd));
2141 				if (!dma_pte_present(pgd))
2142 					goto out_unlock;
2143 			}
2144 
2145 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 			if (info && info->ats_supported)
2147 				translation = CONTEXT_TT_DEV_IOTLB;
2148 			else
2149 				translation = CONTEXT_TT_MULTI_LEVEL;
2150 
2151 			context_set_address_root(context, virt_to_phys(pgd));
2152 			context_set_address_width(context, agaw);
2153 		} else {
2154 			/*
2155 			 * In pass-through mode, AW must be programmed to
2156 			 * indicate the largest AGAW value supported by the
2157 			 * hardware; the ASR field is ignored by hardware.
2158 			 */
2159 			context_set_address_width(context, iommu->msagaw);
2160 		}
2161 
2162 		context_set_translation_type(context, translation);
2163 	}
2164 
2165 	context_set_fault_enable(context);
2166 	context_set_present(context);
2167 	if (!ecap_coherent(iommu->ecap))
2168 		clflush_cache_range(context, sizeof(*context));
2169 
2170 	/*
2171 	 * It's a non-present to present mapping. If hardware doesn't cache
2172 	 * non-present entries we only need to flush the write-buffer. If it
2173 	 * _does_ cache non-present entries, then it does so in the special
2174 	 * domain #0, which we have to flush:
2175 	 */
2176 	if (cap_caching_mode(iommu->cap)) {
2177 		iommu->flush.flush_context(iommu, 0,
2178 					   (((u16)bus) << 8) | devfn,
2179 					   DMA_CCMD_MASK_NOBIT,
2180 					   DMA_CCMD_DEVICE_INVL);
2181 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2182 	} else {
2183 		iommu_flush_write_buffer(iommu);
2184 	}
2185 	iommu_enable_dev_iotlb(info);
2186 
2187 	ret = 0;
2188 
2189 out_unlock:
2190 	spin_unlock(&iommu->lock);
2191 	spin_unlock_irqrestore(&device_domain_lock, flags);
2192 
2193 	return ret;
2194 }
2195 
2196 struct domain_context_mapping_data {
2197 	struct dmar_domain *domain;
2198 	struct intel_iommu *iommu;
2199 	struct pasid_table *table;
2200 };
2201 
2202 static int domain_context_mapping_cb(struct pci_dev *pdev,
2203 				     u16 alias, void *opaque)
2204 {
2205 	struct domain_context_mapping_data *data = opaque;
2206 
2207 	return domain_context_mapping_one(data->domain, data->iommu,
2208 					  data->table, PCI_BUS_NUM(alias),
2209 					  alias & 0xff);
2210 }
2211 
2212 static int
2213 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2214 {
2215 	struct domain_context_mapping_data data;
2216 	struct pasid_table *table;
2217 	struct intel_iommu *iommu;
2218 	u8 bus, devfn;
2219 
2220 	iommu = device_to_iommu(dev, &bus, &devfn);
2221 	if (!iommu)
2222 		return -ENODEV;
2223 
2224 	table = intel_pasid_get_table(dev);
2225 
2226 	if (!dev_is_pci(dev))
2227 		return domain_context_mapping_one(domain, iommu, table,
2228 						  bus, devfn);
2229 
2230 	data.domain = domain;
2231 	data.iommu = iommu;
2232 	data.table = table;
2233 
2234 	return pci_for_each_dma_alias(to_pci_dev(dev),
2235 				      &domain_context_mapping_cb, &data);
2236 }
2237 
2238 static int domain_context_mapped_cb(struct pci_dev *pdev,
2239 				    u16 alias, void *opaque)
2240 {
2241 	struct intel_iommu *iommu = opaque;
2242 
2243 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2244 }
2245 
2246 static int domain_context_mapped(struct device *dev)
2247 {
2248 	struct intel_iommu *iommu;
2249 	u8 bus, devfn;
2250 
2251 	iommu = device_to_iommu(dev, &bus, &devfn);
2252 	if (!iommu)
2253 		return -ENODEV;
2254 
2255 	if (!dev_is_pci(dev))
2256 		return device_context_mapped(iommu, bus, devfn);
2257 
2258 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2259 				       domain_context_mapped_cb, iommu);
2260 }
2261 
2262 /* Returns a number of VTD pages, but aligned to MM page size */
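/* e.g. host_addr 0x1234 with size 0x2000 covers three 4KiB VTD pages */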
2263 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2264 					    size_t size)
2265 {
2266 	host_addr &= ~PAGE_MASK;
2267 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2268 }
2269 
2270 /* Return largest possible superpage level for a given mapping */
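/*
 * e.g. a 2MiB-aligned iov_pfn/phy_pfn pair covering at least 512 pages
 * yields level 2 when the domain supports one superpage level.
 */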
2271 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2272 					  unsigned long iov_pfn,
2273 					  unsigned long phy_pfn,
2274 					  unsigned long pages)
2275 {
2276 	int support, level = 1;
2277 	unsigned long pfnmerge;
2278 
2279 	support = domain->iommu_superpage;
2280 
2281 	/* To use a large page, the virtual *and* physical addresses
2282 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2283 	   of them will mean we have to use smaller pages. So just
2284 	   merge them and check both at once. */
2285 	pfnmerge = iov_pfn | phy_pfn;
2286 
2287 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2288 		pages >>= VTD_STRIDE_SHIFT;
2289 		if (!pages)
2290 			break;
2291 		pfnmerge >>= VTD_STRIDE_SHIFT;
2292 		level++;
2293 		support--;
2294 	}
2295 	return level;
2296 }
2297 
2298 /*
2299  * Ensure that old small page tables are removed to make room for superpage(s).
2300  * We're going to add new large pages, so make sure we don't remove their parent
2301  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2302  */
2303 static void switch_to_super_page(struct dmar_domain *domain,
2304 				 unsigned long start_pfn,
2305 				 unsigned long end_pfn, int level)
2306 {
2307 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2308 	struct dma_pte *pte = NULL;
2309 	int i;
2310 
2311 	while (start_pfn <= end_pfn) {
2312 		if (!pte)
2313 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2314 
2315 		if (dma_pte_present(pte)) {
2316 			dma_pte_free_pagetable(domain, start_pfn,
2317 					       start_pfn + lvl_pages - 1,
2318 					       level + 1);
2319 
2320 			for_each_domain_iommu(i, domain)
2321 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2322 						      start_pfn, lvl_pages,
2323 						      0, 0);
2324 		}
2325 
2326 		pte++;
2327 		start_pfn += lvl_pages;
2328 		if (first_pte_in_page(pte))
2329 			pte = NULL;
2330 	}
2331 }
2332 
2333 static int
2334 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2335 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2336 {
2337 	unsigned int largepage_lvl = 0;
2338 	unsigned long lvl_pages = 0;
2339 	struct dma_pte *pte = NULL;
2340 	phys_addr_t pteval;
2341 	u64 attr;
2342 
2343 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2344 
2345 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2346 		return -EINVAL;
2347 
2348 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2349 	attr |= DMA_FL_PTE_PRESENT;
2350 	if (domain_use_first_level(domain)) {
2351 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2352 
2353 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2354 			attr |= DMA_FL_PTE_ACCESS;
2355 			if (prot & DMA_PTE_WRITE)
2356 				attr |= DMA_FL_PTE_DIRTY;
2357 		}
2358 	}
2359 
2360 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2361 
2362 	while (nr_pages > 0) {
2363 		uint64_t tmp;
2364 
2365 		if (!pte) {
2366 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2367 					phys_pfn, nr_pages);
2368 
2369 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2370 			if (!pte)
2371 				return -ENOMEM;
2372 			/* It is a large page */
2373 			if (largepage_lvl > 1) {
2374 				unsigned long end_pfn;
2375 
2376 				pteval |= DMA_PTE_LARGE_PAGE;
2377 				end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2378 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2379 			} else {
2380 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2381 			}
2382 
2383 		}
2384 		/* We don't need a lock here; nobody else
2385 		 * touches the iova range.
2386 		 */
2387 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2388 		if (tmp) {
2389 			static int dumps = 5;
2390 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2391 				iov_pfn, tmp, (unsigned long long)pteval);
2392 			if (dumps) {
2393 				dumps--;
2394 				debug_dma_dump_mappings(NULL);
2395 			}
2396 			WARN_ON(1);
2397 		}
2398 
2399 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2400 
2401 		BUG_ON(nr_pages < lvl_pages);
2402 
2403 		nr_pages -= lvl_pages;
2404 		iov_pfn += lvl_pages;
2405 		phys_pfn += lvl_pages;
2406 		pteval += lvl_pages * VTD_PAGE_SIZE;
2407 
2408 		/* If the next PTE would be the first in a new page, then we
2409 		 * need to flush the cache on the entries we've just written.
2410 		 * And then we'll need to recalculate 'pte', so clear it and
2411 		 * let it get set again in the if (!pte) block above.
2412 		 *
2413 		 * If we're done (!nr_pages) we need to flush the cache too.
2414 		 *
2415 		 * Also if we've been setting superpages, we may need to
2416 		 * recalculate 'pte' and switch back to smaller pages for the
2417 		 * end of the mapping, if the trailing size is not enough to
2418 		 * use another superpage (i.e. nr_pages < lvl_pages).
2419 		 *
2420 		 * We leave clflush for the leaf pte changes to iotlb_sync_map()
2421 		 * callback.
2422 		 */
2423 		pte++;
2424 		if (!nr_pages || first_pte_in_page(pte) ||
2425 		    (largepage_lvl > 1 && nr_pages < lvl_pages))
2426 			pte = NULL;
2427 	}
2428 
2429 	return 0;
2430 }
2431 
2432 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2433 {
2434 	unsigned long flags;
2435 	struct context_entry *context;
2436 	u16 did_old;
2437 
2438 	if (!iommu)
2439 		return;
2440 
2441 	spin_lock_irqsave(&iommu->lock, flags);
2442 	context = iommu_context_addr(iommu, bus, devfn, 0);
2443 	if (!context) {
2444 		spin_unlock_irqrestore(&iommu->lock, flags);
2445 		return;
2446 	}
2447 	did_old = context_domain_id(context);
2448 	context_clear_entry(context);
2449 	__iommu_flush_cache(iommu, context, sizeof(*context));
2450 	spin_unlock_irqrestore(&iommu->lock, flags);
2451 	iommu->flush.flush_context(iommu,
2452 				   did_old,
2453 				   (((u16)bus) << 8) | devfn,
2454 				   DMA_CCMD_MASK_NOBIT,
2455 				   DMA_CCMD_DEVICE_INVL);
2456 
2457 	if (sm_supported(iommu))
2458 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2459 
2460 	iommu->flush.flush_iotlb(iommu,
2461 				 did_old,
2462 				 0,
2463 				 0,
2464 				 DMA_TLB_DSI_FLUSH);
2465 }
2466 
2467 static inline void unlink_domain_info(struct device_domain_info *info)
2468 {
2469 	assert_spin_locked(&device_domain_lock);
2470 	list_del(&info->link);
2471 	list_del(&info->global);
2472 	if (info->dev)
2473 		dev_iommu_priv_set(info->dev, NULL);
2474 }
2475 
2476 static void domain_remove_dev_info(struct dmar_domain *domain)
2477 {
2478 	struct device_domain_info *info, *tmp;
2479 	unsigned long flags;
2480 
2481 	spin_lock_irqsave(&device_domain_lock, flags);
2482 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2483 		__dmar_remove_one_dev_info(info);
2484 	spin_unlock_irqrestore(&device_domain_lock, flags);
2485 }
2486 
2487 struct dmar_domain *find_domain(struct device *dev)
2488 {
2489 	struct device_domain_info *info;
2490 
2491 	if (unlikely(!dev || !dev->iommu))
2492 		return NULL;
2493 
2494 	if (unlikely(attach_deferred(dev)))
2495 		return NULL;
2496 
2497 	/* No lock here, assumes no domain exit in normal case */
2498 	info = get_domain_info(dev);
2499 	if (likely(info))
2500 		return info->domain;
2501 
2502 	return NULL;
2503 }
2504 
2505 static inline struct device_domain_info *
2506 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2507 {
2508 	struct device_domain_info *info;
2509 
2510 	list_for_each_entry(info, &device_domain_list, global)
2511 		if (info->segment == segment && info->bus == bus &&
2512 		    info->devfn == devfn)
2513 			return info;
2514 
2515 	return NULL;
2516 }
2517 
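/*
 * Program a first-level (scalable mode) PASID table entry for @dev that
 * points at @domain's page table, skipping unused top levels when the
 * IOMMU supports a smaller AGAW than the domain.
 */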
2518 static int domain_setup_first_level(struct intel_iommu *iommu,
2519 				    struct dmar_domain *domain,
2520 				    struct device *dev,
2521 				    u32 pasid)
2522 {
2523 	struct dma_pte *pgd = domain->pgd;
2524 	int agaw, level;
2525 	int flags = 0;
2526 
2527 	/*
2528 	 * Skip top levels of page tables for an IOMMU with a
2529 	 * smaller agaw than the domain's. Unnecessary for PT mode.
2530 	 */
2531 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2532 		pgd = phys_to_virt(dma_pte_addr(pgd));
2533 		if (!dma_pte_present(pgd))
2534 			return -ENOMEM;
2535 	}
2536 
2537 	level = agaw_to_level(agaw);
2538 	if (level != 4 && level != 5)
2539 		return -EINVAL;
2540 
2541 	if (pasid != PASID_RID2PASID)
2542 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2543 	if (level == 5)
2544 		flags |= PASID_FLAG_FL5LP;
2545 
2546 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2547 		flags |= PASID_FLAG_PAGE_SNOOP;
2548 
2549 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2550 					     domain->iommu_did[iommu->seq_id],
2551 					     flags);
2552 }
2553 
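/*
 * A "real DMA subdevice" is a PCI device whose DMA is actually issued
 * by a different device, i.e. pci_real_dma_dev() returns something
 * other than the device itself.
 */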
2554 static bool dev_is_real_dma_subdevice(struct device *dev)
2555 {
2556 	return dev && dev_is_pci(dev) &&
2557 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2558 }
2559 
2560 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2561 						    int bus, int devfn,
2562 						    struct device *dev,
2563 						    struct dmar_domain *domain)
2564 {
2565 	struct dmar_domain *found = NULL;
2566 	struct device_domain_info *info;
2567 	unsigned long flags;
2568 	int ret;
2569 
2570 	info = alloc_devinfo_mem();
2571 	if (!info)
2572 		return NULL;
2573 
2574 	if (!dev_is_real_dma_subdevice(dev)) {
2575 		info->bus = bus;
2576 		info->devfn = devfn;
2577 		info->segment = iommu->segment;
2578 	} else {
2579 		struct pci_dev *pdev = to_pci_dev(dev);
2580 
2581 		info->bus = pdev->bus->number;
2582 		info->devfn = pdev->devfn;
2583 		info->segment = pci_domain_nr(pdev->bus);
2584 	}
2585 
2586 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2587 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2588 	info->ats_qdep = 0;
2589 	info->dev = dev;
2590 	info->domain = domain;
2591 	info->iommu = iommu;
2592 	info->pasid_table = NULL;
2593 	info->auxd_enabled = 0;
2594 	INIT_LIST_HEAD(&info->subdevices);
2595 
2596 	if (dev && dev_is_pci(dev)) {
2597 		struct pci_dev *pdev = to_pci_dev(info->dev);
2598 
2599 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2600 		    pci_ats_supported(pdev) &&
2601 		    dmar_find_matched_atsr_unit(pdev))
2602 			info->ats_supported = 1;
2603 
2604 		if (sm_supported(iommu)) {
2605 			if (pasid_supported(iommu)) {
2606 				int features = pci_pasid_features(pdev);
2607 				if (features >= 0)
2608 					info->pasid_supported = features | 1;
2609 			}
2610 
2611 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2612 			    pci_pri_supported(pdev))
2613 				info->pri_supported = 1;
2614 		}
2615 	}
2616 
2617 	spin_lock_irqsave(&device_domain_lock, flags);
2618 	if (dev)
2619 		found = find_domain(dev);
2620 
2621 	if (!found) {
2622 		struct device_domain_info *info2;
2623 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2624 						       info->devfn);
2625 		if (info2) {
2626 			found      = info2->domain;
2627 			info2->dev = dev;
2628 		}
2629 	}
2630 
2631 	if (found) {
2632 		spin_unlock_irqrestore(&device_domain_lock, flags);
2633 		free_devinfo_mem(info);
2634 		/* Caller must free the original domain */
2635 		return found;
2636 	}
2637 
2638 	spin_lock(&iommu->lock);
2639 	ret = domain_attach_iommu(domain, iommu);
2640 	spin_unlock(&iommu->lock);
2641 
2642 	if (ret) {
2643 		spin_unlock_irqrestore(&device_domain_lock, flags);
2644 		free_devinfo_mem(info);
2645 		return NULL;
2646 	}
2647 
2648 	list_add(&info->link, &domain->devices);
2649 	list_add(&info->global, &device_domain_list);
2650 	if (dev)
2651 		dev_iommu_priv_set(dev, info);
2652 	spin_unlock_irqrestore(&device_domain_lock, flags);
2653 
2654 	/* PASID table is mandatory for a PCI device in scalable mode. */
2655 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2656 		ret = intel_pasid_alloc_table(dev);
2657 		if (ret) {
2658 			dev_err(dev, "PASID table allocation failed\n");
2659 			dmar_remove_one_dev_info(dev);
2660 			return NULL;
2661 		}
2662 
2663 		/* Setup the PASID entry for requests without PASID: */
2664 		spin_lock_irqsave(&iommu->lock, flags);
2665 		if (hw_pass_through && domain_type_is_si(domain))
2666 			ret = intel_pasid_setup_pass_through(iommu, domain,
2667 					dev, PASID_RID2PASID);
2668 		else if (domain_use_first_level(domain))
2669 			ret = domain_setup_first_level(iommu, domain, dev,
2670 					PASID_RID2PASID);
2671 		else
2672 			ret = intel_pasid_setup_second_level(iommu, domain,
2673 					dev, PASID_RID2PASID);
2674 		spin_unlock_irqrestore(&iommu->lock, flags);
2675 		if (ret) {
2676 			dev_err(dev, "Setup RID2PASID failed\n");
2677 			dmar_remove_one_dev_info(dev);
2678 			return NULL;
2679 		}
2680 	}
2681 
2682 	if (dev && domain_context_mapping(domain, dev)) {
2683 		dev_err(dev, "Domain context map failed\n");
2684 		dmar_remove_one_dev_info(dev);
2685 		return NULL;
2686 	}
2687 
2688 	return domain;
2689 }
2690 
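/*
 * Identity-map the pfn range [first_vpfn, last_vpfn] into @domain so
 * that the IOVA equals the physical address, with read/write permission.
 */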
2691 static int iommu_domain_identity_map(struct dmar_domain *domain,
2692 				     unsigned long first_vpfn,
2693 				     unsigned long last_vpfn)
2694 {
2695 	/*
2696 	 * The RMRR range might overlap with a physical memory range,
2697 	 * so clear it first.
2698 	 */
2699 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2700 
2701 	return __domain_mapping(domain, first_vpfn,
2702 				first_vpfn, last_vpfn - first_vpfn + 1,
2703 				DMA_PTE_READ|DMA_PTE_WRITE);
2704 }
2705 
2706 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2707 
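/*
 * Build the static identity (si) domain: identity-map every usable
 * memory range and all RMRR regions so that devices attached to it see
 * a 1:1 IOVA layout. With hardware pass-through (hw != 0) no page
 * tables are needed, so the mapping step is skipped.
 */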
2708 static int __init si_domain_init(int hw)
2709 {
2710 	struct dmar_rmrr_unit *rmrr;
2711 	struct device *dev;
2712 	int i, nid, ret;
2713 
2714 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2715 	if (!si_domain)
2716 		return -EFAULT;
2717 
2718 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2719 		domain_exit(si_domain);
2720 		return -EFAULT;
2721 	}
2722 
2723 	if (hw)
2724 		return 0;
2725 
2726 	for_each_online_node(nid) {
2727 		unsigned long start_pfn, end_pfn;
2728 		int i;
2729 
2730 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2731 			ret = iommu_domain_identity_map(si_domain,
2732 					mm_to_dma_pfn(start_pfn),
2733 					mm_to_dma_pfn(end_pfn));
2734 			if (ret)
2735 				return ret;
2736 		}
2737 	}
2738 
2739 	/*
2740 	 * Identity map the RMRRs so that devices with RMRRs can also use
2741 	 * the si_domain.
2742 	 */
2743 	for_each_rmrr_units(rmrr) {
2744 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2745 					  i, dev) {
2746 			unsigned long long start = rmrr->base_address;
2747 			unsigned long long end = rmrr->end_address;
2748 
2749 			if (WARN_ON(end < start ||
2750 				    end >> agaw_to_width(si_domain->agaw)))
2751 				continue;
2752 
2753 			ret = iommu_domain_identity_map(si_domain,
2754 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2755 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2756 			if (ret)
2757 				return ret;
2758 		}
2759 	}
2760 
2761 	return 0;
2762 }
2763 
2764 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2765 {
2766 	struct dmar_domain *ndomain;
2767 	struct intel_iommu *iommu;
2768 	u8 bus, devfn;
2769 
2770 	iommu = device_to_iommu(dev, &bus, &devfn);
2771 	if (!iommu)
2772 		return -ENODEV;
2773 
2774 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2775 	if (ndomain != domain)
2776 		return -EBUSY;
2777 
2778 	return 0;
2779 }
2780 
2781 static bool device_has_rmrr(struct device *dev)
2782 {
2783 	struct dmar_rmrr_unit *rmrr;
2784 	struct device *tmp;
2785 	int i;
2786 
2787 	rcu_read_lock();
2788 	for_each_rmrr_units(rmrr) {
2789 		/*
2790 		 * Return TRUE if this RMRR contains the device that
2791 		 * is passed in.
2792 		 */
2793 		for_each_active_dev_scope(rmrr->devices,
2794 					  rmrr->devices_cnt, i, tmp)
2795 			if (tmp == dev ||
2796 			    is_downstream_to_pci_bridge(dev, tmp)) {
2797 				rcu_read_unlock();
2798 				return true;
2799 			}
2800 	}
2801 	rcu_read_unlock();
2802 	return false;
2803 }
2804 
2805 /**
2806  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2807  * is relaxable (i.e. it may be left unenforced under some conditions)
2808  * @dev: device handle
2809  *
2810  * We assume that PCI USB devices with RMRRs have them largely
2811  * for historical reasons and that the RMRR space is not actively used post
2812  * boot.  This exclusion may change if vendors begin to abuse it.
2813  *
2814  * The same exception is made for graphics devices, with the requirement that
2815  * any use of the RMRR regions will be torn down before assigning the device
2816  * to a guest.
2817  *
2818  * Return: true if the RMRR is relaxable, false otherwise
2819  */
2820 static bool device_rmrr_is_relaxable(struct device *dev)
2821 {
2822 	struct pci_dev *pdev;
2823 
2824 	if (!dev_is_pci(dev))
2825 		return false;
2826 
2827 	pdev = to_pci_dev(dev);
2828 	return IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev);
2832 }
2833 
2834 /*
2835  * There are a couple cases where we need to restrict the functionality of
2836  * devices associated with RMRRs.  The first is when evaluating a device for
2837  * identity mapping because problems exist when devices are moved in and out
2838  * of domains and their respective RMRR information is lost.  This means that
2839  * a device with associated RMRRs will never be in a "passthrough" domain.
2840  * The second is use of the device through the IOMMU API.  This interface
2841  * expects to have full control of the IOVA space for the device.  We cannot
2842  * satisfy both the requirement that RMRR access is maintained and have an
2843  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2844  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2845  * We therefore prevent devices associated with an RMRR from participating in
2846  * the IOMMU API, which eliminates them from device assignment.
2847  *
2848  * In both cases, devices which have relaxable RMRRs are not concerned by this
2849  * restriction. See device_rmrr_is_relaxable comment.
2850  */
2851 static bool device_is_rmrr_locked(struct device *dev)
2852 {
2853 	if (!device_has_rmrr(dev))
2854 		return false;
2855 
2856 	if (device_rmrr_is_relaxable(dev))
2857 		return false;
2858 
2859 	return true;
2860 }
2861 
2862 /*
2863  * Return the required default domain type for a specific device.
2864  *
2865  * @dev: the device to query
2867  *
2868  * Returns:
2869  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2870  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2871  *  - 0: both identity and dynamic domains work for this device
2872  */
2873 static int device_def_domain_type(struct device *dev)
2874 {
2875 	if (dev_is_pci(dev)) {
2876 		struct pci_dev *pdev = to_pci_dev(dev);
2877 
2878 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2879 			return IOMMU_DOMAIN_IDENTITY;
2880 
2881 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2882 			return IOMMU_DOMAIN_IDENTITY;
2883 	}
2884 
2885 	return 0;
2886 }
2887 
2888 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2889 {
2890 	/*
2891 	 * Start from a sane iommu hardware state.
2892 	 * If queued invalidation was already initialized by us
2893 	 * (for example, while enabling interrupt remapping) then
2894 	 * things are already rolling from a sane state.
2895 	 */
2896 	if (!iommu->qi) {
2897 		/*
2898 		 * Clear any previous faults.
2899 		 */
2900 		dmar_fault(-1, iommu);
2901 		/*
2902 		 * Disable queued invalidation if supported and already enabled
2903 		 * before OS handover.
2904 		 */
2905 		dmar_disable_qi(iommu);
2906 	}
2907 
2908 	if (dmar_enable_qi(iommu)) {
2909 		/*
2910 		 * Queued invalidation is not enabled; use register-based invalidation
2911 		 */
2912 		iommu->flush.flush_context = __iommu_flush_context;
2913 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2914 		pr_info("%s: Using Register based invalidation\n",
2915 			iommu->name);
2916 	} else {
2917 		iommu->flush.flush_context = qi_flush_context;
2918 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2919 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2920 	}
2921 }
2922 
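/*
 * Copy one bus's worth of context entries from the previous kernel's
 * tables referenced by @old_re into freshly allocated tables, marking
 * every domain ID found as in use and tagging each entry as copied.
 * @ext selects the extended root/context format, which uses two
 * context tables per bus.
 */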
2923 static int copy_context_table(struct intel_iommu *iommu,
2924 			      struct root_entry *old_re,
2925 			      struct context_entry **tbl,
2926 			      int bus, bool ext)
2927 {
2928 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2929 	struct context_entry *new_ce = NULL, ce;
2930 	struct context_entry *old_ce = NULL;
2931 	struct root_entry re;
2932 	phys_addr_t old_ce_phys;
2933 
2934 	tbl_idx = ext ? bus * 2 : bus;
2935 	memcpy(&re, old_re, sizeof(re));
2936 
2937 	for (devfn = 0; devfn < 256; devfn++) {
2938 		/* First calculate the correct index */
2939 		idx = (ext ? devfn * 2 : devfn) % 256;
2940 
2941 		if (idx == 0) {
2942 			/* First save what we may have and clean up */
2943 			if (new_ce) {
2944 				tbl[tbl_idx] = new_ce;
2945 				__iommu_flush_cache(iommu, new_ce,
2946 						    VTD_PAGE_SIZE);
2947 				pos = 1;
2948 			}
2949 
2950 			if (old_ce)
2951 				memunmap(old_ce);
2952 
2953 			ret = 0;
2954 			if (devfn < 0x80)
2955 				old_ce_phys = root_entry_lctp(&re);
2956 			else
2957 				old_ce_phys = root_entry_uctp(&re);
2958 
2959 			if (!old_ce_phys) {
2960 				if (ext && devfn == 0) {
2961 					/* No LCTP, try UCTP */
2962 					devfn = 0x7f;
2963 					continue;
2964 				} else {
2965 					goto out;
2966 				}
2967 			}
2968 
2969 			ret = -ENOMEM;
2970 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2971 					MEMREMAP_WB);
2972 			if (!old_ce)
2973 				goto out;
2974 
2975 			new_ce = alloc_pgtable_page(iommu->node);
2976 			if (!new_ce)
2977 				goto out_unmap;
2978 
2979 			ret = 0;
2980 		}
2981 
2982 		/* Now copy the context entry */
2983 		memcpy(&ce, old_ce + idx, sizeof(ce));
2984 
2985 		if (!__context_present(&ce))
2986 			continue;
2987 
2988 		did = context_domain_id(&ce);
2989 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2990 			set_bit(did, iommu->domain_ids);
2991 
2992 		/*
2993 		 * We need a marker for copied context entries. This
2994 		 * marker needs to work for the old format as well as
2995 		 * for extended context entries.
2996 		 *
2997 		 * Bit 67 of the context entry is used. In the old
2998 		 * format this bit is available to software, in the
2999 		 * extended format it is the PGE bit, but PGE is ignored
3000 		 * by HW if PASIDs are disabled (and thus still
3001 		 * available).
3002 		 *
3003 		 * So disable PASIDs first and then mark the entry
3004 		 * copied. This means that we don't copy PASID
3005 		 * translations from the old kernel, but this is fine as
3006 		 * faults there are not fatal.
3007 		 */
3008 		context_clear_pasid_enable(&ce);
3009 		context_set_copied(&ce);
3010 
3011 		new_ce[idx] = ce;
3012 	}
3013 
3014 	tbl[tbl_idx + pos] = new_ce;
3015 
3016 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3017 
3018 out_unmap:
3019 	memunmap(old_ce);
3020 
3021 out:
3022 	return ret;
3023 }
3024 
3025 static int copy_translation_tables(struct intel_iommu *iommu)
3026 {
3027 	struct context_entry **ctxt_tbls;
3028 	struct root_entry *old_rt;
3029 	phys_addr_t old_rt_phys;
3030 	int ctxt_table_entries;
3031 	unsigned long flags;
3032 	u64 rtaddr_reg;
3033 	int bus, ret;
3034 	bool new_ext, ext;
3035 
3036 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3037 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3038 	new_ext    = !!ecap_ecs(iommu->ecap);
3039 
3040 	/*
3041 	 * The RTT bit can only be changed when translation is disabled,
3042 	 * but disabling translation opens a window for data
3043 	 * corruption. So bail out and don't copy anything if we would
3044 	 * have to change the bit.
3045 	 */
3046 	if (new_ext != ext)
3047 		return -EINVAL;
3048 
3049 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3050 	if (!old_rt_phys)
3051 		return -EINVAL;
3052 
3053 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3054 	if (!old_rt)
3055 		return -ENOMEM;
3056 
3057 	/* This is too big for the stack - allocate it from slab */
3058 	ctxt_table_entries = ext ? 512 : 256;
3059 	ret = -ENOMEM;
3060 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3061 	if (!ctxt_tbls)
3062 		goto out_unmap;
3063 
3064 	for (bus = 0; bus < 256; bus++) {
3065 		ret = copy_context_table(iommu, &old_rt[bus],
3066 					 ctxt_tbls, bus, ext);
3067 		if (ret) {
3068 			pr_err("%s: Failed to copy context table for bus %d\n",
3069 				iommu->name, bus);
3070 			continue;
3071 		}
3072 	}
3073 
3074 	spin_lock_irqsave(&iommu->lock, flags);
3075 
3076 	/* Context tables are copied, now write them to the root_entry table */
3077 	for (bus = 0; bus < 256; bus++) {
3078 		int idx = ext ? bus * 2 : bus;
3079 		u64 val;
3080 
3081 		if (ctxt_tbls[idx]) {
3082 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3083 			iommu->root_entry[bus].lo = val;
3084 		}
3085 
3086 		if (!ext || !ctxt_tbls[idx + 1])
3087 			continue;
3088 
3089 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3090 		iommu->root_entry[bus].hi = val;
3091 	}
3092 
3093 	spin_unlock_irqrestore(&iommu->lock, flags);
3094 
3095 	kfree(ctxt_tbls);
3096 
3097 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3098 
3099 	ret = 0;
3100 
3101 out_unmap:
3102 	memunmap(old_rt);
3103 
3104 	return ret;
3105 }
3106 
3107 #ifdef CONFIG_INTEL_IOMMU_SVM
3108 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3109 {
3110 	struct intel_iommu *iommu = data;
3111 	ioasid_t ioasid;
3112 
3113 	if (!iommu)
3114 		return INVALID_IOASID;
3115 	/*
3116 	 * The VT-d virtual command interface always uses the full 20-bit
3117 	 * PASID range. The host can partition the guest PASID range based
3118 	 * on its policies, but that is outside the guest's control.
3119 	 */
3120 	if (min < PASID_MIN || max > intel_pasid_max_id)
3121 		return INVALID_IOASID;
3122 
3123 	if (vcmd_alloc_pasid(iommu, &ioasid))
3124 		return INVALID_IOASID;
3125 
3126 	return ioasid;
3127 }
3128 
3129 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3130 {
3131 	struct intel_iommu *iommu = data;
3132 
3133 	if (!iommu)
3134 		return;
3135 	/*
3136 	 * Sanity checking of the ioasid owner is done at the upper layer,
3137 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3138 	 */
3139 	if (ioasid_find(NULL, ioasid, NULL)) {
3140 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3141 		return;
3142 	}
3143 	vcmd_free_pasid(iommu, ioasid);
3144 }
3145 
3146 static void register_pasid_allocator(struct intel_iommu *iommu)
3147 {
3148 	/*
3149 	 * If we are running in the host, there is no need for a custom
3150 	 * allocator since PASIDs are allocated system-wide by the host.
3151 	 */
3152 	if (!cap_caching_mode(iommu->cap))
3153 		return;
3154 
3155 	if (!sm_supported(iommu)) {
3156 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3157 		return;
3158 	}
3159 
3160 	/*
3161 	 * Register a custom PASID allocator if we are running in a guest;
3162 	 * guest PASIDs must be obtained via the virtual command interface.
3163 	 * There can be multiple vIOMMUs in each guest but only one allocator
3164 	 * is active. All vIOMMU allocators will eventually be calling the same
3165 	 * host allocator.
3166 	 */
3167 	if (!vccap_pasid(iommu->vccap))
3168 		return;
3169 
3170 	pr_info("Register custom PASID allocator\n");
3171 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3172 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3173 	iommu->pasid_allocator.pdata = (void *)iommu;
3174 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3175 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3176 		/*
3177 		 * Disable scalable mode on this IOMMU if there
3178 		 * is no custom allocator. Mixing SM-capable and
3179 		 * non-SM vIOMMUs is not supported.
3180 		 */
3181 		intel_iommu_sm = 0;
3182 	}
3183 }
3184 #endif
3185 
3186 static int __init init_dmars(void)
3187 {
3188 	struct dmar_drhd_unit *drhd;
3189 	struct intel_iommu *iommu;
3190 	int ret;
3191 
3192 	/*
3193 	 * for each drhd
3194 	 *    allocate root
3195 	 *    initialize and program root entry to not present
3196 	 * endfor
3197 	 */
3198 	for_each_drhd_unit(drhd) {
3199 		/*
3200 		 * Lock not needed as this is only incremented in the
3201 		 * single-threaded kernel __init code path; all other
3202 		 * accesses are read-only.
3203 		 */
3204 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3205 			g_num_of_iommus++;
3206 			continue;
3207 		}
3208 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3209 	}
3210 
3211 	/* Preallocate enough resources for IOMMU hot-addition */
3212 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3213 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3214 
3215 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3216 			GFP_KERNEL);
3217 	if (!g_iommus) {
3218 		pr_err("Allocating global iommu array failed\n");
3219 		ret = -ENOMEM;
3220 		goto error;
3221 	}
3222 
3223 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3224 	if (ret)
3225 		goto free_iommu;
3226 
3227 	for_each_iommu(iommu, drhd) {
3228 		if (drhd->ignored) {
3229 			iommu_disable_translation(iommu);
3230 			continue;
3231 		}
3232 
3233 		/*
3234 		 * Find the max PASID size of all IOMMUs in the system.
3235 		 * We need to ensure the system PASID table is no bigger
3236 		 * than the smallest supported size.
3237 		 */
3238 		if (pasid_supported(iommu)) {
3239 			u32 temp = 2 << ecap_pss(iommu->ecap);
3240 
3241 			intel_pasid_max_id = min_t(u32, temp,
3242 						   intel_pasid_max_id);
3243 		}
3244 
3245 		g_iommus[iommu->seq_id] = iommu;
3246 
3247 		intel_iommu_init_qi(iommu);
3248 
3249 		ret = iommu_init_domains(iommu);
3250 		if (ret)
3251 			goto free_iommu;
3252 
3253 		init_translation_status(iommu);
3254 
3255 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3256 			iommu_disable_translation(iommu);
3257 			clear_translation_pre_enabled(iommu);
3258 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3259 				iommu->name);
3260 		}
3261 
3262 		/*
3263 		 * TBD:
3264 		 * we could share the same root & context tables
3265 		 * among all IOMMUs. This needs to be split out later.
3266 		 */
3267 		ret = iommu_alloc_root_entry(iommu);
3268 		if (ret)
3269 			goto free_iommu;
3270 
3271 		if (translation_pre_enabled(iommu)) {
3272 			pr_info("Translation already enabled - trying to copy translation structures\n");
3273 
3274 			ret = copy_translation_tables(iommu);
3275 			if (ret) {
3276 				/*
3277 				 * We found the IOMMU with translation
3278 				 * enabled - but failed to copy over the
3279 				 * old root-entry table. Try to proceed
3280 				 * by disabling translation now and
3281 				 * allocating a clean root-entry table.
3282 				 * This might cause DMAR faults, but
3283 				 * probably the dump will still succeed.
3284 				 */
3285 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3286 				       iommu->name);
3287 				iommu_disable_translation(iommu);
3288 				clear_translation_pre_enabled(iommu);
3289 			} else {
3290 				pr_info("Copied translation tables from previous kernel for %s\n",
3291 					iommu->name);
3292 			}
3293 		}
3294 
3295 		if (!ecap_pass_through(iommu->ecap))
3296 			hw_pass_through = 0;
3297 		intel_svm_check(iommu);
3298 	}
3299 
3300 	/*
3301 	 * Now that qi is enabled on all iommus, set the root entry and flush
3302 	 * caches. This is required on some Intel X58 chipsets; otherwise the
3303 	 * flush_context function will loop forever and the boot hangs.
3304 	 */
3305 	for_each_active_iommu(iommu, drhd) {
3306 		iommu_flush_write_buffer(iommu);
3307 #ifdef CONFIG_INTEL_IOMMU_SVM
3308 		register_pasid_allocator(iommu);
3309 #endif
3310 		iommu_set_root_entry(iommu);
3311 	}
3312 
3313 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3314 	dmar_map_gfx = 0;
3315 #endif
3316 
3317 	if (!dmar_map_gfx)
3318 		iommu_identity_mapping |= IDENTMAP_GFX;
3319 
3320 	check_tylersburg_isoch();
3321 
3322 	ret = si_domain_init(hw_pass_through);
3323 	if (ret)
3324 		goto free_iommu;
3325 
3326 	/*
3327 	 * for each drhd
3328 	 *   enable fault log
3329 	 *   global invalidate context cache
3330 	 *   global invalidate iotlb
3331 	 *   enable translation
3332 	 */
3333 	for_each_iommu(iommu, drhd) {
3334 		if (drhd->ignored) {
3335 			/*
3336 			 * we always have to disable PMRs or DMA may fail on
3337 			 * this device
3338 			 */
3339 			if (force_on)
3340 				iommu_disable_protect_mem_regions(iommu);
3341 			continue;
3342 		}
3343 
3344 		iommu_flush_write_buffer(iommu);
3345 
3346 #ifdef CONFIG_INTEL_IOMMU_SVM
3347 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3348 			/*
3349 			 * Calling dmar_alloc_hwirq() with dmar_global_lock
3350 			 * held could cause a lock race condition.
3351 			 */
3352 			up_write(&dmar_global_lock);
3353 			ret = intel_svm_enable_prq(iommu);
3354 			down_write(&dmar_global_lock);
3355 			if (ret)
3356 				goto free_iommu;
3357 		}
3358 #endif
3359 		ret = dmar_set_interrupt(iommu);
3360 		if (ret)
3361 			goto free_iommu;
3362 	}
3363 
3364 	return 0;
3365 
3366 free_iommu:
3367 	for_each_active_iommu(iommu, drhd) {
3368 		disable_dmar_iommu(iommu);
3369 		free_dmar_iommu(iommu);
3370 	}
3371 
3372 	kfree(g_iommus);
3373 
3374 error:
3375 	return ret;
3376 }
3377 
3378 static inline int iommu_domain_cache_init(void)
3379 {
3380 	int ret = 0;
3381 
3382 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3383 					 sizeof(struct dmar_domain),
3384 					 0,
3385 					 SLAB_HWCACHE_ALIGN,
3387 					 NULL);
3388 	if (!iommu_domain_cache) {
3389 		pr_err("Couldn't create iommu_domain cache\n");
3390 		ret = -ENOMEM;
3391 	}
3392 
3393 	return ret;
3394 }
3395 
3396 static inline int iommu_devinfo_cache_init(void)
3397 {
3398 	int ret = 0;
3399 
3400 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3401 					 sizeof(struct device_domain_info),
3402 					 0,
3403 					 SLAB_HWCACHE_ALIGN,
3404 					 NULL);
3405 	if (!iommu_devinfo_cache) {
3406 		pr_err("Couldn't create devinfo cache\n");
3407 		ret = -ENOMEM;
3408 	}
3409 
3410 	return ret;
3411 }
3412 
3413 static int __init iommu_init_mempool(void)
3414 {
3415 	int ret;
3416 	ret = iova_cache_get();
3417 	if (ret)
3418 		return ret;
3419 
3420 	ret = iommu_domain_cache_init();
3421 	if (ret)
3422 		goto domain_error;
3423 
3424 	ret = iommu_devinfo_cache_init();
3425 	if (!ret)
3426 		return ret;
3427 
3428 	kmem_cache_destroy(iommu_domain_cache);
3429 domain_error:
3430 	iova_cache_put();
3431 
3432 	return -ENOMEM;
3433 }
3434 
3435 static void __init iommu_exit_mempool(void)
3436 {
3437 	kmem_cache_destroy(iommu_devinfo_cache);
3438 	kmem_cache_destroy(iommu_domain_cache);
3439 	iova_cache_put();
3440 }
3441 
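/*
 * Mark DMAR units that can be skipped: units whose device scope is
 * empty, and (when dmar_map_gfx is clear) units that cover graphics
 * devices only.
 */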
3442 static void __init init_no_remapping_devices(void)
3443 {
3444 	struct dmar_drhd_unit *drhd;
3445 	struct device *dev;
3446 	int i;
3447 
3448 	for_each_drhd_unit(drhd) {
3449 		if (!drhd->include_all) {
3450 			for_each_active_dev_scope(drhd->devices,
3451 						  drhd->devices_cnt, i, dev)
3452 				break;
3453 			/* ignore DMAR unit if no devices exist */
3454 			if (i == drhd->devices_cnt)
3455 				drhd->ignored = 1;
3456 		}
3457 	}
3458 
3459 	for_each_active_drhd_unit(drhd) {
3460 		if (drhd->include_all)
3461 			continue;
3462 
3463 		for_each_active_dev_scope(drhd->devices,
3464 					  drhd->devices_cnt, i, dev)
3465 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3466 				break;
3467 		if (i < drhd->devices_cnt)
3468 			continue;
3469 
3470 		/* This IOMMU has *only* gfx devices. Either bypass it or
3471 		   set the gfx_dedicated flag, as appropriate */
3472 		drhd->gfx_dedicated = 1;
3473 		if (!dmar_map_gfx)
3474 			drhd->ignored = 1;
3475 	}
3476 }
3477 
3478 #ifdef CONFIG_SUSPEND
3479 static int init_iommu_hw(void)
3480 {
3481 	struct dmar_drhd_unit *drhd;
3482 	struct intel_iommu *iommu = NULL;
3483 
3484 	for_each_active_iommu(iommu, drhd)
3485 		if (iommu->qi)
3486 			dmar_reenable_qi(iommu);
3487 
3488 	for_each_iommu(iommu, drhd) {
3489 		if (drhd->ignored) {
3490 			/*
3491 			 * we always have to disable PMRs or DMA may fail on
3492 			 * this device
3493 			 */
3494 			if (force_on)
3495 				iommu_disable_protect_mem_regions(iommu);
3496 			continue;
3497 		}
3498 
3499 		iommu_flush_write_buffer(iommu);
3500 		iommu_set_root_entry(iommu);
3501 		iommu_enable_translation(iommu);
3502 		iommu_disable_protect_mem_regions(iommu);
3503 	}
3504 
3505 	return 0;
3506 }
3507 
3508 static void iommu_flush_all(void)
3509 {
3510 	struct dmar_drhd_unit *drhd;
3511 	struct intel_iommu *iommu;
3512 
3513 	for_each_active_iommu(iommu, drhd) {
3514 		iommu->flush.flush_context(iommu, 0, 0, 0,
3515 					   DMA_CCMD_GLOBAL_INVL);
3516 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3517 					 DMA_TLB_GLOBAL_FLUSH);
3518 	}
3519 }
3520 
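/*
 * On suspend, flush all caches, disable translation and save the
 * fault-event registers; iommu_resume() restores them once
 * init_iommu_hw() has re-enabled the hardware.
 */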
3521 static int iommu_suspend(void)
3522 {
3523 	struct dmar_drhd_unit *drhd;
3524 	struct intel_iommu *iommu = NULL;
3525 	unsigned long flag;
3526 
3527 	for_each_active_iommu(iommu, drhd) {
3528 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3529 					     GFP_KERNEL);
3530 		if (!iommu->iommu_state)
3531 			goto nomem;
3532 	}
3533 
3534 	iommu_flush_all();
3535 
3536 	for_each_active_iommu(iommu, drhd) {
3537 		iommu_disable_translation(iommu);
3538 
3539 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3540 
3541 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3542 			readl(iommu->reg + DMAR_FECTL_REG);
3543 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3544 			readl(iommu->reg + DMAR_FEDATA_REG);
3545 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3546 			readl(iommu->reg + DMAR_FEADDR_REG);
3547 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3548 			readl(iommu->reg + DMAR_FEUADDR_REG);
3549 
3550 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3551 	}
3552 	return 0;
3553 
3554 nomem:
3555 	for_each_active_iommu(iommu, drhd)
3556 		kfree(iommu->iommu_state);
3557 
3558 	return -ENOMEM;
3559 }
3560 
3561 static void iommu_resume(void)
3562 {
3563 	struct dmar_drhd_unit *drhd;
3564 	struct intel_iommu *iommu = NULL;
3565 	unsigned long flag;
3566 
3567 	if (init_iommu_hw()) {
3568 		if (force_on)
3569 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3570 		else
3571 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3572 		return;
3573 	}
3574 
3575 	for_each_active_iommu(iommu, drhd) {
3576 
3577 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3578 
3579 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3580 			iommu->reg + DMAR_FECTL_REG);
3581 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3582 			iommu->reg + DMAR_FEDATA_REG);
3583 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3584 			iommu->reg + DMAR_FEADDR_REG);
3585 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3586 			iommu->reg + DMAR_FEUADDR_REG);
3587 
3588 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3589 	}
3590 
3591 	for_each_active_iommu(iommu, drhd)
3592 		kfree(iommu->iommu_state);
3593 }
3594 
3595 static struct syscore_ops iommu_syscore_ops = {
3596 	.resume		= iommu_resume,
3597 	.suspend	= iommu_suspend,
3598 };
3599 
3600 static void __init init_iommu_pm_ops(void)
3601 {
3602 	register_syscore_ops(&iommu_syscore_ops);
3603 }
3604 
3605 #else
3606 static inline void init_iommu_pm_ops(void) {}
3607 #endif	/* CONFIG_SUSPEND */
3608 
3609 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3610 {
3611 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3612 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3613 	    rmrr->end_address <= rmrr->base_address ||
3614 	    arch_rmrr_sanity_check(rmrr))
3615 		return -EINVAL;
3616 
3617 	return 0;
3618 }
3619 
3620 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3621 {
3622 	struct acpi_dmar_reserved_memory *rmrr;
3623 	struct dmar_rmrr_unit *rmrru;
3624 
3625 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3626 	if (rmrr_sanity_check(rmrr)) {
3627 		pr_warn(FW_BUG
3628 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3629 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3630 			   rmrr->base_address, rmrr->end_address,
3631 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3632 			   dmi_get_system_info(DMI_BIOS_VERSION),
3633 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3634 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3635 	}
3636 
3637 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3638 	if (!rmrru)
3639 		goto out;
3640 
3641 	rmrru->hdr = header;
3642 
3643 	rmrru->base_address = rmrr->base_address;
3644 	rmrru->end_address = rmrr->end_address;
3645 
3646 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3647 				((void *)rmrr) + rmrr->header.length,
3648 				&rmrru->devices_cnt);
3649 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3650 		goto free_rmrru;
3651 
3652 	list_add(&rmrru->list, &dmar_rmrr_units);
3653 
3654 	return 0;
3655 free_rmrru:
3656 	kfree(rmrru);
3657 out:
3658 	return -ENOMEM;
3659 }
3660 
3661 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3662 {
3663 	struct dmar_atsr_unit *atsru;
3664 	struct acpi_dmar_atsr *tmp;
3665 
3666 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3667 				dmar_rcu_check()) {
3668 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3669 		if (atsr->segment != tmp->segment)
3670 			continue;
3671 		if (atsr->header.length != tmp->header.length)
3672 			continue;
3673 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3674 			return atsru;
3675 	}
3676 
3677 	return NULL;
3678 }
3679 
3680 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3681 {
3682 	struct acpi_dmar_atsr *atsr;
3683 	struct dmar_atsr_unit *atsru;
3684 
3685 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3686 		return 0;
3687 
3688 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3689 	atsru = dmar_find_atsr(atsr);
3690 	if (atsru)
3691 		return 0;
3692 
3693 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3694 	if (!atsru)
3695 		return -ENOMEM;
3696 
3697 	/*
3698 	 * If the memory is allocated from slab by the ACPI _DSM method, we need to
3699 	 * copy the memory content because the memory buffer will be freed
3700 	 * on return.
3701 	 */
3702 	atsru->hdr = (void *)(atsru + 1);
3703 	memcpy(atsru->hdr, hdr, hdr->length);
3704 	atsru->include_all = atsr->flags & 0x1;
3705 	if (!atsru->include_all) {
3706 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3707 				(void *)atsr + atsr->header.length,
3708 				&atsru->devices_cnt);
3709 		if (atsru->devices_cnt && atsru->devices == NULL) {
3710 			kfree(atsru);
3711 			return -ENOMEM;
3712 		}
3713 	}
3714 
3715 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3716 
3717 	return 0;
3718 }
3719 
3720 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3721 {
3722 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3723 	kfree(atsru);
3724 }
3725 
3726 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3727 {
3728 	struct acpi_dmar_atsr *atsr;
3729 	struct dmar_atsr_unit *atsru;
3730 
3731 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3732 	atsru = dmar_find_atsr(atsr);
3733 	if (atsru) {
3734 		list_del_rcu(&atsru->list);
3735 		synchronize_rcu();
3736 		intel_iommu_free_atsr(atsru);
3737 	}
3738 
3739 	return 0;
3740 }
3741 
3742 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3743 {
3744 	int i;
3745 	struct device *dev;
3746 	struct acpi_dmar_atsr *atsr;
3747 	struct dmar_atsr_unit *atsru;
3748 
3749 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3750 	atsru = dmar_find_atsr(atsr);
3751 	if (!atsru)
3752 		return 0;
3753 
3754 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3755 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3756 					  i, dev)
3757 			return -EBUSY;
3758 	}
3759 
3760 	return 0;
3761 }
3762 
3763 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3764 {
3765 	struct dmar_satc_unit *satcu;
3766 	struct acpi_dmar_satc *tmp;
3767 
3768 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3769 				dmar_rcu_check()) {
3770 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3771 		if (satc->segment != tmp->segment)
3772 			continue;
3773 		if (satc->header.length != tmp->header.length)
3774 			continue;
3775 		if (memcmp(satc, tmp, satc->header.length) == 0)
3776 			return satcu;
3777 	}
3778 
3779 	return NULL;
3780 }
3781 
3782 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3783 {
3784 	struct acpi_dmar_satc *satc;
3785 	struct dmar_satc_unit *satcu;
3786 
3787 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3788 		return 0;
3789 
3790 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3791 	satcu = dmar_find_satc(satc);
3792 	if (satcu)
3793 		return 0;
3794 
3795 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3796 	if (!satcu)
3797 		return -ENOMEM;
3798 
3799 	satcu->hdr = (void *)(satcu + 1);
3800 	memcpy(satcu->hdr, hdr, hdr->length);
3801 	satcu->atc_required = satc->flags & 0x1;
3802 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3803 					      (void *)satc + satc->header.length,
3804 					      &satcu->devices_cnt);
3805 	if (satcu->devices_cnt && !satcu->devices) {
3806 		kfree(satcu);
3807 		return -ENOMEM;
3808 	}
3809 	list_add_rcu(&satcu->list, &dmar_satc_units);
3810 
3811 	return 0;
3812 }
3813 
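/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the running
 * configuration, allocate domain IDs and a root entry, then enable queued
 * invalidation, the fault interrupt and translation (unless it is ignored).
 */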
3814 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3815 {
3816 	int sp, ret;
3817 	struct intel_iommu *iommu = dmaru->iommu;
3818 
3819 	if (g_iommus[iommu->seq_id])
3820 		return 0;
3821 
3822 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3823 	if (ret)
3824 		goto out;
3825 
3826 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3827 		pr_warn("%s: Doesn't support hardware pass through.\n",
3828 			iommu->name);
3829 		return -ENXIO;
3830 	}
3831 	if (!ecap_sc_support(iommu->ecap) &&
3832 	    domain_update_iommu_snooping(iommu)) {
3833 		pr_warn("%s: Doesn't support snooping.\n",
3834 			iommu->name);
3835 		return -ENXIO;
3836 	}
3837 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3838 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3839 		pr_warn("%s: Doesn't support large page.\n",
3840 			iommu->name);
3841 		return -ENXIO;
3842 	}
3843 
3844 	/*
3845 	 * Disable translation if already enabled prior to OS handover.
3846 	 */
3847 	if (iommu->gcmd & DMA_GCMD_TE)
3848 		iommu_disable_translation(iommu);
3849 
3850 	g_iommus[iommu->seq_id] = iommu;
3851 	ret = iommu_init_domains(iommu);
3852 	if (ret == 0)
3853 		ret = iommu_alloc_root_entry(iommu);
3854 	if (ret)
3855 		goto out;
3856 
3857 	intel_svm_check(iommu);
3858 
3859 	if (dmaru->ignored) {
3860 		/*
3861 		 * we always have to disable PMRs or DMA may fail on this device
3862 		 */
3863 		if (force_on)
3864 			iommu_disable_protect_mem_regions(iommu);
3865 		return 0;
3866 	}
3867 
3868 	intel_iommu_init_qi(iommu);
3869 	iommu_flush_write_buffer(iommu);
3870 
3871 #ifdef CONFIG_INTEL_IOMMU_SVM
3872 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3873 		ret = intel_svm_enable_prq(iommu);
3874 		if (ret)
3875 			goto disable_iommu;
3876 	}
3877 #endif
3878 	ret = dmar_set_interrupt(iommu);
3879 	if (ret)
3880 		goto disable_iommu;
3881 
3882 	iommu_set_root_entry(iommu);
3883 	iommu_enable_translation(iommu);
3884 
3885 	iommu_disable_protect_mem_regions(iommu);
3886 	return 0;
3887 
3888 disable_iommu:
3889 	disable_dmar_iommu(iommu);
3890 out:
3891 	free_dmar_iommu(iommu);
3892 	return ret;
3893 }
3894 
3895 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3896 {
3897 	int ret = 0;
3898 	struct intel_iommu *iommu = dmaru->iommu;
3899 
3900 	if (!intel_iommu_enabled)
3901 		return 0;
3902 	if (iommu == NULL)
3903 		return -EINVAL;
3904 
3905 	if (insert) {
3906 		ret = intel_iommu_add(dmaru);
3907 	} else {
3908 		disable_dmar_iommu(iommu);
3909 		free_dmar_iommu(iommu);
3910 	}
3911 
3912 	return ret;
3913 }
3914 
3915 static void intel_iommu_free_dmars(void)
3916 {
3917 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3918 	struct dmar_atsr_unit *atsru, *atsr_n;
3919 	struct dmar_satc_unit *satcu, *satc_n;
3920 
3921 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3922 		list_del(&rmrru->list);
3923 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3924 		kfree(rmrru);
3925 	}
3926 
3927 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3928 		list_del(&atsru->list);
3929 		intel_iommu_free_atsr(atsru);
3930 	}
3931 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3932 		list_del(&satcu->list);
3933 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3934 		kfree(satcu);
3935 	}
3936 }
3937 
3938 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3939 {
3940 	int i, ret = 1;
3941 	struct pci_bus *bus;
3942 	struct pci_dev *bridge = NULL;
3943 	struct device *tmp;
3944 	struct acpi_dmar_atsr *atsr;
3945 	struct dmar_atsr_unit *atsru;
3946 
3947 	dev = pci_physfn(dev);
3948 	for (bus = dev->bus; bus; bus = bus->parent) {
3949 		bridge = bus->self;
3950 		/* If it's an integrated device, allow ATS */
3951 		if (!bridge)
3952 			return 1;
3953 		/* Connected via non-PCIe: no ATS */
3954 		if (!pci_is_pcie(bridge) ||
3955 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3956 			return 0;
3957 		/* If we found the root port, look it up in the ATSR */
3958 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3959 			break;
3960 	}
3961 
3962 	rcu_read_lock();
3963 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3964 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3965 		if (atsr->segment != pci_domain_nr(dev->bus))
3966 			continue;
3967 
3968 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3969 			if (tmp == &bridge->dev)
3970 				goto out;
3971 
3972 		if (atsru->include_all)
3973 			goto out;
3974 	}
3975 	ret = 0;
3976 out:
3977 	rcu_read_unlock();
3978 
3979 	return ret;
3980 }
3981 
3982 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3983 {
3984 	int ret;
3985 	struct dmar_rmrr_unit *rmrru;
3986 	struct dmar_atsr_unit *atsru;
3987 	struct dmar_satc_unit *satcu;
3988 	struct acpi_dmar_atsr *atsr;
3989 	struct acpi_dmar_reserved_memory *rmrr;
3990 	struct acpi_dmar_satc *satc;
3991 
3992 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3993 		return 0;
3994 
3995 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3996 		rmrr = container_of(rmrru->hdr,
3997 				    struct acpi_dmar_reserved_memory, header);
3998 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3999 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4000 				((void *)rmrr) + rmrr->header.length,
4001 				rmrr->segment, rmrru->devices,
4002 				rmrru->devices_cnt);
4003 			if (ret < 0)
4004 				return ret;
4005 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4006 			dmar_remove_dev_scope(info, rmrr->segment,
4007 				rmrru->devices, rmrru->devices_cnt);
4008 		}
4009 	}
4010 
4011 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4012 		if (atsru->include_all)
4013 			continue;
4014 
4015 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4016 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4017 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4018 					(void *)atsr + atsr->header.length,
4019 					atsr->segment, atsru->devices,
4020 					atsru->devices_cnt);
4021 			if (ret > 0)
4022 				break;
4023 			else if (ret < 0)
4024 				return ret;
4025 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4026 			if (dmar_remove_dev_scope(info, atsr->segment,
4027 					atsru->devices, atsru->devices_cnt))
4028 				break;
4029 		}
4030 	}
4031 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4032 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4033 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4034 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4035 					(void *)satc + satc->header.length,
4036 					satc->segment, satcu->devices,
4037 					satcu->devices_cnt);
4038 			if (ret > 0)
4039 				break;
4040 			else if (ret < 0)
4041 				return ret;
4042 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4043 			if (dmar_remove_dev_scope(info, satc->segment,
4044 					satcu->devices, satcu->devices_cnt))
4045 				break;
4046 		}
4047 	}
4048 
4049 	return 0;
4050 }
4051 
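/*
 * Memory hotplug notifier: keep the static identity domain (si_domain) in
 * sync by mapping memory that is about to come online and unmapping it,
 * with an IOTLB flush, when it goes offline again.
 */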
4052 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4053 				       unsigned long val, void *v)
4054 {
4055 	struct memory_notify *mhp = v;
4056 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4057 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4058 			mhp->nr_pages - 1);
4059 
4060 	switch (val) {
4061 	case MEM_GOING_ONLINE:
4062 		if (iommu_domain_identity_map(si_domain,
4063 					      start_vpfn, last_vpfn)) {
4064 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4065 				start_vpfn, last_vpfn);
4066 			return NOTIFY_BAD;
4067 		}
4068 		break;
4069 
4070 	case MEM_OFFLINE:
4071 	case MEM_CANCEL_ONLINE:
4072 		{
4073 			struct dmar_drhd_unit *drhd;
4074 			struct intel_iommu *iommu;
4075 			struct page *freelist;
4076 
4077 			freelist = domain_unmap(si_domain,
4078 						start_vpfn, last_vpfn,
4079 						NULL);
4080 
4081 			rcu_read_lock();
4082 			for_each_active_iommu(iommu, drhd)
4083 				iommu_flush_iotlb_psi(iommu, si_domain,
4084 					start_vpfn, mhp->nr_pages,
4085 					!freelist, 0);
4086 			rcu_read_unlock();
4087 			dma_free_pagelist(freelist);
4088 		}
4089 		break;
4090 	}
4091 
4092 	return NOTIFY_OK;
4093 }
4094 
4095 static struct notifier_block intel_iommu_memory_nb = {
4096 	.notifier_call = intel_iommu_memory_notifier,
4097 	.priority = 0
4098 };
4099 
4100 static void intel_disable_iommus(void)
4101 {
4102 	struct intel_iommu *iommu = NULL;
4103 	struct dmar_drhd_unit *drhd;
4104 
4105 	for_each_iommu(iommu, drhd)
4106 		iommu_disable_translation(iommu);
4107 }
4108 
4109 void intel_iommu_shutdown(void)
4110 {
4111 	struct dmar_drhd_unit *drhd;
4112 	struct intel_iommu *iommu = NULL;
4113 
4114 	if (no_iommu || dmar_disabled)
4115 		return;
4116 
4117 	down_write(&dmar_global_lock);
4118 
4119 	/* Disable PMRs explicitly here. */
4120 	for_each_iommu(iommu, drhd)
4121 		iommu_disable_protect_mem_regions(iommu);
4122 
4123 	/* Make sure the IOMMUs are switched off */
4124 	intel_disable_iommus();
4125 
4126 	up_write(&dmar_global_lock);
4127 }
4128 
4129 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4130 {
4131 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4132 
4133 	return container_of(iommu_dev, struct intel_iommu, iommu);
4134 }
4135 
4136 static ssize_t version_show(struct device *dev,
4137 			    struct device_attribute *attr, char *buf)
4138 {
4139 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4140 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4141 	return sprintf(buf, "%d:%d\n",
4142 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4143 }
4144 static DEVICE_ATTR_RO(version);
4145 
4146 static ssize_t address_show(struct device *dev,
4147 			    struct device_attribute *attr, char *buf)
4148 {
4149 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4150 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4151 }
4152 static DEVICE_ATTR_RO(address);
4153 
4154 static ssize_t cap_show(struct device *dev,
4155 			struct device_attribute *attr, char *buf)
4156 {
4157 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4158 	return sprintf(buf, "%llx\n", iommu->cap);
4159 }
4160 static DEVICE_ATTR_RO(cap);
4161 
4162 static ssize_t ecap_show(struct device *dev,
4163 			 struct device_attribute *attr, char *buf)
4164 {
4165 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4166 	return sprintf(buf, "%llx\n", iommu->ecap);
4167 }
4168 static DEVICE_ATTR_RO(ecap);
4169 
4170 static ssize_t domains_supported_show(struct device *dev,
4171 				      struct device_attribute *attr, char *buf)
4172 {
4173 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4174 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4175 }
4176 static DEVICE_ATTR_RO(domains_supported);
4177 
4178 static ssize_t domains_used_show(struct device *dev,
4179 				 struct device_attribute *attr, char *buf)
4180 {
4181 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4182 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4183 						  cap_ndoms(iommu->cap)));
4184 }
4185 static DEVICE_ATTR_RO(domains_used);
4186 
4187 static struct attribute *intel_iommu_attrs[] = {
4188 	&dev_attr_version.attr,
4189 	&dev_attr_address.attr,
4190 	&dev_attr_cap.attr,
4191 	&dev_attr_ecap.attr,
4192 	&dev_attr_domains_supported.attr,
4193 	&dev_attr_domains_used.attr,
4194 	NULL,
4195 };
4196 
4197 static struct attribute_group intel_iommu_group = {
4198 	.name = "intel-iommu",
4199 	.attrs = intel_iommu_attrs,
4200 };
4201 
4202 const struct attribute_group *intel_iommu_groups[] = {
4203 	&intel_iommu_group,
4204 	NULL,
4205 };
4206 
4207 static inline bool has_external_pci(void)
4208 {
4209 	struct pci_dev *pdev = NULL;
4210 
4211 	for_each_pci_dev(pdev)
4212 		if (pdev->external_facing)
4213 			return true;
4214 
4215 	return false;
4216 }
4217 
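/*
 * Enforce the firmware's DMAR platform opt-in: when the platform requests it
 * and an external-facing PCI port exists, override no_iommu/dmar_disabled so
 * that untrusted devices cannot bypass DMA remapping.
 */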
4218 static int __init platform_optin_force_iommu(void)
4219 {
4220 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4221 		return 0;
4222 
4223 	if (no_iommu || dmar_disabled)
4224 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4225 
4226 	/*
4227 	 * If Intel-IOMMU is disabled by default, we will apply identity
4228 	 * map for all devices except those marked as being untrusted.
4229 	 */
4230 	if (dmar_disabled)
4231 		iommu_set_default_passthrough(false);
4232 
4233 	dmar_disabled = 0;
4234 	no_iommu = 0;
4235 
4236 	return 1;
4237 }
4238 
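/*
 * ACPI namespace devices listed in the DRHD device scopes do not sit on the
 * PCI bus, so the bus notifier never sees them; probe the physical nodes
 * behind each such ACPI device here instead.
 */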
4239 static int __init probe_acpi_namespace_devices(void)
4240 {
4241 	struct dmar_drhd_unit *drhd;
4242 	/* To avoid a -Wunused-but-set-variable warning. */
4243 	struct intel_iommu *iommu __maybe_unused;
4244 	struct device *dev;
4245 	int i, ret = 0;
4246 
4247 	for_each_active_iommu(iommu, drhd) {
4248 		for_each_active_dev_scope(drhd->devices,
4249 					  drhd->devices_cnt, i, dev) {
4250 			struct acpi_device_physical_node *pn;
4251 			struct iommu_group *group;
4252 			struct acpi_device *adev;
4253 
4254 			if (dev->bus != &acpi_bus_type)
4255 				continue;
4256 
4257 			adev = to_acpi_device(dev);
4258 			mutex_lock(&adev->physical_node_lock);
4259 			list_for_each_entry(pn,
4260 					    &adev->physical_node_list, node) {
4261 				group = iommu_group_get(pn->dev);
4262 				if (group) {
4263 					iommu_group_put(group);
4264 					continue;
4265 				}
4266 
4267 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4268 				ret = iommu_probe_device(pn->dev);
4269 				if (ret)
4270 					break;
4271 			}
4272 			mutex_unlock(&adev->physical_node_lock);
4273 
4274 			if (ret)
4275 				return ret;
4276 		}
4277 	}
4278 
4279 	return 0;
4280 }
4281 
4282 int __init intel_iommu_init(void)
4283 {
4284 	int ret = -ENODEV;
4285 	struct dmar_drhd_unit *drhd;
4286 	struct intel_iommu *iommu;
4287 
4288 	/*
4289 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4290 	 * opt in, so enforce that.
4291 	 */
4292 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4293 		    platform_optin_force_iommu();
4294 
4295 	if (iommu_init_mempool()) {
4296 		if (force_on)
4297 			panic("tboot: Failed to initialize iommu memory\n");
4298 		return -ENOMEM;
4299 	}
4300 
4301 	down_write(&dmar_global_lock);
4302 	if (dmar_table_init()) {
4303 		if (force_on)
4304 			panic("tboot: Failed to initialize DMAR table\n");
4305 		goto out_free_dmar;
4306 	}
4307 
4308 	if (dmar_dev_scope_init() < 0) {
4309 		if (force_on)
4310 			panic("tboot: Failed to initialize DMAR device scope\n");
4311 		goto out_free_dmar;
4312 	}
4313 
4314 	up_write(&dmar_global_lock);
4315 
4316 	/*
4317 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4318 	 * complain later when we register it under the lock.
4319 	 */
4320 	dmar_register_bus_notifier();
4321 
4322 	down_write(&dmar_global_lock);
4323 
4324 	if (!no_iommu)
4325 		intel_iommu_debugfs_init();
4326 
4327 	if (no_iommu || dmar_disabled) {
4328 		/*
4329 		 * We exit the function here to ensure the IOMMU's remapping and
4330 		 * mempool aren't set up, which means that the IOMMU's PMRs
4331 		 * won't be disabled via the call to init_dmars(). So disable
4332 		 * them explicitly here. The PMRs were set up by tboot prior to
4333 		 * calling SENTER, but the kernel is expected to reset/tear
4334 		 * down the PMRs.
4335 		 */
4336 		if (intel_iommu_tboot_noforce) {
4337 			for_each_iommu(iommu, drhd)
4338 				iommu_disable_protect_mem_regions(iommu);
4339 		}
4340 
4341 		/*
4342 		 * Make sure the IOMMUs are switched off, even when we
4343 		 * boot into a kexec kernel and the previous kernel left
4344 		 * them enabled
4345 		 */
4346 		intel_disable_iommus();
4347 		goto out_free_dmar;
4348 	}
4349 
4350 	if (list_empty(&dmar_rmrr_units))
4351 		pr_info("No RMRR found\n");
4352 
4353 	if (list_empty(&dmar_atsr_units))
4354 		pr_info("No ATSR found\n");
4355 
4356 	if (list_empty(&dmar_satc_units))
4357 		pr_info("No SATC found\n");
4358 
4359 	if (dmar_map_gfx)
4360 		intel_iommu_gfx_mapped = 1;
4361 
4362 	init_no_remapping_devices();
4363 
4364 	ret = init_dmars();
4365 	if (ret) {
4366 		if (force_on)
4367 			panic("tboot: Failed to initialize DMARs\n");
4368 		pr_err("Initialization failed\n");
4369 		goto out_free_dmar;
4370 	}
4371 	up_write(&dmar_global_lock);
4372 
4373 	init_iommu_pm_ops();
4374 
4375 	down_read(&dmar_global_lock);
4376 	for_each_active_iommu(iommu, drhd) {
4377 		/*
4378 		 * The flush queue implementation does not perform
4379 		 * page-selective invalidations that are required for efficient
4380 		 * TLB flushes in virtual environments.  The benefit of batching
4381 		 * is likely to be much lower than the overhead of synchronizing
4382 		 * the virtual and physical IOMMU page-tables.
4383 		 */
4384 		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
4385 			pr_warn("IOMMU batching is disabled due to virtualization\n");
4386 			intel_iommu_strict = 1;
4387 		}
4388 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4389 				       intel_iommu_groups,
4390 				       "%s", iommu->name);
4391 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4392 	}
4393 	up_read(&dmar_global_lock);
4394 
4395 	iommu_set_dma_strict(intel_iommu_strict);
4396 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4397 	if (si_domain && !hw_pass_through)
4398 		register_memory_notifier(&intel_iommu_memory_nb);
4399 
4400 	down_read(&dmar_global_lock);
4401 	if (probe_acpi_namespace_devices())
4402 		pr_warn("ACPI namespace devices didn't probe correctly\n");
4403 
4404 	/* Finally, we enable the DMA remapping hardware. */
4405 	for_each_iommu(iommu, drhd) {
4406 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4407 			iommu_enable_translation(iommu);
4408 
4409 		iommu_disable_protect_mem_regions(iommu);
4410 	}
4411 	up_read(&dmar_global_lock);
4412 
4413 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4414 
4415 	intel_iommu_enabled = 1;
4416 
4417 	return 0;
4418 
4419 out_free_dmar:
4420 	intel_iommu_free_dmars();
4421 	up_write(&dmar_global_lock);
4422 	iommu_exit_mempool();
4423 	return ret;
4424 }
4425 
4426 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4427 {
4428 	struct intel_iommu *iommu = opaque;
4429 
4430 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4431 	return 0;
4432 }
4433 
4434 /*
4435  * NB - intel-iommu lacks any sort of reference counting for the users of
4436  * dependent devices.  If multiple endpoints have intersecting dependent
4437  * devices, unbinding the driver from any one of them will possibly leave
4438  * the others unable to operate.
4439  */
4440 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4441 {
4442 	if (!iommu || !dev || !dev_is_pci(dev))
4443 		return;
4444 
4445 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4446 }
4447 
4448 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4449 {
4450 	struct dmar_domain *domain;
4451 	struct intel_iommu *iommu;
4452 	unsigned long flags;
4453 
4454 	assert_spin_locked(&device_domain_lock);
4455 
4456 	if (WARN_ON(!info))
4457 		return;
4458 
4459 	iommu = info->iommu;
4460 	domain = info->domain;
4461 
4462 	if (info->dev) {
4463 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4464 			intel_pasid_tear_down_entry(iommu, info->dev,
4465 					PASID_RID2PASID, false);
4466 
4467 		iommu_disable_dev_iotlb(info);
4468 		if (!dev_is_real_dma_subdevice(info->dev))
4469 			domain_context_clear(iommu, info->dev);
4470 		intel_pasid_free_table(info->dev);
4471 	}
4472 
4473 	unlink_domain_info(info);
4474 
4475 	spin_lock_irqsave(&iommu->lock, flags);
4476 	domain_detach_iommu(domain, iommu);
4477 	spin_unlock_irqrestore(&iommu->lock, flags);
4478 
4479 	free_devinfo_mem(info);
4480 }
4481 
4482 static void dmar_remove_one_dev_info(struct device *dev)
4483 {
4484 	struct device_domain_info *info;
4485 	unsigned long flags;
4486 
4487 	spin_lock_irqsave(&device_domain_lock, flags);
4488 	info = get_domain_info(dev);
4489 	if (info)
4490 		__dmar_remove_one_dev_info(info);
4491 	spin_unlock_irqrestore(&device_domain_lock, flags);
4492 }
4493 
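/*
 * Initialize a newly allocated dmar_domain for use via the IOMMU API:
 * derive the AGAW from the requested guest address width and allocate the
 * top-level page directory.
 */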
4494 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4495 {
4496 	int adjust_width;
4497 
4498 	/* calculate AGAW */
4499 	domain->gaw = guest_width;
4500 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4501 	domain->agaw = width_to_agaw(adjust_width);
4502 
4503 	domain->iommu_coherency = false;
4504 	domain->iommu_snooping = false;
4505 	domain->iommu_superpage = 0;
4506 	domain->max_addr = 0;
4507 
4508 	/* always allocate the top pgd */
4509 	domain->pgd = alloc_pgtable_page(domain->nid);
4510 	if (!domain->pgd)
4511 		return -ENOMEM;
4512 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4513 	return 0;
4514 }
4515 
4516 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4517 {
4518 	struct dmar_domain *dmar_domain;
4519 	struct iommu_domain *domain;
4520 
4521 	switch (type) {
4522 	case IOMMU_DOMAIN_DMA:
4523 	case IOMMU_DOMAIN_UNMANAGED:
4524 		dmar_domain = alloc_domain(0);
4525 		if (!dmar_domain) {
4526 			pr_err("Can't allocate dmar_domain\n");
4527 			return NULL;
4528 		}
4529 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4530 			pr_err("Domain initialization failed\n");
4531 			domain_exit(dmar_domain);
4532 			return NULL;
4533 		}
4534 
4535 		if (type == IOMMU_DOMAIN_DMA &&
4536 		    iommu_get_dma_cookie(&dmar_domain->domain))
4537 			return NULL;
4538 
4539 		domain = &dmar_domain->domain;
4540 		domain->geometry.aperture_start = 0;
4541 		domain->geometry.aperture_end   =
4542 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4543 		domain->geometry.force_aperture = true;
4544 
4545 		return domain;
4546 	case IOMMU_DOMAIN_IDENTITY:
4547 		return &si_domain->domain;
4548 	default:
4549 		return NULL;
4550 	}
4551 
4552 	return NULL;
4553 }
4554 
4555 static void intel_iommu_domain_free(struct iommu_domain *domain)
4556 {
4557 	if (domain != &si_domain->domain)
4558 		domain_exit(to_dmar_domain(domain));
4559 }
4560 
4561 /*
4562  * Check whether a @domain could be attached to the @dev through the
4563  * aux-domain attach/detach APIs.
4564  */
4565 static inline bool
4566 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4567 {
4568 	struct device_domain_info *info = get_domain_info(dev);
4569 
4570 	return info && info->auxd_enabled &&
4571 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4572 }
4573 
4574 static inline struct subdev_domain_info *
4575 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4576 {
4577 	struct subdev_domain_info *sinfo;
4578 
4579 	if (!list_empty(&domain->subdevices)) {
4580 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4581 			if (sinfo->pdev == dev)
4582 				return sinfo;
4583 		}
4584 	}
4585 
4586 	return NULL;
4587 }
4588 
4589 static int auxiliary_link_device(struct dmar_domain *domain,
4590 				 struct device *dev)
4591 {
4592 	struct device_domain_info *info = get_domain_info(dev);
4593 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4594 
4595 	assert_spin_locked(&device_domain_lock);
4596 	if (WARN_ON(!info))
4597 		return -EINVAL;
4598 
4599 	if (!sinfo) {
4600 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4601 		if (!sinfo)
4602 			return -ENOMEM;
4603 		sinfo->domain = domain;
4604 		sinfo->pdev = dev;
4605 		list_add(&sinfo->link_phys, &info->subdevices);
4606 		list_add(&sinfo->link_domain, &domain->subdevices);
4607 	}
4608 
4609 	return ++sinfo->users;
4610 }
4611 
4612 static int auxiliary_unlink_device(struct dmar_domain *domain,
4613 				   struct device *dev)
4614 {
4615 	struct device_domain_info *info = get_domain_info(dev);
4616 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4617 	int ret;
4618 
4619 	assert_spin_locked(&device_domain_lock);
4620 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4621 		return -EINVAL;
4622 
4623 	ret = --sinfo->users;
4624 	if (!ret) {
4625 		list_del(&sinfo->link_phys);
4626 		list_del(&sinfo->link_domain);
4627 		kfree(sinfo);
4628 	}
4629 
4630 	return ret;
4631 }
4632 
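/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use, link the subdevice, and program a first- or
 * second-level PASID entry for it.
 */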
4633 static int aux_domain_add_dev(struct dmar_domain *domain,
4634 			      struct device *dev)
4635 {
4636 	int ret;
4637 	unsigned long flags;
4638 	struct intel_iommu *iommu;
4639 
4640 	iommu = device_to_iommu(dev, NULL, NULL);
4641 	if (!iommu)
4642 		return -ENODEV;
4643 
4644 	if (domain->default_pasid <= 0) {
4645 		u32 pasid;
4646 
4647 		/* No private data needed for the default pasid */
4648 		pasid = ioasid_alloc(NULL, PASID_MIN,
4649 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4650 				     NULL);
4651 		if (pasid == INVALID_IOASID) {
4652 			pr_err("Can't allocate default pasid\n");
4653 			return -ENODEV;
4654 		}
4655 		domain->default_pasid = pasid;
4656 	}
4657 
4658 	spin_lock_irqsave(&device_domain_lock, flags);
4659 	ret = auxiliary_link_device(domain, dev);
4660 	if (ret <= 0)
4661 		goto link_failed;
4662 
4663 	/*
4664 	 * Subdevices from the same physical device can be attached to the
4665 	 * same domain. For such cases, only the first subdevice attachment
4666 	 * needs to go through the full steps in this function. So if ret >
4667 	 * 1, just goto out.
4668 	 */
4669 	if (ret > 1)
4670 		goto out;
4671 
4672 	/*
4673 	 * iommu->lock must be held to attach domain to iommu and setup the
4674 	 * pasid entry for second level translation.
4675 	 */
4676 	spin_lock(&iommu->lock);
4677 	ret = domain_attach_iommu(domain, iommu);
4678 	if (ret)
4679 		goto attach_failed;
4680 
4681 	/* Setup the PASID entry for mediated devices: */
4682 	if (domain_use_first_level(domain))
4683 		ret = domain_setup_first_level(iommu, domain, dev,
4684 					       domain->default_pasid);
4685 	else
4686 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4687 						     domain->default_pasid);
4688 	if (ret)
4689 		goto table_failed;
4690 
4691 	spin_unlock(&iommu->lock);
4692 out:
4693 	spin_unlock_irqrestore(&device_domain_lock, flags);
4694 
4695 	return 0;
4696 
4697 table_failed:
4698 	domain_detach_iommu(domain, iommu);
4699 attach_failed:
4700 	spin_unlock(&iommu->lock);
4701 	auxiliary_unlink_device(domain, dev);
4702 link_failed:
4703 	spin_unlock_irqrestore(&device_domain_lock, flags);
4704 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4705 		ioasid_put(domain->default_pasid);
4706 
4707 	return ret;
4708 }
4709 
4710 static void aux_domain_remove_dev(struct dmar_domain *domain,
4711 				  struct device *dev)
4712 {
4713 	struct device_domain_info *info;
4714 	struct intel_iommu *iommu;
4715 	unsigned long flags;
4716 
4717 	if (!is_aux_domain(dev, &domain->domain))
4718 		return;
4719 
4720 	spin_lock_irqsave(&device_domain_lock, flags);
4721 	info = get_domain_info(dev);
4722 	iommu = info->iommu;
4723 
4724 	if (!auxiliary_unlink_device(domain, dev)) {
4725 		spin_lock(&iommu->lock);
4726 		intel_pasid_tear_down_entry(iommu, dev,
4727 					    domain->default_pasid, false);
4728 		domain_detach_iommu(domain, iommu);
4729 		spin_unlock(&iommu->lock);
4730 	}
4731 
4732 	spin_unlock_irqrestore(&device_domain_lock, flags);
4733 
4734 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4735 		ioasid_put(domain->default_pasid);
4736 }
4737 
4738 static int prepare_domain_attach_device(struct iommu_domain *domain,
4739 					struct device *dev)
4740 {
4741 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4742 	struct intel_iommu *iommu;
4743 	int addr_width;
4744 
4745 	iommu = device_to_iommu(dev, NULL, NULL);
4746 	if (!iommu)
4747 		return -ENODEV;
4748 
4749 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4750 	    !ecap_nest(iommu->ecap)) {
4751 		dev_err(dev, "%s: iommu does not support nested translation\n",
4752 			iommu->name);
4753 		return -EINVAL;
4754 	}
4755 
4756 	/* check if this iommu agaw is sufficient for max mapped address */
4757 	addr_width = agaw_to_width(iommu->agaw);
4758 	if (addr_width > cap_mgaw(iommu->cap))
4759 		addr_width = cap_mgaw(iommu->cap);
4760 
4761 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4762 		dev_err(dev,
4763 			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4764 			__func__, addr_width, dmar_domain->max_addr);
4765 		return -EFAULT;
4766 	}
4767 	dmar_domain->gaw = addr_width;
4768 
4769 	/*
4770 	 * Knock out extra levels of page tables if necessary
4771 	 */
4772 	while (iommu->agaw < dmar_domain->agaw) {
4773 		struct dma_pte *pte;
4774 
4775 		pte = dmar_domain->pgd;
4776 		if (dma_pte_present(pte)) {
4777 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4778 			free_pgtable_page(pte);
4779 		}
4780 		dmar_domain->agaw--;
4781 	}
4782 
4783 	return 0;
4784 }
4785 
4786 static int intel_iommu_attach_device(struct iommu_domain *domain,
4787 				     struct device *dev)
4788 {
4789 	int ret;
4790 
4791 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4792 	    device_is_rmrr_locked(dev)) {
4793 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4794 		return -EPERM;
4795 	}
4796 
4797 	if (is_aux_domain(dev, domain))
4798 		return -EPERM;
4799 
4800 	/* normally dev is not mapped */
4801 	if (unlikely(domain_context_mapped(dev))) {
4802 		struct dmar_domain *old_domain;
4803 
4804 		old_domain = find_domain(dev);
4805 		if (old_domain)
4806 			dmar_remove_one_dev_info(dev);
4807 	}
4808 
4809 	ret = prepare_domain_attach_device(domain, dev);
4810 	if (ret)
4811 		return ret;
4812 
4813 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4814 }
4815 
4816 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4817 					 struct device *dev)
4818 {
4819 	int ret;
4820 
4821 	if (!is_aux_domain(dev, domain))
4822 		return -EPERM;
4823 
4824 	ret = prepare_domain_attach_device(domain, dev);
4825 	if (ret)
4826 		return ret;
4827 
4828 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4829 }
4830 
4831 static void intel_iommu_detach_device(struct iommu_domain *domain,
4832 				      struct device *dev)
4833 {
4834 	dmar_remove_one_dev_info(dev);
4835 }
4836 
4837 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4838 					  struct device *dev)
4839 {
4840 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4841 }
4842 
4843 #ifdef CONFIG_INTEL_IOMMU_SVM
4844 /*
4845  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4846  * VT-d granularity. Invalidation is typically included in the unmap operation
4847  * as a result of a DMA or VFIO unmap. However, for assigned devices the
4848  * guest owns the first-level page tables, and invalidations of the
4849  * translation caches in the guest are trapped and passed down to the host.
4850  *
4851  * The vIOMMU in the guest will only expose first-level page tables, so we
4852  * do not support IOTLB granularity for requests without a PASID (second level).
4853  *
4854  * For example, to find the VT-d granularity encoding for IOTLB
4855  * type and page selective granularity within PASID:
4856  * X: indexed by iommu cache type
4857  * Y: indexed by enum iommu_inv_granularity
4858  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4859  */
4860 
4861 static const int
4862 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4863 	/*
4864 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4865 	 * page selective (address granularity)
4866 	 */
4867 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4868 	/* PASID based dev TLBs */
4869 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4870 	/* PASID cache */
4871 	{-EINVAL, -EINVAL, -EINVAL}
4872 };
4873 
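/*
 * For instance, to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
 * IOMMU_INV_GRANU_ADDR) yields QI_GRAN_PSI_PASID, i.e. a page-selective-
 * within-PASID IOTLB invalidation.
 */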
4874 static inline int to_vtd_granularity(int type, int granu)
4875 {
4876 	return inv_type_granu_table[type][granu];
4877 }
4878 
4879 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4880 {
4881 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4882 
4883 	/* VT-d encodes the size as 2^size 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
4884 	 * The IOMMU cache invalidate API passes granu_size in bytes plus the
4885 	 * number of granules of that size that are contiguous in memory.
4886 	 */
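	/*
	 * E.g. granu_size = SZ_4K and nr_granules = 512 gives nr_pages = 512,
	 * and order_base_2(512) = 9, i.e. a 2MiB invalidation range.
	 */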
4887 	return order_base_2(nr_pages);
4888 }
4889 
4890 static int
4891 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4892 			   struct iommu_cache_invalidate_info *inv_info)
4893 {
4894 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4895 	struct device_domain_info *info;
4896 	struct intel_iommu *iommu;
4897 	unsigned long flags;
4898 	int cache_type;
4899 	u8 bus, devfn;
4900 	u16 did, sid;
4901 	int ret = 0;
4902 	u64 size = 0;
4903 
4904 	if (!inv_info || !dmar_domain)
4905 		return -EINVAL;
4906 
4907 	if (!dev || !dev_is_pci(dev))
4908 		return -ENODEV;
4909 
4910 	iommu = device_to_iommu(dev, &bus, &devfn);
4911 	if (!iommu)
4912 		return -ENODEV;
4913 
4914 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4915 		return -EINVAL;
4916 
4917 	spin_lock_irqsave(&device_domain_lock, flags);
4918 	spin_lock(&iommu->lock);
4919 	info = get_domain_info(dev);
4920 	if (!info) {
4921 		ret = -EINVAL;
4922 		goto out_unlock;
4923 	}
4924 	did = dmar_domain->iommu_did[iommu->seq_id];
4925 	sid = PCI_DEVID(bus, devfn);
4926 
4927 	/* Size is only valid in address selective invalidation */
4928 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4929 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4930 				   inv_info->granu.addr_info.nb_granules);
4931 
4932 	for_each_set_bit(cache_type,
4933 			 (unsigned long *)&inv_info->cache,
4934 			 IOMMU_CACHE_INV_TYPE_NR) {
4935 		int granu = 0;
4936 		u64 pasid = 0;
4937 		u64 addr = 0;
4938 
4939 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4940 		if (granu == -EINVAL) {
4941 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4942 					   cache_type, inv_info->granularity);
4943 			break;
4944 		}
4945 
4946 		/*
4947 		 * PASID is stored in different locations based on the
4948 		 * granularity.
4949 		 */
4950 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4951 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4952 			pasid = inv_info->granu.pasid_info.pasid;
4953 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4954 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4955 			pasid = inv_info->granu.addr_info.pasid;
4956 
4957 		switch (BIT(cache_type)) {
4958 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4959 			/* HW will ignore LSB bits based on address mask */
4960 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4961 			    size &&
4962 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4963 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4964 						   inv_info->granu.addr_info.addr, size);
4965 			}
4966 
4967 			/*
4968 			 * If granu is PASID-selective, address is ignored.
4969 			 * We use npages = -1 to indicate that.
4970 			 */
4971 			qi_flush_piotlb(iommu, did, pasid,
4972 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4973 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4974 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4975 
4976 			if (!info->ats_enabled)
4977 				break;
4978 			/*
4979 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4980 			 * in the guest may assume IOTLB flush is inclusive,
4981 			 * which is more efficient.
4982 			 */
4983 			fallthrough;
4984 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4985 			/*
4986 			 * PASID based device TLB invalidation does not support
4987 			 * IOMMU_INV_GRANU_PASID granularity but only supports
4988 			 * IOMMU_INV_GRANU_ADDR.
4989 			 * The equivalent is to make the size cover the entire
4990 			 * 64-bit address range. The user only provides PASID info
4991 			 * without address info, so we set addr to 0.
4992 			 */
4993 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4994 				size = 64 - VTD_PAGE_SHIFT;
4995 				addr = 0;
4996 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4997 				addr = inv_info->granu.addr_info.addr;
4998 			}
4999 
5000 			if (info->ats_enabled)
5001 				qi_flush_dev_iotlb_pasid(iommu, sid,
5002 						info->pfsid, pasid,
5003 						info->ats_qdep, addr,
5004 						size);
5005 			else
5006 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5007 			break;
5008 		default:
5009 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5010 					    cache_type);
5011 			ret = -EINVAL;
5012 		}
5013 	}
5014 out_unlock:
5015 	spin_unlock(&iommu->lock);
5016 	spin_unlock_irqrestore(&device_domain_lock, flags);
5017 
5018 	return ret;
5019 }
5020 #endif
5021 
5022 static int intel_iommu_map(struct iommu_domain *domain,
5023 			   unsigned long iova, phys_addr_t hpa,
5024 			   size_t size, int iommu_prot, gfp_t gfp)
5025 {
5026 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5027 	u64 max_addr;
5028 	int prot = 0;
5029 
5030 	if (iommu_prot & IOMMU_READ)
5031 		prot |= DMA_PTE_READ;
5032 	if (iommu_prot & IOMMU_WRITE)
5033 		prot |= DMA_PTE_WRITE;
5034 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5035 		prot |= DMA_PTE_SNP;
5036 
5037 	max_addr = iova + size;
5038 	if (dmar_domain->max_addr < max_addr) {
5039 		u64 end;
5040 
5041 		/* check if minimum agaw is sufficient for mapped address */
5042 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5043 		if (end < max_addr) {
5044 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5045 			       __func__, dmar_domain->gaw,
5046 			       max_addr);
5047 			return -EFAULT;
5048 		}
5049 		dmar_domain->max_addr = max_addr;
5050 	}
5051 	/* Round up size to next multiple of PAGE_SIZE, if it and
5052 	   the low bits of hpa would take us onto the next page */
5053 	size = aligned_nrpages(hpa, size);
5054 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5055 				hpa >> VTD_PAGE_SHIFT, size, prot);
5056 }
5057 
5058 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5059 				unsigned long iova, size_t size,
5060 				struct iommu_iotlb_gather *gather)
5061 {
5062 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5063 	unsigned long start_pfn, last_pfn;
5064 	int level = 0;
5065 
5066 	/* Cope with horrid API which requires us to unmap more than the
5067 	   size argument if it happens to be a large-page mapping. */
5068 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5069 
5070 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5071 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5072 
5073 	start_pfn = iova >> VTD_PAGE_SHIFT;
5074 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5075 
5076 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5077 					last_pfn, gather->freelist);
5078 
5079 	if (dmar_domain->max_addr == iova + size)
5080 		dmar_domain->max_addr = iova;
5081 
5082 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5083 
5084 	return size;
5085 }
5086 
5087 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5088 				 struct iommu_iotlb_gather *gather)
5089 {
5090 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5091 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5092 	size_t size = gather->end - gather->start;
5093 	unsigned long start_pfn;
5094 	unsigned long nrpages;
5095 	int iommu_id;
5096 
5097 	nrpages = aligned_nrpages(gather->start, size);
5098 	start_pfn = mm_to_dma_pfn(iova_pfn);
5099 
5100 	for_each_domain_iommu(iommu_id, dmar_domain)
5101 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5102 				      start_pfn, nrpages, !gather->freelist, 0);
5103 
5104 	dma_free_pagelist(gather->freelist);
5105 }
5106 
5107 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5108 					    dma_addr_t iova)
5109 {
5110 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5111 	struct dma_pte *pte;
5112 	int level = 0;
5113 	u64 phys = 0;
5114 
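	/*
	 * Walk down to the leaf PTE (which may be a superpage) and add back
	 * the offset within that page; for a 2MiB superpage this is the low
	 * 21 bits of the IOVA.
	 */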
5115 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5116 	if (pte && dma_pte_present(pte))
5117 		phys = dma_pte_addr(pte) +
5118 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5119 						VTD_PAGE_SHIFT) - 1));
5120 
5121 	return phys;
5122 }
5123 
5124 static bool intel_iommu_capable(enum iommu_cap cap)
5125 {
5126 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5127 		return domain_update_iommu_snooping(NULL);
5128 	if (cap == IOMMU_CAP_INTR_REMAP)
5129 		return irq_remapping_enabled == 1;
5130 
5131 	return false;
5132 }
5133 
5134 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5135 {
5136 	struct intel_iommu *iommu;
5137 
5138 	iommu = device_to_iommu(dev, NULL, NULL);
5139 	if (!iommu)
5140 		return ERR_PTR(-ENODEV);
5141 
5142 	if (translation_pre_enabled(iommu))
5143 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5144 
5145 	return &iommu->iommu;
5146 }
5147 
5148 static void intel_iommu_release_device(struct device *dev)
5149 {
5150 	struct intel_iommu *iommu;
5151 
5152 	iommu = device_to_iommu(dev, NULL, NULL);
5153 	if (!iommu)
5154 		return;
5155 
5156 	dmar_remove_one_dev_info(dev);
5157 
5158 	set_dma_ops(dev, NULL);
5159 }
5160 
5161 static void intel_iommu_probe_finalize(struct device *dev)
5162 {
5163 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5164 
5165 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5166 		iommu_setup_dma_ops(dev, 0, U64_MAX);
5167 	else
5168 		set_dma_ops(dev, NULL);
5169 }
5170 
5171 static void intel_iommu_get_resv_regions(struct device *device,
5172 					 struct list_head *head)
5173 {
5174 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5175 	struct iommu_resv_region *reg;
5176 	struct dmar_rmrr_unit *rmrr;
5177 	struct device *i_dev;
5178 	int i;
5179 
5180 	down_read(&dmar_global_lock);
5181 	for_each_rmrr_units(rmrr) {
5182 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5183 					  i, i_dev) {
5184 			struct iommu_resv_region *resv;
5185 			enum iommu_resv_type type;
5186 			size_t length;
5187 
5188 			if (i_dev != device &&
5189 			    !is_downstream_to_pci_bridge(device, i_dev))
5190 				continue;
5191 
5192 			length = rmrr->end_address - rmrr->base_address + 1;
5193 
5194 			type = device_rmrr_is_relaxable(device) ?
5195 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5196 
5197 			resv = iommu_alloc_resv_region(rmrr->base_address,
5198 						       length, prot, type);
5199 			if (!resv)
5200 				break;
5201 
5202 			list_add_tail(&resv->list, head);
5203 		}
5204 	}
5205 	up_read(&dmar_global_lock);
5206 
5207 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5208 	if (dev_is_pci(device)) {
5209 		struct pci_dev *pdev = to_pci_dev(device);
5210 
5211 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5212 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5213 						   IOMMU_RESV_DIRECT_RELAXABLE);
5214 			if (reg)
5215 				list_add_tail(&reg->list, head);
5216 		}
5217 	}
5218 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5219 
5220 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5221 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5222 				      0, IOMMU_RESV_MSI);
5223 	if (!reg)
5224 		return;
5225 	list_add_tail(&reg->list, head);
5226 }
5227 
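/*
 * Turn on PASID support for @dev: set CONTEXT_PASIDE in its context entry,
 * flushing the context cache if the bit was previously clear, and enable
 * the device's PASID capability if it was not enabled already.
 */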
5228 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5229 {
5230 	struct device_domain_info *info;
5231 	struct context_entry *context;
5232 	struct dmar_domain *domain;
5233 	unsigned long flags;
5234 	u64 ctx_lo;
5235 	int ret;
5236 
5237 	domain = find_domain(dev);
5238 	if (!domain)
5239 		return -EINVAL;
5240 
5241 	spin_lock_irqsave(&device_domain_lock, flags);
5242 	spin_lock(&iommu->lock);
5243 
5244 	ret = -EINVAL;
5245 	info = get_domain_info(dev);
5246 	if (!info || !info->pasid_supported)
5247 		goto out;
5248 
5249 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5250 	if (WARN_ON(!context))
5251 		goto out;
5252 
5253 	ctx_lo = context[0].lo;
5254 
5255 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5256 		ctx_lo |= CONTEXT_PASIDE;
5257 		context[0].lo = ctx_lo;
5258 		wmb();
5259 		iommu->flush.flush_context(iommu,
5260 					   domain->iommu_did[iommu->seq_id],
5261 					   PCI_DEVID(info->bus, info->devfn),
5262 					   DMA_CCMD_MASK_NOBIT,
5263 					   DMA_CCMD_DEVICE_INVL);
5264 	}
5265 
5266 	/* Enable PASID support in the device, if it wasn't already */
5267 	if (!info->pasid_enabled)
5268 		iommu_enable_dev_iotlb(info);
5269 
5270 	ret = 0;
5271 
5272  out:
5273 	spin_unlock(&iommu->lock);
5274 	spin_unlock_irqrestore(&device_domain_lock, flags);
5275 
5276 	return ret;
5277 }
5278 
5279 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5280 {
5281 	if (dev_is_pci(dev))
5282 		return pci_device_group(dev);
5283 	return generic_device_group(dev);
5284 }
5285 
5286 static int intel_iommu_enable_auxd(struct device *dev)
5287 {
5288 	struct device_domain_info *info;
5289 	struct intel_iommu *iommu;
5290 	unsigned long flags;
5291 	int ret;
5292 
5293 	iommu = device_to_iommu(dev, NULL, NULL);
5294 	if (!iommu || dmar_disabled)
5295 		return -EINVAL;
5296 
5297 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5298 		return -EINVAL;
5299 
5300 	ret = intel_iommu_enable_pasid(iommu, dev);
5301 	if (ret)
5302 		return -ENODEV;
5303 
5304 	spin_lock_irqsave(&device_domain_lock, flags);
5305 	info = get_domain_info(dev);
5306 	info->auxd_enabled = 1;
5307 	spin_unlock_irqrestore(&device_domain_lock, flags);
5308 
5309 	return 0;
5310 }
5311 
5312 static int intel_iommu_disable_auxd(struct device *dev)
5313 {
5314 	struct device_domain_info *info;
5315 	unsigned long flags;
5316 
5317 	spin_lock_irqsave(&device_domain_lock, flags);
5318 	info = get_domain_info(dev);
5319 	if (!WARN_ON(!info))
5320 		info->auxd_enabled = 0;
5321 	spin_unlock_irqrestore(&device_domain_lock, flags);
5322 
5323 	return 0;
5324 }
5325 
5326 static int intel_iommu_enable_sva(struct device *dev)
5327 {
5328 	struct device_domain_info *info = get_domain_info(dev);
5329 	struct intel_iommu *iommu;
5330 	int ret;
5331 
5332 	if (!info || dmar_disabled)
5333 		return -EINVAL;
5334 
5335 	iommu = info->iommu;
5336 	if (!iommu)
5337 		return -EINVAL;
5338 
5339 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5340 		return -ENODEV;
5341 
5342 	if (intel_iommu_enable_pasid(iommu, dev))
5343 		return -ENODEV;
5344 
5345 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5346 		return -EINVAL;
5347 
5348 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5349 	if (!ret)
5350 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5351 
5352 	return ret;
5353 }
5354 
5355 static int intel_iommu_disable_sva(struct device *dev)
5356 {
5357 	struct device_domain_info *info = get_domain_info(dev);
5358 	struct intel_iommu *iommu = info->iommu;
5359 	int ret;
5360 
5361 	ret = iommu_unregister_device_fault_handler(dev);
5362 	if (!ret)
5363 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5364 
5365 	return ret;
5366 }
5367 
5368 /*
5369  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5370  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5371  * spec so that system software and tools can detect endpoint devices that
5372  * support Intel Scalable I/O Virtualization without a host driver dependency.
5373  *
5374  * Returns the address of the matching extended capability structure within
5375  * the device's PCI configuration space or 0 if the device does not support
5376  * it.
5377  */
5378 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5379 {
5380 	int pos;
5381 	u16 vendor, id;
5382 
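	/* 0x23 is the PCI Express DVSEC capability ID; DVSEC ID 5 is Scalable IOV. */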
5383 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5384 	while (pos) {
5385 		pci_read_config_word(pdev, pos + 4, &vendor);
5386 		pci_read_config_word(pdev, pos + 8, &id);
5387 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5388 			return pos;
5389 
5390 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5391 	}
5392 
5393 	return 0;
5394 }
5395 
5396 static bool
5397 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5398 {
5399 	struct device_domain_info *info = get_domain_info(dev);
5400 
5401 	if (feat == IOMMU_DEV_FEAT_AUX) {
5402 		int ret;
5403 
5404 		if (!dev_is_pci(dev) || dmar_disabled ||
5405 		    !scalable_mode_support() || !pasid_mode_support())
5406 			return false;
5407 
5408 		ret = pci_pasid_features(to_pci_dev(dev));
5409 		if (ret < 0)
5410 			return false;
5411 
5412 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5413 	}
5414 
5415 	if (feat == IOMMU_DEV_FEAT_IOPF)
5416 		return info && info->pri_supported;
5417 
5418 	if (feat == IOMMU_DEV_FEAT_SVA)
5419 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5420 			info->pasid_supported && info->pri_supported &&
5421 			info->ats_supported;
5422 
5423 	return false;
5424 }
5425 
5426 static int
5427 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5428 {
5429 	switch (feat) {
5430 	case IOMMU_DEV_FEAT_AUX:
5431 		return intel_iommu_enable_auxd(dev);
5432 
5433 	case IOMMU_DEV_FEAT_IOPF:
5434 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5435 
5436 	case IOMMU_DEV_FEAT_SVA:
5437 		return intel_iommu_enable_sva(dev);
5438 
5439 	default:
5440 		return -ENODEV;
5441 	}
5442 }
5443 
5444 static int
5445 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5446 {
5447 	switch (feat) {
5448 	case IOMMU_DEV_FEAT_AUX:
5449 		return intel_iommu_disable_auxd(dev);
5450 
5451 	case IOMMU_DEV_FEAT_IOPF:
5452 		return 0;
5453 
5454 	case IOMMU_DEV_FEAT_SVA:
5455 		return intel_iommu_disable_sva(dev);
5456 
5457 	default:
5458 		return -ENODEV;
5459 	}
5460 }
5461 
5462 static bool
5463 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5464 {
5465 	struct device_domain_info *info = get_domain_info(dev);
5466 
5467 	if (feat == IOMMU_DEV_FEAT_AUX)
5468 		return scalable_mode_support() && info && info->auxd_enabled;
5469 
5470 	return false;
5471 }
5472 
5473 static int
5474 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5475 {
5476 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5477 
5478 	return dmar_domain->default_pasid > 0 ?
5479 			dmar_domain->default_pasid : -EINVAL;
5480 }
5481 
5482 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5483 					   struct device *dev)
5484 {
5485 	return attach_deferred(dev);
5486 }
5487 
5488 static int
5489 intel_iommu_enable_nesting(struct iommu_domain *domain)
5490 {
5491 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5492 	unsigned long flags;
5493 	int ret = -ENODEV;
5494 
5495 	spin_lock_irqsave(&device_domain_lock, flags);
5496 	if (list_empty(&dmar_domain->devices)) {
5497 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5498 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5499 		ret = 0;
5500 	}
5501 	spin_unlock_irqrestore(&device_domain_lock, flags);
5502 
5503 	return ret;
5504 }
5505 
5506 /*
5507  * Check that the device does not live on an external facing PCI port that is
5508  * marked as untrusted. Such devices should not be able to apply quirks and
5509  * thus not be able to bypass the IOMMU restrictions.
5510  */
5511 static bool risky_device(struct pci_dev *pdev)
5512 {
5513 	if (pdev->untrusted) {
5514 		pci_info(pdev,
5515 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5516 			 pdev->vendor, pdev->device);
5517 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5518 		return true;
5519 	}
5520 	return false;
5521 }
5522 
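/*
 * Flush the CPU cache lines that hold the page-table entries covering
 * [clf_pfn, clf_pfn + clf_pages); used by iotlb_sync_map when the IOMMU
 * is not cache-coherent.
 */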
5523 static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
5524 			     unsigned long clf_pages)
5525 {
5526 	struct dma_pte *first_pte = NULL, *pte = NULL;
5527 	unsigned long lvl_pages = 0;
5528 	int level = 0;
5529 
5530 	while (clf_pages > 0) {
5531 		if (!pte) {
5532 			level = 0;
5533 			pte = pfn_to_dma_pte(domain, clf_pfn, &level);
5534 			if (WARN_ON(!pte))
5535 				return;
5536 			first_pte = pte;
5537 			lvl_pages = lvl_to_nr_pages(level);
5538 		}
5539 
5540 		if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
5541 			return;
5542 
5543 		clf_pages -= lvl_pages;
5544 		clf_pfn += lvl_pages;
5545 		pte++;
5546 
5547 		if (!clf_pages || first_pte_in_page(pte) ||
5548 		    (level > 1 && clf_pages < lvl_pages)) {
5549 			domain_flush_cache(domain, first_pte,
5550 					   (void *)pte - (void *)first_pte);
5551 			pte = NULL;
5552 		}
5553 	}
5554 }
5555 
5556 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5557 				       unsigned long iova, size_t size)
5558 {
5559 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5560 	unsigned long pages = aligned_nrpages(iova, size);
5561 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5562 	struct intel_iommu *iommu;
5563 	int iommu_id;
5564 
5565 	if (!dmar_domain->iommu_coherency)
5566 		clflush_sync_map(dmar_domain, pfn, pages);
5567 
5568 	for_each_domain_iommu(iommu_id, dmar_domain) {
5569 		iommu = g_iommus[iommu_id];
5570 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5571 	}
5572 }
5573 
5574 const struct iommu_ops intel_iommu_ops = {
5575 	.capable		= intel_iommu_capable,
5576 	.domain_alloc		= intel_iommu_domain_alloc,
5577 	.domain_free		= intel_iommu_domain_free,
5578 	.enable_nesting		= intel_iommu_enable_nesting,
5579 	.attach_dev		= intel_iommu_attach_device,
5580 	.detach_dev		= intel_iommu_detach_device,
5581 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5582 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5583 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5584 	.map			= intel_iommu_map,
5585 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5586 	.unmap			= intel_iommu_unmap,
5587 	.flush_iotlb_all        = intel_flush_iotlb_all,
5588 	.iotlb_sync		= intel_iommu_tlb_sync,
5589 	.iova_to_phys		= intel_iommu_iova_to_phys,
5590 	.probe_device		= intel_iommu_probe_device,
5591 	.probe_finalize		= intel_iommu_probe_finalize,
5592 	.release_device		= intel_iommu_release_device,
5593 	.get_resv_regions	= intel_iommu_get_resv_regions,
5594 	.put_resv_regions	= generic_iommu_put_resv_regions,
5595 	.device_group		= intel_iommu_device_group,
5596 	.dev_has_feat		= intel_iommu_dev_has_feat,
5597 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5598 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5599 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5600 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5601 	.def_domain_type	= device_def_domain_type,
5602 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5603 #ifdef CONFIG_INTEL_IOMMU_SVM
5604 	.cache_invalidate	= intel_iommu_sva_invalidate,
5605 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5606 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5607 	.sva_bind		= intel_svm_bind,
5608 	.sva_unbind		= intel_svm_unbind,
5609 	.sva_get_pasid		= intel_svm_get_pasid,
5610 	.page_response		= intel_svm_page_response,
5611 #endif
5612 };
5613 
5614 static void quirk_iommu_igfx(struct pci_dev *dev)
5615 {
5616 	if (risky_device(dev))
5617 		return;
5618 
5619 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5620 	dmar_map_gfx = 0;
5621 }
5622 
5623 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5624 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5625 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5631 
5632 /* Broadwell igfx malfunctions with dmar */
5633 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5634 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5657 
5658 static void quirk_iommu_rwbf(struct pci_dev *dev)
5659 {
5660 	if (risky_device(dev))
5661 		return;
5662 
5663 	/*
5664 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5665 	 * but needs it. Same seems to hold for the desktop versions.
5666 	 */
5667 	pci_info(dev, "Forcing write-buffer flush capability\n");
5668 	rwbf_quirk = 1;
5669 }
5670 
5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5678 
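/*
 * GGC is the graphics control word in the Ironlake-era host bridge
 * config space; bits 11:8 encode how much stolen memory the BIOS
 * reserved for the GTT and whether a VT-d (shadow GTT) variant was
 * selected.
 */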
5679 #define GGC 0x52
5680 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5681 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5682 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5683 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5684 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5685 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5686 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5687 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5688 
5689 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5690 {
5691 	unsigned short ggc;
5692 
5693 	if (risky_device(dev))
5694 		return;
5695 
5696 	if (pci_read_config_word(dev, GGC, &ggc))
5697 		return;
5698 
5699 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5700 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5701 		dmar_map_gfx = 0;
5702 	} else if (dmar_map_gfx) {
5703 		/* we have to ensure the gfx device is idle before we flush */
5704 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5705 		intel_iommu_strict = 1;
5706 	}
5707 }
5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5712 
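/*
 * Some integrated graphics devices need DMA translation left enabled
 * when VT-d is torn down; flag them so the TE bit of their IOMMU is
 * not cleared (see iommu_skip_te_disable).
 */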
5713 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5714 {
5715 	unsigned short ver;
5716 
5717 	if (!IS_GFX_DEVICE(dev))
5718 		return;
5719 
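	/*
	 * The upper byte of the PCI device ID roughly identifies the
	 * graphics generation; only the generations listed below are
	 * affected.
	 */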
5720 	ver = (dev->device >> 8) & 0xff;
5721 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5722 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5723 	    ver != 0x9a)
5724 		return;
5725 
5726 	if (risky_device(dev))
5727 		return;
5728 
5729 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5730 	iommu_skip_te_disable = 1;
5731 }
5732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5733 
5734 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5735    ISOCH DMAR unit for the Azalia sound device, but not give it any
5736    TLB entries, which causes it to deadlock. Check for that.  We do
5737    this in a function called from init_dmars(), instead of in a PCI
5738    quirk, because we don't want to print the obnoxious "BIOS broken"
5739    message if VT-d is actually disabled.
5740 */
5741 static void __init check_tylersburg_isoch(void)
5742 {
5743 	struct pci_dev *pdev;
5744 	uint32_t vtisochctrl;
5745 
5746 	/* If there's no Azalia in the system anyway, forget it. */
5747 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5748 	if (!pdev)
5749 		return;
5750 
5751 	if (risky_device(pdev)) {
5752 		pci_dev_put(pdev);
5753 		return;
5754 	}
5755 
5756 	pci_dev_put(pdev);
5757 
5758 	/* System Management Registers. Might be hidden, in which case
5759 	   we can't do the sanity check. But that's OK, because the
5760 	   known-broken BIOSes _don't_ actually hide it, so far. */
5761 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5762 	if (!pdev)
5763 		return;
5764 
5765 	if (risky_device(pdev)) {
5766 		pci_dev_put(pdev);
5767 		return;
5768 	}
5769 
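	/* The isochronous VT-d control settings live at config offset 0x188. */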
5770 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5771 		pci_dev_put(pdev);
5772 		return;
5773 	}
5774 
5775 	pci_dev_put(pdev);
5776 
5777 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5778 	if (vtisochctrl & 1)
5779 		return;
5780 
5781 	/* Drop all bits other than the number of TLB entries */
5782 	vtisochctrl &= 0x1c;
5783 
5784 	/* If we have the recommended number of TLB entries (16), fine. */
5785 	if (vtisochctrl == 0x10)
5786 		return;
5787 
5788 	/* Zero TLB entries? The BIOS is broken; identity-map Azalia as a workaround. */
5789 	if (!vtisochctrl) {
5790 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5791 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5792 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5793 		     dmi_get_system_info(DMI_BIOS_VERSION),
5794 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5795 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5796 		return;
5797 	}
5798 
5799 	pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5800 	       vtisochctrl);
5801 }
5802