xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 28dce2c4)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/* Format wrapper that prepends "DMAR: " to a message format string. */
13 #define pr_fmt(fmt)     "DMAR: " fmt
/* Device-message formats reuse the same "DMAR: " prefix via pr_fmt(). */
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 
48 #include "../irq_remapping.h"
49 #include "pasid.h"
50 #include "cap_audit.h"
51 
/* Root and context tables are each sized as exactly one VT-d page. */
52 #define ROOT_SIZE		VTD_PAGE_SIZE
53 #define CONTEXT_SIZE		VTD_PAGE_SIZE
54 
/*
 * PCI device classification helpers.  Each takes an expression that
 * evaluates to a pointer with 'class', 'vendor' and 'device' members
 * (a struct pci_dev pointer at the call sites).  The argument is always
 * parenthesized so compound expressions such as "p + 1" expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 
/* Inclusive bounds of the IOAPIC address window (0xfee00000-0xfeefffff). */
60 #define IOAPIC_RANGE_START	(0xfee00000)
61 #define IOAPIC_RANGE_END	(0xfeefffff)
/* NOTE(review): presumably the lowest IOVA handed out; no user is visible in this part of the file. */
62 #define IOVA_START_ADDR		(0x1000)
63 
/* Address width (in bits) that iommu_calculate_agaw() asks for by default. */
64 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 
/* Upper bound on the address width, and the same bound expressed in PFN bits. */
66 #define MAX_AGAW_WIDTH 64
67 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 
/*
 * Largest PFN / largest address representable with a guest address width
 * of 'gaw' bits, computed in 64-bit arithmetic.
 */
69 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
70 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
71 
72 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
73    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
/* DOMAIN_MAX_PFN clamps __DOMAIN_MAX_PFN(gaw) to the largest unsigned long. */
74 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
75 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
/* NOTE(review): built from the unclamped __DOMAIN_MAX_PFN(), not DOMAIN_MAX_PFN() - confirm this is intended. */
76 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77 
78 /* IO virtual address start page frame number */
79 #define IOVA_START_PFN		(1)
80 
/* Convert an address to a page frame number using the MM page size (PAGE_SHIFT). */
81 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
82 
83 /* page table handling */
/* Each page-table level resolves LEVEL_STRIDE (9) bits of the PFN. */
84 #define LEVEL_STRIDE		(9)
/* Mask selecting one level's index (the low LEVEL_STRIDE bits). */
85 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
86 
87 /*
88  * This bitmap is used to advertise the page sizes our hardware supports
89  * to the IOMMU core, which will then use this information to split
90  * physically contiguous memory regions it is mapping into page sizes
91  * that we support.
92  *
93  * Traditionally the IOMMU core just handed us the mappings directly,
94  * after making sure the size is an order of a 4KiB page and that the
95  * mapping has natural alignment.
96  *
97  * To retain this behavior, we currently advertise that we support
98  * all page sizes that are an order of 4KiB.
99  *
100  * If at some point we'd like to utilize the IOMMU core's new behavior,
101  * we could change this to advertise the real page sizes we support.
 *
 * The value has every bit from bit 12 upward set (~0xFFF), i.e. one bit
 * per power-of-two size starting at 4KiB.
102  */
103 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
104 
/* Map an AGAW index to a page-table depth: the depth is the index plus two. */
static inline int agaw_to_level(int agaw)
{
	const int levels_at_agaw_zero = 2;

	return levels_at_agaw_zero + agaw;
}
109 
110 static inline int agaw_to_width(int agaw)
111 {
112 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 }
114 
115 static inline int width_to_agaw(int width)
116 {
117 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
118 }
119 
120 static inline unsigned int level_to_offset_bits(int level)
121 {
122 	return (level - 1) * LEVEL_STRIDE;
123 }
124 
125 static inline int pfn_level_offset(u64 pfn, int level)
126 {
127 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
128 }
129 
130 static inline u64 level_mask(int level)
131 {
132 	return -1ULL << level_to_offset_bits(level);
133 }
134 
135 static inline u64 level_size(int level)
136 {
137 	return 1ULL << level_to_offset_bits(level);
138 }
139 
140 static inline u64 align_to_level(u64 pfn, int level)
141 {
142 	return (pfn + level_size(level) - 1) & level_mask(level);
143 }
144 
145 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 {
147 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
148 }
149 
150 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
151    are never going to work. */
152 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 {
154 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 
157 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 {
159 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
/* VT-d page frame number of the memory described by @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	const unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *backing = virt_to_page(p);

	return page_to_dma_pfn(backing);
}
169 
170 /* global iommu list, set NULL for ignored DMAR units */
171 static struct intel_iommu **g_iommus;
172 
173 static void __init check_tylersburg_isoch(void);
/* NOTE(review): no user of rwbf_quirk is visible in this part of the file. */
174 static int rwbf_quirk;
175 
176 /*
177  * set to 1 to panic kernel if can't successfully enable VT-d
178  * (used when kernel is launched w/ TXT)
179  */
180 static int force_on = 0;
/* Set to 1 by the "tboot_noforce" sub-option of intel_iommu=. */
181 static int intel_iommu_tboot_noforce;
/* Set to 1 by the "off" sub-option of intel_iommu=. */
182 static int no_platform_optin;
183 
/* Number of struct root_entry slots that fit in one VT-d page. */
184 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
185 
186 /*
187  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
188  * if marked present.
189  */
190 static phys_addr_t root_entry_lctp(struct root_entry *re)
191 {
192 	if (!(re->lo & 1))
193 		return 0;
194 
195 	return re->lo & VTD_PAGE_MASK;
196 }
197 
198 /*
199  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_uctp(struct root_entry *re)
203 {
204 	if (!(re->hi & 1))
205 		return 0;
206 
207 	return re->hi & VTD_PAGE_MASK;
208 }
209 
210 static inline void context_clear_pasid_enable(struct context_entry *context)
211 {
212 	context->lo &= ~(1ULL << 11);
213 }
214 
215 static inline bool context_pasid_enabled(struct context_entry *context)
216 {
217 	return !!(context->lo & (1ULL << 11));
218 }
219 
220 static inline void context_set_copied(struct context_entry *context)
221 {
222 	context->hi |= (1ull << 3);
223 }
224 
225 static inline bool context_copied(struct context_entry *context)
226 {
227 	return !!(context->hi & (1ULL << 3));
228 }
229 
230 static inline bool __context_present(struct context_entry *context)
231 {
232 	return (context->lo & 1);
233 }
234 
235 bool context_present(struct context_entry *context)
236 {
237 	return context_pasid_enabled(context) ?
238 	     __context_present(context) :
239 	     __context_present(context) && !context_copied(context);
240 }
241 
242 static inline void context_set_present(struct context_entry *context)
243 {
244 	context->lo |= 1;
245 }
246 
247 static inline void context_set_fault_enable(struct context_entry *context)
248 {
249 	context->lo &= (((u64)-1) << 2) | 1;
250 }
251 
252 static inline void context_set_translation_type(struct context_entry *context,
253 						unsigned long value)
254 {
255 	context->lo &= (((u64)-1) << 4) | 3;
256 	context->lo |= (value & 3) << 2;
257 }
258 
259 static inline void context_set_address_root(struct context_entry *context,
260 					    unsigned long value)
261 {
262 	context->lo &= ~VTD_PAGE_MASK;
263 	context->lo |= value & VTD_PAGE_MASK;
264 }
265 
266 static inline void context_set_address_width(struct context_entry *context,
267 					     unsigned long value)
268 {
269 	context->hi |= value & 7;
270 }
271 
272 static inline void context_set_domain_id(struct context_entry *context,
273 					 unsigned long value)
274 {
275 	context->hi |= (value & ((1 << 16) - 1)) << 8;
276 }
277 
278 static inline int context_domain_id(struct context_entry *c)
279 {
280 	return((c->hi >> 8) & 0xffff);
281 }
282 
283 static inline void context_clear_entry(struct context_entry *context)
284 {
285 	context->lo = 0;
286 	context->hi = 0;
287 }
288 
/*
 * This domain is a static identity-mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Iterate @idx over every IOMMU index in [0, g_num_of_iommus) for which
 * @domain holds a non-zero reference count.  The statement that follows
 * the macro is the loop body.  Both arguments are parenthesized so that
 * compound expressions expand correctly.
 */
#define for_each_domain_iommu(idx, domain)			\
	for ((idx) = 0; (idx) < g_num_of_iommus; (idx)++)	\
		if ((domain)->iommu_refcnt[idx])
301 
/*
 * One reserved-memory (RMRR) unit: an address range plus the devices it
 * applies to.  Units are linked on dmar_rmrr_units through 'list'.
 */
302 struct dmar_rmrr_unit {
303 	struct list_head list;		/* list of rmrr units	*/
304 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
305 	u64	base_address;		/* reserved base address*/
306 	u64	end_address;		/* reserved end address */
307 	struct dmar_dev_scope *devices;	/* target devices */
308 	int	devices_cnt;		/* target device count */
309 };
310 
/*
 * One ATSR unit and the devices in its scope.
 * NOTE(review): presumably linked on dmar_atsr_units through 'list'; no list user is visible here.
 */
311 struct dmar_atsr_unit {
312 	struct list_head list;		/* list of ATSR units */
313 	struct acpi_dmar_header *hdr;	/* ACPI header */
314 	struct dmar_dev_scope *devices;	/* target devices */
315 	int devices_cnt;		/* target device count */
316 	u8 include_all:1;		/* include all ports */
317 };
318 
/*
 * One SATC unit: the devices in its scope and the IOMMU they belong to.
 * NOTE(review): presumably linked on dmar_satc_units through 'list'; no list user is visible here.
 */
319 struct dmar_satc_unit {
320 	struct list_head list;		/* list of SATC units */
321 	struct acpi_dmar_header *hdr;	/* ACPI header */
322 	struct dmar_dev_scope *devices;	/* target devices */
323 	struct intel_iommu *iommu;	/* the corresponding iommu */
324 	int devices_cnt;		/* target device count */
325 	u8 atc_required:1;		/* ATS is required */
326 };
327 
/* File-scope list heads for the ATSR, RMRR and SATC units. */
328 static LIST_HEAD(dmar_atsr_units);
329 static LIST_HEAD(dmar_rmrr_units);
330 static LIST_HEAD(dmar_satc_units);
331 
/* Iterate @rmrr over every entry on dmar_rmrr_units via its 'list' member. */
332 #define for_each_rmrr_units(rmrr) \
333 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334 
335 /* number of IOMMU units; exclusive upper bound for indexes into g_iommus[] and iommu_refcnt[] */
336 static int g_num_of_iommus;
337 
/* Forward declarations; the definitions are not in this part of the file. */
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int intel_iommu_attach_device(struct iommu_domain *domain,
343 				     struct device *dev);
344 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
345 					    dma_addr_t iova);
346 
/*
 * Non-zero when DMA remapping is disabled.  The build-time default comes
 * from CONFIG_INTEL_IOMMU_DEFAULT_ON; "intel_iommu=on" / "intel_iommu=off"
 * override it at boot.
 */
347 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
348 int dmar_disabled = 0;
349 #else
350 int dmar_disabled = 1;
351 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
352 
/*
 * Non-zero when scalable mode is requested.  The build-time default comes
 * from CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON; "intel_iommu=sm_on"
 * sets it at boot.
 */
353 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
354 int intel_iommu_sm = 1;
355 #else
356 int intel_iommu_sm;
357 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
358 
/* Exported flag, initially 0; nothing in this part of the file writes it. */
359 int intel_iommu_enabled = 0;
360 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
361 
/* Cleared by the "igfx_off" sub-option of intel_iommu=. */
362 static int dmar_map_gfx = 1;
/* Set by the "strict" sub-option of intel_iommu= (disables batched IOTLB flush). */
363 static int intel_iommu_strict;
/* Cleared by the "sp_off" sub-option of intel_iommu=; gates domain_update_iommu_superpage(). */
364 static int intel_iommu_superpage = 1;
/* NOTE(review): presumably a bitmask of the IDENTMAP_* values below - confirm against users. */
365 static int iommu_identity_mapping;
366 static int iommu_skip_te_disable;
367 
368 #define IDENTMAP_GFX		2
369 #define IDENTMAP_AZALIA		4
370 
/* Exported flag; nothing in this part of the file writes it. */
371 int intel_iommu_gfx_mapped;
372 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
373 
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 struct device_domain_info *get_domain_info(struct device *dev)
376 {
377 	struct device_domain_info *info;
378 
379 	if (!dev)
380 		return NULL;
381 
382 	info = dev_iommu_priv_get(dev);
383 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
384 		return NULL;
385 
386 	return info;
387 }
388 
389 DEFINE_SPINLOCK(device_domain_lock);
390 static LIST_HEAD(device_domain_list);
391 
392 /*
393  * Iterate over elements in device_domain_list and call the specified
394  * callback @fn against each element.
395  */
396 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
397 				     void *data), void *data)
398 {
399 	int ret = 0;
400 	unsigned long flags;
401 	struct device_domain_info *info;
402 
403 	spin_lock_irqsave(&device_domain_lock, flags);
404 	list_for_each_entry(info, &device_domain_list, global) {
405 		ret = fn(info, data);
406 		if (ret) {
407 			spin_unlock_irqrestore(&device_domain_lock, flags);
408 			return ret;
409 		}
410 	}
411 	spin_unlock_irqrestore(&device_domain_lock, flags);
412 
413 	return 0;
414 }
415 
/* Declared without an initializer here; NOTE(review): presumably initialized later in the file. */
416 const struct iommu_ops intel_iommu_ops;
417 
418 static bool translation_pre_enabled(struct intel_iommu *iommu)
419 {
420 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
421 }
422 
423 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
424 {
425 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
426 }
427 
428 static void init_translation_status(struct intel_iommu *iommu)
429 {
430 	u32 gsts;
431 
432 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
433 	if (gsts & DMA_GSTS_TES)
434 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
435 }
436 
437 static int __init intel_iommu_setup(char *str)
438 {
439 	if (!str)
440 		return -EINVAL;
441 	while (*str) {
442 		if (!strncmp(str, "on", 2)) {
443 			dmar_disabled = 0;
444 			pr_info("IOMMU enabled\n");
445 		} else if (!strncmp(str, "off", 3)) {
446 			dmar_disabled = 1;
447 			no_platform_optin = 1;
448 			pr_info("IOMMU disabled\n");
449 		} else if (!strncmp(str, "igfx_off", 8)) {
450 			dmar_map_gfx = 0;
451 			pr_info("Disable GFX device mapping\n");
452 		} else if (!strncmp(str, "forcedac", 8)) {
453 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
454 			iommu_dma_forcedac = true;
455 		} else if (!strncmp(str, "strict", 6)) {
456 			pr_info("Disable batched IOTLB flush\n");
457 			intel_iommu_strict = 1;
458 		} else if (!strncmp(str, "sp_off", 6)) {
459 			pr_info("Disable supported super page\n");
460 			intel_iommu_superpage = 0;
461 		} else if (!strncmp(str, "sm_on", 5)) {
462 			pr_info("Intel-IOMMU: scalable mode supported\n");
463 			intel_iommu_sm = 1;
464 		} else if (!strncmp(str, "tboot_noforce", 13)) {
465 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466 			intel_iommu_tboot_noforce = 1;
467 		}
468 
469 		str += strcspn(str, ",");
470 		while (*str == ',')
471 			str++;
472 	}
473 	return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
476 
/* Slab caches used by alloc_domain_mem()/free_domain_mem() and alloc_devinfo_mem()/free_devinfo_mem(). */
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479 
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482 	struct dmar_domain **domains;
483 	int idx = did >> 8;
484 
485 	domains = iommu->domains[idx];
486 	if (!domains)
487 		return NULL;
488 
489 	return domains[did & 0xff];
490 }
491 
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 			     struct dmar_domain *domain)
494 {
495 	struct dmar_domain **domains;
496 	int idx = did >> 8;
497 
498 	if (!iommu->domains[idx]) {
499 		size_t size = 256 * sizeof(struct dmar_domain *);
500 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501 	}
502 
503 	domains = iommu->domains[idx];
504 	if (WARN_ON(!domains))
505 		return;
506 	else
507 		domains[did & 0xff] = domain;
508 }
509 
510 void *alloc_pgtable_page(int node)
511 {
512 	struct page *page;
513 	void *vaddr = NULL;
514 
515 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516 	if (page)
517 		vaddr = page_address(page);
518 	return vaddr;
519 }
520 
/* Release a page previously obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	const unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
525 
526 static inline void *alloc_domain_mem(void)
527 {
528 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 }
530 
531 static void free_domain_mem(void *vaddr)
532 {
533 	kmem_cache_free(iommu_domain_cache, vaddr);
534 }
535 
536 static inline void * alloc_devinfo_mem(void)
537 {
538 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 }
540 
541 static inline void free_devinfo_mem(void *vaddr)
542 {
543 	kmem_cache_free(iommu_devinfo_cache, vaddr);
544 }
545 
546 static inline int domain_type_is_si(struct dmar_domain *domain)
547 {
548 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
549 }
550 
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 {
553 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
554 }
555 
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
557 				       unsigned long pfn)
558 {
559 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 
561 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
562 }
563 
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 {
566 	unsigned long sagaw;
567 	int agaw = -1;
568 
569 	sagaw = cap_sagaw(iommu->cap);
570 	for (agaw = width_to_agaw(max_gaw);
571 	     agaw >= 0; agaw--) {
572 		if (test_bit(agaw, &sagaw))
573 			break;
574 	}
575 
576 	return agaw;
577 }
578 
579 /*
580  * Calculate max SAGAW for each iommu.
581  */
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 {
584 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
585 }
586 
587 /*
588  * calculate agaw for each iommu.
589  * "SAGAW" may be different across iommus, use a default agaw, and
590  * get a supported less agaw for iommus that don't support the default agaw.
591  */
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 {
594 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
595 }
596 
597 /* This functionin only returns single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 {
600 	int iommu_id;
601 
602 	/* si_domain and vm domain should not get here. */
603 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
604 		return NULL;
605 
606 	for_each_domain_iommu(iommu_id, domain)
607 		break;
608 
609 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
610 		return NULL;
611 
612 	return g_iommus[iommu_id];
613 }
614 
615 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 {
617 	return sm_supported(iommu) ?
618 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
619 }
620 
621 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 {
623 	struct dmar_drhd_unit *drhd;
624 	struct intel_iommu *iommu;
625 	bool found = false;
626 	int i;
627 
628 	domain->iommu_coherency = 1;
629 
630 	for_each_domain_iommu(i, domain) {
631 		found = true;
632 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
633 			domain->iommu_coherency = 0;
634 			break;
635 		}
636 	}
637 	if (found)
638 		return;
639 
640 	/* No hardware attached; use lowest common denominator */
641 	rcu_read_lock();
642 	for_each_active_iommu(iommu, drhd) {
643 		if (!iommu_paging_structure_coherency(iommu)) {
644 			domain->iommu_coherency = 0;
645 			break;
646 		}
647 	}
648 	rcu_read_unlock();
649 }
650 
651 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 {
653 	struct dmar_drhd_unit *drhd;
654 	struct intel_iommu *iommu;
655 	int ret = 1;
656 
657 	rcu_read_lock();
658 	for_each_active_iommu(iommu, drhd) {
659 		if (iommu != skip) {
660 			/*
661 			 * If the hardware is operating in the scalable mode,
662 			 * the snooping control is always supported since we
663 			 * always set PASID-table-entry.PGSNP bit if the domain
664 			 * is managed outside (UNMANAGED).
665 			 */
666 			if (!sm_supported(iommu) &&
667 			    !ecap_sc_support(iommu->ecap)) {
668 				ret = 0;
669 				break;
670 			}
671 		}
672 	}
673 	rcu_read_unlock();
674 
675 	return ret;
676 }
677 
678 static int domain_update_iommu_superpage(struct dmar_domain *domain,
679 					 struct intel_iommu *skip)
680 {
681 	struct dmar_drhd_unit *drhd;
682 	struct intel_iommu *iommu;
683 	int mask = 0x3;
684 
685 	if (!intel_iommu_superpage) {
686 		return 0;
687 	}
688 
689 	/* set iommu_superpage to the smallest common denominator */
690 	rcu_read_lock();
691 	for_each_active_iommu(iommu, drhd) {
692 		if (iommu != skip) {
693 			if (domain && domain_use_first_level(domain)) {
694 				if (!cap_fl1gp_support(iommu->cap))
695 					mask = 0x1;
696 			} else {
697 				mask &= cap_super_page_val(iommu->cap);
698 			}
699 
700 			if (!mask)
701 				break;
702 		}
703 	}
704 	rcu_read_unlock();
705 
706 	return fls(mask);
707 }
708 
709 static int domain_update_device_node(struct dmar_domain *domain)
710 {
711 	struct device_domain_info *info;
712 	int nid = NUMA_NO_NODE;
713 
714 	assert_spin_locked(&device_domain_lock);
715 
716 	if (list_empty(&domain->devices))
717 		return NUMA_NO_NODE;
718 
719 	list_for_each_entry(info, &domain->devices, link) {
720 		if (!info->dev)
721 			continue;
722 
723 		/*
724 		 * There could possibly be multiple device numa nodes as devices
725 		 * within the same domain may sit behind different IOMMUs. There
726 		 * isn't perfect answer in such situation, so we select first
727 		 * come first served policy.
728 		 */
729 		nid = dev_to_node(info->dev);
730 		if (nid != NUMA_NO_NODE)
731 			break;
732 	}
733 
734 	return nid;
735 }
736 
737 static void domain_update_iotlb(struct dmar_domain *domain);
738 
739 /* Some capabilities may be different across iommus */
740 static void domain_update_iommu_cap(struct dmar_domain *domain)
741 {
742 	domain_update_iommu_coherency(domain);
743 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
744 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
745 
746 	/*
747 	 * If RHSA is missing, we should default to the device numa domain
748 	 * as fall back.
749 	 */
750 	if (domain->nid == NUMA_NO_NODE)
751 		domain->nid = domain_update_device_node(domain);
752 
753 	/*
754 	 * First-level translation restricts the input-address to a
755 	 * canonical address (i.e., address bits 63:N have the same
756 	 * value as address bit [N-1], where N is 48-bits with 4-level
757 	 * paging and 57-bits with 5-level paging). Hence, skip bit
758 	 * [N-1].
759 	 */
760 	if (domain_use_first_level(domain))
761 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
762 	else
763 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
764 
765 	domain_update_iotlb(domain);
766 }
767 
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 					 u8 devfn, int alloc)
770 {
771 	struct root_entry *root = &iommu->root_entry[bus];
772 	struct context_entry *context;
773 	u64 *entry;
774 
775 	entry = &root->lo;
776 	if (sm_supported(iommu)) {
777 		if (devfn >= 0x80) {
778 			devfn -= 0x80;
779 			entry = &root->hi;
780 		}
781 		devfn *= 2;
782 	}
783 	if (*entry & 1)
784 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 	else {
786 		unsigned long phy_addr;
787 		if (!alloc)
788 			return NULL;
789 
790 		context = alloc_pgtable_page(iommu->node);
791 		if (!context)
792 			return NULL;
793 
794 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 		phy_addr = virt_to_phys((void *)context);
796 		*entry = phy_addr | 1;
797 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
798 	}
799 	return &context[devfn];
800 }
801 
802 static bool attach_deferred(struct device *dev)
803 {
804 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805 }
806 
807 /**
808  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809  *				 sub-hierarchy of a candidate PCI-PCI bridge
810  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811  * @bridge: the candidate PCI-PCI bridge
812  *
813  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814  */
815 static bool
816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817 {
818 	struct pci_dev *pdev, *pbridge;
819 
820 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 		return false;
822 
823 	pdev = to_pci_dev(dev);
824 	pbridge = to_pci_dev(bridge);
825 
826 	if (pbridge->subordinate &&
827 	    pbridge->subordinate->number <= pdev->bus->number &&
828 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 		return true;
830 
831 	return false;
832 }
833 
834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835 {
836 	struct dmar_drhd_unit *drhd;
837 	u32 vtbar;
838 	int rc;
839 
840 	/* We know that this device on this chipset has its own IOMMU.
841 	 * If we find it under a different IOMMU, then the BIOS is lying
842 	 * to us. Hope that the IOMMU for this device is actually
843 	 * disabled, and it needs no translation...
844 	 */
845 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 	if (rc) {
847 		/* "can't" happen */
848 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 		return false;
850 	}
851 	vtbar &= 0xffff0000;
852 
853 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
854 	drhd = dmar_find_matched_drhd_unit(pdev);
855 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 		return true;
859 	}
860 
861 	return false;
862 }
863 
864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865 {
866 	if (!iommu || iommu->drhd->ignored)
867 		return true;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pdev = to_pci_dev(dev);
871 
872 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 		    quirk_ioat_snb_local_iommu(pdev))
875 			return true;
876 	}
877 
878 	return false;
879 }
880 
/*
 * Find the IOMMU that covers @dev and, when both @bus and @devfn are
 * non-NULL, report the bus/devfn to use for it.
 *
 * For PCI devices the lookup uses the physical function of the real DMA
 * device; for non-PCI devices with an ACPI companion it uses the
 * companion.  Each DRHD unit's device scope is searched for an exact
 * match or for a bridge that @dev sits below.  A unit flagged
 * include_all matches any PCI device in its segment.
 *
 * Returns NULL when @dev is NULL, when no unit matches, or when the
 * matching IOMMU is a dummy according to iommu_is_dummy().
 */
881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882 {
883 	struct dmar_drhd_unit *drhd = NULL;
884 	struct pci_dev *pdev = NULL;
885 	struct intel_iommu *iommu;
886 	struct device *tmp;
887 	u16 segment = 0;
888 	int i;
889 
890 	if (!dev)
891 		return NULL;
892 
893 	if (dev_is_pci(dev)) {
894 		struct pci_dev *pf_pdev;
895 
896 		pdev = pci_real_dma_dev(to_pci_dev(dev));
897 
898 		/* VFs aren't listed in scope tables; we need to look up
899 		 * the PF instead to find the IOMMU. */
900 		pf_pdev = pci_physfn(pdev);
901 		dev = &pf_pdev->dev;
902 		segment = pci_domain_nr(pdev->bus);
903 	} else if (has_acpi_companion(dev))
904 		dev = &ACPI_COMPANION(dev)->dev;
905 
906 	rcu_read_lock();
907 	for_each_iommu(iommu, drhd) {
		/* PCI devices only match units in their own PCI segment. */
908 		if (pdev && segment != drhd->segment)
909 			continue;
910 
911 		for_each_active_dev_scope(drhd->devices,
912 					  drhd->devices_cnt, i, tmp) {
913 			if (tmp == dev) {
914 				/* For a VF use its original BDF# not that of the PF
915 				 * which we used for the IOMMU lookup. Strictly speaking
916 				 * we could do this for all PCI devices; we only need to
917 				 * get the BDF# from the scope table for ACPI matches. */
918 				if (pdev && pdev->is_virtfn)
919 					goto got_pdev;
920 
921 				if (bus && devfn) {
922 					*bus = drhd->devices[i].bus;
923 					*devfn = drhd->devices[i].devfn;
924 				}
925 				goto out;
926 			}
927 
928 			if (is_downstream_to_pci_bridge(dev, tmp))
929 				goto got_pdev;
930 		}
931 
932 		if (pdev && drhd->include_all) {
			/* Also reached by goto from the scope loop above: report the device's own BDF. */
933 		got_pdev:
934 			if (bus && devfn) {
935 				*bus = pdev->bus->number;
936 				*devfn = pdev->devfn;
937 			}
938 			goto out;
939 		}
940 	}
	/* No unit matched. */
941 	iommu = NULL;
942  out:
943 	if (iommu_is_dummy(iommu, dev))
944 		iommu = NULL;
945 
946 	rcu_read_unlock();
947 
948 	return iommu;
949 }
950 
951 static void domain_flush_cache(struct dmar_domain *domain,
952 			       void *addr, int size)
953 {
954 	if (!domain->iommu_coherency)
955 		clflush_cache_range(addr, size);
956 }
957 
958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959 {
960 	struct context_entry *context;
961 	int ret = 0;
962 	unsigned long flags;
963 
964 	spin_lock_irqsave(&iommu->lock, flags);
965 	context = iommu_context_addr(iommu, bus, devfn, 0);
966 	if (context)
967 		ret = context_present(context);
968 	spin_unlock_irqrestore(&iommu->lock, flags);
969 	return ret;
970 }
971 
972 static void free_context_table(struct intel_iommu *iommu)
973 {
974 	int i;
975 	unsigned long flags;
976 	struct context_entry *context;
977 
978 	spin_lock_irqsave(&iommu->lock, flags);
979 	if (!iommu->root_entry) {
980 		goto out;
981 	}
982 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 		context = iommu_context_addr(iommu, i, 0, 0);
984 		if (context)
985 			free_pgtable_page(context);
986 
987 		if (!sm_supported(iommu))
988 			continue;
989 
990 		context = iommu_context_addr(iommu, i, 0x80, 0);
991 		if (context)
992 			free_pgtable_page(context);
993 
994 	}
995 	free_pgtable_page(iommu->root_entry);
996 	iommu->root_entry = NULL;
997 out:
998 	spin_unlock_irqrestore(&iommu->lock, flags);
999 }
1000 
/*
 * Walk @domain's page table from the top level down and return the PTE
 * that covers @pfn.
 *
 * If *@target_level is 0 the walk is a pure lookup: it stops at the
 * first superpage or non-present entry and writes the level it stopped
 * at back to *@target_level.  Otherwise the walk continues down to
 * *@target_level, allocating and installing any missing intermediate
 * tables on the way.
 *
 * Returns NULL when @pfn is beyond what the domain's AGAW can address or
 * when a table allocation fails.  domain->pgd must be set (BUG otherwise).
 */
1001 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002 				      unsigned long pfn, int *target_level)
1003 {
1004 	struct dma_pte *parent, *pte;
1005 	int level = agaw_to_level(domain->agaw);
1006 	int offset;
1007 
1008 	BUG_ON(!domain->pgd);
1009 
1010 	if (!domain_pfn_supported(domain, pfn))
1011 		/* Address beyond IOMMU's addressing capabilities. */
1012 		return NULL;
1013 
1014 	parent = domain->pgd;
1015 
1016 	while (1) {
1017 		void *tmp_page;
1018 
1019 		offset = pfn_level_offset(pfn, level);
1020 		pte = &parent[offset];
		/* Lookup mode: stop at the first superpage or hole. */
1021 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022 			break;
1023 		if (level == *target_level)
1024 			break;
1025 
1026 		if (!dma_pte_present(pte)) {
1027 			uint64_t pteval;
1028 
1029 			tmp_page = alloc_pgtable_page(domain->nid);
1030 
1031 			if (!tmp_page)
1032 				return NULL;
1033 
			/* Flush the new zeroed table before it is linked into the tree. */
1034 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036 			if (domain_use_first_level(domain)) {
1037 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1038 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1039 					pteval |= DMA_FL_PTE_ACCESS;
1040 			}
			/* Install only if the slot is still empty; a non-zero result means another writer got there first. */
1041 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1042 				/* Someone else set it while we were thinking; use theirs. */
1043 				free_pgtable_page(tmp_page);
1044 			else
1045 				domain_flush_cache(domain, pte, sizeof(*pte));
1046 		}
1047 		if (level == 1)
1048 			break;
1049 
		/* Descend into the next-level table referenced by this entry. */
1050 		parent = phys_to_virt(dma_pte_addr(pte));
1051 		level--;
1052 	}
1053 
	/* Lookup mode: tell the caller which level the walk stopped at. */
1054 	if (!*target_level)
1055 		*target_level = level;
1056 
1057 	return pte;
1058 }
1059 
1060 /* return address's pte at specific level */
1061 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1062 					 unsigned long pfn,
1063 					 int level, int *large_page)
1064 {
1065 	struct dma_pte *parent, *pte;
1066 	int total = agaw_to_level(domain->agaw);
1067 	int offset;
1068 
1069 	parent = domain->pgd;
1070 	while (level <= total) {
1071 		offset = pfn_level_offset(pfn, total);
1072 		pte = &parent[offset];
1073 		if (level == total)
1074 			return pte;
1075 
1076 		if (!dma_pte_present(pte)) {
1077 			*large_page = total;
1078 			break;
1079 		}
1080 
1081 		if (dma_pte_superpage(pte)) {
1082 			*large_page = total;
1083 			return pte;
1084 		}
1085 
1086 		parent = phys_to_virt(dma_pte_addr(pte));
1087 		total--;
1088 	}
1089 	return NULL;
1090 }
1091 
1092 /* clear last level pte, a tlb flush should be followed */
1093 static void dma_pte_clear_range(struct dmar_domain *domain,
1094 				unsigned long start_pfn,
1095 				unsigned long last_pfn)
1096 {
1097 	unsigned int large_page;
1098 	struct dma_pte *first_pte, *pte;
1099 
1100 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1101 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1102 	BUG_ON(start_pfn > last_pfn);
1103 
1104 	/* we don't need lock here; nobody else touches the iova range */
1105 	do {
1106 		large_page = 1;
1107 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1108 		if (!pte) {
1109 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1110 			continue;
1111 		}
1112 		do {
1113 			dma_clear_pte(pte);
1114 			start_pfn += lvl_to_nr_pages(large_page);
1115 			pte++;
1116 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1117 
1118 		domain_flush_cache(domain, first_pte,
1119 				   (void *)pte - (void *)first_pte);
1120 
1121 	} while (start_pfn && start_pfn <= last_pfn);
1122 }
1123 
1124 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1125 			       int retain_level, struct dma_pte *pte,
1126 			       unsigned long pfn, unsigned long start_pfn,
1127 			       unsigned long last_pfn)
1128 {
1129 	pfn = max(start_pfn, pfn);
1130 	pte = &pte[pfn_level_offset(pfn, level)];
1131 
1132 	do {
1133 		unsigned long level_pfn;
1134 		struct dma_pte *level_pte;
1135 
1136 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1137 			goto next;
1138 
1139 		level_pfn = pfn & level_mask(level);
1140 		level_pte = phys_to_virt(dma_pte_addr(pte));
1141 
1142 		if (level > 2) {
1143 			dma_pte_free_level(domain, level - 1, retain_level,
1144 					   level_pte, level_pfn, start_pfn,
1145 					   last_pfn);
1146 		}
1147 
1148 		/*
1149 		 * Free the page table if we're below the level we want to
1150 		 * retain and the range covers the entire table.
1151 		 */
1152 		if (level < retain_level && !(start_pfn > level_pfn ||
1153 		      last_pfn < level_pfn + level_size(level) - 1)) {
1154 			dma_clear_pte(pte);
1155 			domain_flush_cache(domain, pte, sizeof(*pte));
1156 			free_pgtable_page(level_pte);
1157 		}
1158 next:
1159 		pfn += level_size(level);
1160 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161 }
1162 
1163 /*
1164  * clear last level (leaf) ptes and free page table pages below the
1165  * level we wish to keep intact.
1166  */
1167 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168 				   unsigned long start_pfn,
1169 				   unsigned long last_pfn,
1170 				   int retain_level)
1171 {
1172 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 	BUG_ON(start_pfn > last_pfn);
1175 
1176 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1177 
1178 	/* We don't need lock here; nobody else touches the iova range */
1179 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1180 			   domain->pgd, 0, start_pfn, last_pfn);
1181 
1182 	/* free pgd */
1183 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184 		free_pgtable_page(domain->pgd);
1185 		domain->pgd = NULL;
1186 	}
1187 }
1188 
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196 					    int level, struct dma_pte *pte,
1197 					    struct page *freelist)
1198 {
1199 	struct page *pg;
1200 
1201 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202 	pg->freelist = freelist;
1203 	freelist = pg;
1204 
1205 	if (level == 1)
1206 		return freelist;
1207 
1208 	pte = page_address(pg);
1209 	do {
1210 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211 			freelist = dma_pte_list_pagetables(domain, level - 1,
1212 							   pte, freelist);
1213 		pte++;
1214 	} while (!first_pte_in_page(pte));
1215 
1216 	return freelist;
1217 }
1218 
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 					struct dma_pte *pte, unsigned long pfn,
1221 					unsigned long start_pfn,
1222 					unsigned long last_pfn,
1223 					struct page *freelist)
1224 {
1225 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226 
1227 	pfn = max(start_pfn, pfn);
1228 	pte = &pte[pfn_level_offset(pfn, level)];
1229 
1230 	do {
1231 		unsigned long level_pfn;
1232 
1233 		if (!dma_pte_present(pte))
1234 			goto next;
1235 
1236 		level_pfn = pfn & level_mask(level);
1237 
1238 		/* If range covers entire pagetable, free it */
1239 		if (start_pfn <= level_pfn &&
1240 		    last_pfn >= level_pfn + level_size(level) - 1) {
1241 			/* These suborbinate page tables are going away entirely. Don't
1242 			   bother to clear them; we're just going to *free* them. */
1243 			if (level > 1 && !dma_pte_superpage(pte))
1244 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245 
1246 			dma_clear_pte(pte);
1247 			if (!first_pte)
1248 				first_pte = pte;
1249 			last_pte = pte;
1250 		} else if (level > 1) {
1251 			/* Recurse down into a level that isn't *entirely* obsolete */
1252 			freelist = dma_pte_clear_level(domain, level - 1,
1253 						       phys_to_virt(dma_pte_addr(pte)),
1254 						       level_pfn, start_pfn, last_pfn,
1255 						       freelist);
1256 		}
1257 next:
1258 		pfn += level_size(level);
1259 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260 
1261 	if (first_pte)
1262 		domain_flush_cache(domain, first_pte,
1263 				   (void *)++last_pte - (void *)first_pte);
1264 
1265 	return freelist;
1266 }
1267 
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272 				 unsigned long start_pfn,
1273 				 unsigned long last_pfn,
1274 				 struct page *freelist)
1275 {
1276 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278 	BUG_ON(start_pfn > last_pfn);
1279 
1280 	/* we don't need lock here; nobody else touches the iova range */
1281 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282 				       domain->pgd, 0, start_pfn, last_pfn,
1283 				       freelist);
1284 
1285 	/* free pgd */
1286 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287 		struct page *pgd_page = virt_to_page(domain->pgd);
1288 		pgd_page->freelist = freelist;
1289 		freelist = pgd_page;
1290 
1291 		domain->pgd = NULL;
1292 	}
1293 
1294 	return freelist;
1295 }
1296 
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299 	struct page *pg;
1300 
1301 	while ((pg = freelist)) {
1302 		freelist = pg->freelist;
1303 		free_pgtable_page(page_address(pg));
1304 	}
1305 }
1306 
1307 /* iommu handling */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310 	struct root_entry *root;
1311 	unsigned long flags;
1312 
1313 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314 	if (!root) {
1315 		pr_err("Allocating root entry for %s failed\n",
1316 			iommu->name);
1317 		return -ENOMEM;
1318 	}
1319 
1320 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1321 
1322 	spin_lock_irqsave(&iommu->lock, flags);
1323 	iommu->root_entry = root;
1324 	spin_unlock_irqrestore(&iommu->lock, flags);
1325 
1326 	return 0;
1327 }
1328 
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331 	u64 addr;
1332 	u32 sts;
1333 	unsigned long flag;
1334 
1335 	addr = virt_to_phys(iommu->root_entry);
1336 	if (sm_supported(iommu))
1337 		addr |= DMA_RTADDR_SMT;
1338 
1339 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341 
1342 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343 
1344 	/* Make sure hardware complete it */
1345 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 		      readl, (sts & DMA_GSTS_RTPS), sts);
1347 
1348 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 
1350 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1351 	if (sm_supported(iommu))
1352 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1353 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1354 }
1355 
1356 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1357 {
1358 	u32 val;
1359 	unsigned long flag;
1360 
1361 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1362 		return;
1363 
1364 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1365 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1366 
1367 	/* Make sure hardware complete it */
1368 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1369 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1370 
1371 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1372 }
1373 
1374 /* return value determine if we need a write buffer flush */
1375 static void __iommu_flush_context(struct intel_iommu *iommu,
1376 				  u16 did, u16 source_id, u8 function_mask,
1377 				  u64 type)
1378 {
1379 	u64 val = 0;
1380 	unsigned long flag;
1381 
1382 	switch (type) {
1383 	case DMA_CCMD_GLOBAL_INVL:
1384 		val = DMA_CCMD_GLOBAL_INVL;
1385 		break;
1386 	case DMA_CCMD_DOMAIN_INVL:
1387 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1388 		break;
1389 	case DMA_CCMD_DEVICE_INVL:
1390 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1391 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1392 		break;
1393 	default:
1394 		BUG();
1395 	}
1396 	val |= DMA_CCMD_ICC;
1397 
1398 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1399 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1400 
1401 	/* Make sure hardware complete it */
1402 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1403 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1404 
1405 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406 }
1407 
1408 /* return value determine if we need a write buffer flush */
1409 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1410 				u64 addr, unsigned int size_order, u64 type)
1411 {
1412 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1413 	u64 val = 0, val_iva = 0;
1414 	unsigned long flag;
1415 
1416 	switch (type) {
1417 	case DMA_TLB_GLOBAL_FLUSH:
1418 		/* global flush doesn't need set IVA_REG */
1419 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1420 		break;
1421 	case DMA_TLB_DSI_FLUSH:
1422 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1423 		break;
1424 	case DMA_TLB_PSI_FLUSH:
1425 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1426 		/* IH bit is passed in as part of address */
1427 		val_iva = size_order | addr;
1428 		break;
1429 	default:
1430 		BUG();
1431 	}
1432 	/* Note: set drain read/write */
1433 #if 0
1434 	/*
1435 	 * This is probably to be super secure.. Looks like we can
1436 	 * ignore it without any impact.
1437 	 */
1438 	if (cap_read_drain(iommu->cap))
1439 		val |= DMA_TLB_READ_DRAIN;
1440 #endif
1441 	if (cap_write_drain(iommu->cap))
1442 		val |= DMA_TLB_WRITE_DRAIN;
1443 
1444 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1445 	/* Note: Only uses first TLB reg currently */
1446 	if (val_iva)
1447 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1448 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1449 
1450 	/* Make sure hardware complete it */
1451 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1452 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1453 
1454 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1455 
1456 	/* check IOTLB invalidation granularity */
1457 	if (DMA_TLB_IAIG(val) == 0)
1458 		pr_err("Flush IOTLB failed\n");
1459 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1460 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1461 			(unsigned long long)DMA_TLB_IIRG(type),
1462 			(unsigned long long)DMA_TLB_IAIG(val));
1463 }
1464 
1465 static struct device_domain_info *
1466 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1467 			 u8 bus, u8 devfn)
1468 {
1469 	struct device_domain_info *info;
1470 
1471 	assert_spin_locked(&device_domain_lock);
1472 
1473 	if (!iommu->qi)
1474 		return NULL;
1475 
1476 	list_for_each_entry(info, &domain->devices, link)
1477 		if (info->iommu == iommu && info->bus == bus &&
1478 		    info->devfn == devfn) {
1479 			if (info->ats_supported && info->dev)
1480 				return info;
1481 			break;
1482 		}
1483 
1484 	return NULL;
1485 }
1486 
1487 static void domain_update_iotlb(struct dmar_domain *domain)
1488 {
1489 	struct device_domain_info *info;
1490 	bool has_iotlb_device = false;
1491 
1492 	assert_spin_locked(&device_domain_lock);
1493 
1494 	list_for_each_entry(info, &domain->devices, link)
1495 		if (info->ats_enabled) {
1496 			has_iotlb_device = true;
1497 			break;
1498 		}
1499 
1500 	if (!has_iotlb_device) {
1501 		struct subdev_domain_info *sinfo;
1502 
1503 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1504 			info = get_domain_info(sinfo->pdev);
1505 			if (info && info->ats_enabled) {
1506 				has_iotlb_device = true;
1507 				break;
1508 			}
1509 		}
1510 	}
1511 
1512 	domain->has_iotlb_device = has_iotlb_device;
1513 }
1514 
1515 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1516 {
1517 	struct pci_dev *pdev;
1518 
1519 	assert_spin_locked(&device_domain_lock);
1520 
1521 	if (!info || !dev_is_pci(info->dev))
1522 		return;
1523 
1524 	pdev = to_pci_dev(info->dev);
1525 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1526 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1527 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1528 	 * reserved, which should be set to 0.
1529 	 */
1530 	if (!ecap_dit(info->iommu->ecap))
1531 		info->pfsid = 0;
1532 	else {
1533 		struct pci_dev *pf_pdev;
1534 
1535 		/* pdev will be returned if device is not a vf */
1536 		pf_pdev = pci_physfn(pdev);
1537 		info->pfsid = pci_dev_id(pf_pdev);
1538 	}
1539 
1540 #ifdef CONFIG_INTEL_IOMMU_SVM
1541 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1542 	   the device if you enable PASID support after ATS support is
1543 	   undefined. So always enable PASID support on devices which
1544 	   have it, even if we can't yet know if we're ever going to
1545 	   use it. */
1546 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1547 		info->pasid_enabled = 1;
1548 
1549 	if (info->pri_supported &&
1550 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1551 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1552 		info->pri_enabled = 1;
1553 #endif
1554 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1555 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1556 		info->ats_enabled = 1;
1557 		domain_update_iotlb(info->domain);
1558 		info->ats_qdep = pci_ats_queue_depth(pdev);
1559 	}
1560 }
1561 
1562 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1563 {
1564 	struct pci_dev *pdev;
1565 
1566 	assert_spin_locked(&device_domain_lock);
1567 
1568 	if (!dev_is_pci(info->dev))
1569 		return;
1570 
1571 	pdev = to_pci_dev(info->dev);
1572 
1573 	if (info->ats_enabled) {
1574 		pci_disable_ats(pdev);
1575 		info->ats_enabled = 0;
1576 		domain_update_iotlb(info->domain);
1577 	}
1578 #ifdef CONFIG_INTEL_IOMMU_SVM
1579 	if (info->pri_enabled) {
1580 		pci_disable_pri(pdev);
1581 		info->pri_enabled = 0;
1582 	}
1583 	if (info->pasid_enabled) {
1584 		pci_disable_pasid(pdev);
1585 		info->pasid_enabled = 0;
1586 	}
1587 #endif
1588 }
1589 
1590 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1591 				    u64 addr, unsigned int mask)
1592 {
1593 	u16 sid, qdep;
1594 
1595 	if (!info || !info->ats_enabled)
1596 		return;
1597 
1598 	sid = info->bus << 8 | info->devfn;
1599 	qdep = info->ats_qdep;
1600 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1601 			   qdep, addr, mask);
1602 }
1603 
1604 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1605 				  u64 addr, unsigned mask)
1606 {
1607 	unsigned long flags;
1608 	struct device_domain_info *info;
1609 	struct subdev_domain_info *sinfo;
1610 
1611 	if (!domain->has_iotlb_device)
1612 		return;
1613 
1614 	spin_lock_irqsave(&device_domain_lock, flags);
1615 	list_for_each_entry(info, &domain->devices, link)
1616 		__iommu_flush_dev_iotlb(info, addr, mask);
1617 
1618 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1619 		info = get_domain_info(sinfo->pdev);
1620 		__iommu_flush_dev_iotlb(info, addr, mask);
1621 	}
1622 	spin_unlock_irqrestore(&device_domain_lock, flags);
1623 }
1624 
1625 static void domain_flush_piotlb(struct intel_iommu *iommu,
1626 				struct dmar_domain *domain,
1627 				u64 addr, unsigned long npages, bool ih)
1628 {
1629 	u16 did = domain->iommu_did[iommu->seq_id];
1630 
1631 	if (domain->default_pasid)
1632 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1633 				addr, npages, ih);
1634 
1635 	if (!list_empty(&domain->devices))
1636 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1637 }
1638 
1639 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1640 				  struct dmar_domain *domain,
1641 				  unsigned long pfn, unsigned int pages,
1642 				  int ih, int map)
1643 {
1644 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1645 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1646 	u16 did = domain->iommu_did[iommu->seq_id];
1647 
1648 	BUG_ON(pages == 0);
1649 
1650 	if (ih)
1651 		ih = 1 << 6;
1652 
1653 	if (domain_use_first_level(domain)) {
1654 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1655 	} else {
1656 		/*
1657 		 * Fallback to domain selective flush if no PSI support or
1658 		 * the size is too big. PSI requires page size to be 2 ^ x,
1659 		 * and the base address is naturally aligned to the size.
1660 		 */
1661 		if (!cap_pgsel_inv(iommu->cap) ||
1662 		    mask > cap_max_amask_val(iommu->cap))
1663 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 							DMA_TLB_DSI_FLUSH);
1665 		else
1666 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1667 							DMA_TLB_PSI_FLUSH);
1668 	}
1669 
1670 	/*
1671 	 * In caching mode, changes of pages from non-present to present require
1672 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1673 	 */
1674 	if (!cap_caching_mode(iommu->cap) || !map)
1675 		iommu_flush_dev_iotlb(domain, addr, mask);
1676 }
1677 
1678 /* Notification for newly created mappings */
1679 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1680 					struct dmar_domain *domain,
1681 					unsigned long pfn, unsigned int pages)
1682 {
1683 	/*
1684 	 * It's a non-present to present mapping. Only flush if caching mode
1685 	 * and second level.
1686 	 */
1687 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1688 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1689 	else
1690 		iommu_flush_write_buffer(iommu);
1691 }
1692 
1693 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1694 {
1695 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1696 	int idx;
1697 
1698 	for_each_domain_iommu(idx, dmar_domain) {
1699 		struct intel_iommu *iommu = g_iommus[idx];
1700 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1701 
1702 		if (domain_use_first_level(dmar_domain))
1703 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1704 		else
1705 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1706 						 DMA_TLB_DSI_FLUSH);
1707 
1708 		if (!cap_caching_mode(iommu->cap))
1709 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1710 					      0, MAX_AGAW_PFN_WIDTH);
1711 	}
1712 }
1713 
1714 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1715 {
1716 	u32 pmen;
1717 	unsigned long flags;
1718 
1719 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1720 		return;
1721 
1722 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1723 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1724 	pmen &= ~DMA_PMEN_EPM;
1725 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1726 
1727 	/* wait for the protected region status bit to clear */
1728 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1729 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1730 
1731 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1732 }
1733 
1734 static void iommu_enable_translation(struct intel_iommu *iommu)
1735 {
1736 	u32 sts;
1737 	unsigned long flags;
1738 
1739 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1740 	iommu->gcmd |= DMA_GCMD_TE;
1741 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1742 
1743 	/* Make sure hardware complete it */
1744 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1745 		      readl, (sts & DMA_GSTS_TES), sts);
1746 
1747 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1748 }
1749 
1750 static void iommu_disable_translation(struct intel_iommu *iommu)
1751 {
1752 	u32 sts;
1753 	unsigned long flag;
1754 
1755 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1756 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1757 		return;
1758 
1759 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1760 	iommu->gcmd &= ~DMA_GCMD_TE;
1761 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1762 
1763 	/* Make sure hardware complete it */
1764 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1765 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1766 
1767 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1768 }
1769 
1770 static int iommu_init_domains(struct intel_iommu *iommu)
1771 {
1772 	u32 ndomains, nlongs;
1773 	size_t size;
1774 
1775 	ndomains = cap_ndoms(iommu->cap);
1776 	pr_debug("%s: Number of Domains supported <%d>\n",
1777 		 iommu->name, ndomains);
1778 	nlongs = BITS_TO_LONGS(ndomains);
1779 
1780 	spin_lock_init(&iommu->lock);
1781 
1782 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1783 	if (!iommu->domain_ids) {
1784 		pr_err("%s: Allocating domain id array failed\n",
1785 		       iommu->name);
1786 		return -ENOMEM;
1787 	}
1788 
1789 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1790 	iommu->domains = kzalloc(size, GFP_KERNEL);
1791 
1792 	if (iommu->domains) {
1793 		size = 256 * sizeof(struct dmar_domain *);
1794 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1795 	}
1796 
1797 	if (!iommu->domains || !iommu->domains[0]) {
1798 		pr_err("%s: Allocating domain array failed\n",
1799 		       iommu->name);
1800 		kfree(iommu->domain_ids);
1801 		kfree(iommu->domains);
1802 		iommu->domain_ids = NULL;
1803 		iommu->domains    = NULL;
1804 		return -ENOMEM;
1805 	}
1806 
1807 	/*
1808 	 * If Caching mode is set, then invalid translations are tagged
1809 	 * with domain-id 0, hence we need to pre-allocate it. We also
1810 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1811 	 * make sure it is not used for a real domain.
1812 	 */
1813 	set_bit(0, iommu->domain_ids);
1814 
1815 	/*
1816 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1817 	 * entry for first-level or pass-through translation modes should
1818 	 * be programmed with a domain id different from those used for
1819 	 * second-level or nested translation. We reserve a domain id for
1820 	 * this purpose.
1821 	 */
1822 	if (sm_supported(iommu))
1823 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1824 
1825 	return 0;
1826 }
1827 
1828 static void disable_dmar_iommu(struct intel_iommu *iommu)
1829 {
1830 	struct device_domain_info *info, *tmp;
1831 	unsigned long flags;
1832 
1833 	if (!iommu->domains || !iommu->domain_ids)
1834 		return;
1835 
1836 	spin_lock_irqsave(&device_domain_lock, flags);
1837 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1838 		if (info->iommu != iommu)
1839 			continue;
1840 
1841 		if (!info->dev || !info->domain)
1842 			continue;
1843 
1844 		__dmar_remove_one_dev_info(info);
1845 	}
1846 	spin_unlock_irqrestore(&device_domain_lock, flags);
1847 
1848 	if (iommu->gcmd & DMA_GCMD_TE)
1849 		iommu_disable_translation(iommu);
1850 }
1851 
1852 static void free_dmar_iommu(struct intel_iommu *iommu)
1853 {
1854 	if ((iommu->domains) && (iommu->domain_ids)) {
1855 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1856 		int i;
1857 
1858 		for (i = 0; i < elems; i++)
1859 			kfree(iommu->domains[i]);
1860 		kfree(iommu->domains);
1861 		kfree(iommu->domain_ids);
1862 		iommu->domains = NULL;
1863 		iommu->domain_ids = NULL;
1864 	}
1865 
1866 	g_iommus[iommu->seq_id] = NULL;
1867 
1868 	/* free context mapping */
1869 	free_context_table(iommu);
1870 
1871 #ifdef CONFIG_INTEL_IOMMU_SVM
1872 	if (pasid_supported(iommu)) {
1873 		if (ecap_prs(iommu->ecap))
1874 			intel_svm_finish_prq(iommu);
1875 	}
1876 	if (vccap_pasid(iommu->vccap))
1877 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1878 
1879 #endif
1880 }
1881 
1882 /*
1883  * Check and return whether first level is used by default for
1884  * DMA translation.
1885  */
1886 static bool first_level_by_default(void)
1887 {
1888 	return scalable_mode_support() && intel_cap_flts_sanity();
1889 }
1890 
1891 static struct dmar_domain *alloc_domain(int flags)
1892 {
1893 	struct dmar_domain *domain;
1894 
1895 	domain = alloc_domain_mem();
1896 	if (!domain)
1897 		return NULL;
1898 
1899 	memset(domain, 0, sizeof(*domain));
1900 	domain->nid = NUMA_NO_NODE;
1901 	domain->flags = flags;
1902 	if (first_level_by_default())
1903 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1904 	domain->has_iotlb_device = false;
1905 	INIT_LIST_HEAD(&domain->devices);
1906 	INIT_LIST_HEAD(&domain->subdevices);
1907 
1908 	return domain;
1909 }
1910 
1911 /* Must be called with iommu->lock */
1912 static int domain_attach_iommu(struct dmar_domain *domain,
1913 			       struct intel_iommu *iommu)
1914 {
1915 	unsigned long ndomains;
1916 	int num;
1917 
1918 	assert_spin_locked(&device_domain_lock);
1919 	assert_spin_locked(&iommu->lock);
1920 
1921 	domain->iommu_refcnt[iommu->seq_id] += 1;
1922 	domain->iommu_count += 1;
1923 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1924 		ndomains = cap_ndoms(iommu->cap);
1925 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1926 
1927 		if (num >= ndomains) {
1928 			pr_err("%s: No free domain ids\n", iommu->name);
1929 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1930 			domain->iommu_count -= 1;
1931 			return -ENOSPC;
1932 		}
1933 
1934 		set_bit(num, iommu->domain_ids);
1935 		set_iommu_domain(iommu, num, domain);
1936 
1937 		domain->iommu_did[iommu->seq_id] = num;
1938 		domain->nid			 = iommu->node;
1939 
1940 		domain_update_iommu_cap(domain);
1941 	}
1942 
1943 	return 0;
1944 }
1945 
1946 static int domain_detach_iommu(struct dmar_domain *domain,
1947 			       struct intel_iommu *iommu)
1948 {
1949 	int num, count;
1950 
1951 	assert_spin_locked(&device_domain_lock);
1952 	assert_spin_locked(&iommu->lock);
1953 
1954 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1955 	count = --domain->iommu_count;
1956 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1957 		num = domain->iommu_did[iommu->seq_id];
1958 		clear_bit(num, iommu->domain_ids);
1959 		set_iommu_domain(iommu, num, NULL);
1960 
1961 		domain_update_iommu_cap(domain);
1962 		domain->iommu_did[iommu->seq_id] = 0;
1963 	}
1964 
1965 	return count;
1966 }
1967 
/*
 * Round a guest address width up to the next width of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1981 
1982 static void domain_exit(struct dmar_domain *domain)
1983 {
1984 
1985 	/* Remove associated devices and clear attached or cached domains */
1986 	domain_remove_dev_info(domain);
1987 
1988 	/* destroy iovas */
1989 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1990 		iommu_put_dma_cookie(&domain->domain);
1991 
1992 	if (domain->pgd) {
1993 		struct page *freelist;
1994 
1995 		freelist = domain_unmap(domain, 0,
1996 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1997 		dma_free_pagelist(freelist);
1998 	}
1999 
2000 	free_domain_mem(domain);
2001 }
2002 
2003 /*
2004  * Get the PASID directory size for scalable mode context entry.
2005  * Value of X in the PDTS field of a scalable mode context entry
2006  * indicates PASID directory with 2^(X + 7) entries.
2007  */
2008 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2009 {
2010 	int pds, max_pde;
2011 
2012 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2013 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2014 	if (pds < 7)
2015 		return 0;
2016 
2017 	return pds - 7;
2018 }
2019 
2020 /*
2021  * Set the RID_PASID field of a scalable mode context entry. The
2022  * IOMMU hardware will use the PASID value set in this field for
2023  * DMA translations of DMA requests without PASID.
2024  */
2025 static inline void
2026 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2027 {
2028 	context->hi |= pasid & ((1 << 20) - 1);
2029 }
2030 
2031 /*
2032  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2033  * entry.
2034  */
2035 static inline void context_set_sm_dte(struct context_entry *context)
2036 {
2037 	context->lo |= (1 << 2);
2038 }
2039 
2040 /*
2041  * Set the PRE(Page Request Enable) field of a scalable mode context
2042  * entry.
2043  */
2044 static inline void context_set_sm_pre(struct context_entry *context)
2045 {
2046 	context->lo |= (1 << 4);
2047 }
2048 
/*
 * Convert a PASID directory size value to its context-entry field
 * coding: the low three bits of @pds placed at bits 11:9.
 */
#define context_pdts(pds)	((0x7 & (pds)) << 9)
2051 
2052 static int domain_context_mapping_one(struct dmar_domain *domain,
2053 				      struct intel_iommu *iommu,
2054 				      struct pasid_table *table,
2055 				      u8 bus, u8 devfn)
2056 {
2057 	u16 did = domain->iommu_did[iommu->seq_id];
2058 	int translation = CONTEXT_TT_MULTI_LEVEL;
2059 	struct device_domain_info *info = NULL;
2060 	struct context_entry *context;
2061 	unsigned long flags;
2062 	int ret;
2063 
2064 	WARN_ON(did == 0);
2065 
2066 	if (hw_pass_through && domain_type_is_si(domain))
2067 		translation = CONTEXT_TT_PASS_THROUGH;
2068 
2069 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2070 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2071 
2072 	BUG_ON(!domain->pgd);
2073 
2074 	spin_lock_irqsave(&device_domain_lock, flags);
2075 	spin_lock(&iommu->lock);
2076 
2077 	ret = -ENOMEM;
2078 	context = iommu_context_addr(iommu, bus, devfn, 1);
2079 	if (!context)
2080 		goto out_unlock;
2081 
2082 	ret = 0;
2083 	if (context_present(context))
2084 		goto out_unlock;
2085 
2086 	/*
2087 	 * For kdump cases, old valid entries may be cached due to the
2088 	 * in-flight DMA and copied pgtable, but there is no unmapping
2089 	 * behaviour for them, thus we need an explicit cache flush for
2090 	 * the newly-mapped device. For kdump, at this point, the device
2091 	 * is supposed to finish reset at its driver probe stage, so no
2092 	 * in-flight DMA will exist, and we don't need to worry anymore
2093 	 * hereafter.
2094 	 */
2095 	if (context_copied(context)) {
2096 		u16 did_old = context_domain_id(context);
2097 
2098 		if (did_old < cap_ndoms(iommu->cap)) {
2099 			iommu->flush.flush_context(iommu, did_old,
2100 						   (((u16)bus) << 8) | devfn,
2101 						   DMA_CCMD_MASK_NOBIT,
2102 						   DMA_CCMD_DEVICE_INVL);
2103 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2104 						 DMA_TLB_DSI_FLUSH);
2105 		}
2106 	}
2107 
2108 	context_clear_entry(context);
2109 
2110 	if (sm_supported(iommu)) {
2111 		unsigned long pds;
2112 
2113 		WARN_ON(!table);
2114 
2115 		/* Setup the PASID DIR pointer: */
2116 		pds = context_get_sm_pds(table);
2117 		context->lo = (u64)virt_to_phys(table->table) |
2118 				context_pdts(pds);
2119 
2120 		/* Setup the RID_PASID field: */
2121 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2122 
2123 		/*
2124 		 * Setup the Device-TLB enable bit and Page request
2125 		 * Enable bit:
2126 		 */
2127 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2128 		if (info && info->ats_supported)
2129 			context_set_sm_dte(context);
2130 		if (info && info->pri_supported)
2131 			context_set_sm_pre(context);
2132 	} else {
2133 		struct dma_pte *pgd = domain->pgd;
2134 		int agaw;
2135 
2136 		context_set_domain_id(context, did);
2137 
2138 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2139 			/*
2140 			 * Skip top levels of page tables for iommu which has
2141 			 * less agaw than default. Unnecessary for PT mode.
2142 			 */
2143 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2144 				ret = -ENOMEM;
2145 				pgd = phys_to_virt(dma_pte_addr(pgd));
2146 				if (!dma_pte_present(pgd))
2147 					goto out_unlock;
2148 			}
2149 
2150 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2151 			if (info && info->ats_supported)
2152 				translation = CONTEXT_TT_DEV_IOTLB;
2153 			else
2154 				translation = CONTEXT_TT_MULTI_LEVEL;
2155 
2156 			context_set_address_root(context, virt_to_phys(pgd));
2157 			context_set_address_width(context, agaw);
2158 		} else {
2159 			/*
2160 			 * In pass through mode, AW must be programmed to
2161 			 * indicate the largest AGAW value supported by
2162 			 * hardware. And ASR is ignored by hardware.
2163 			 */
2164 			context_set_address_width(context, iommu->msagaw);
2165 		}
2166 
2167 		context_set_translation_type(context, translation);
2168 	}
2169 
2170 	context_set_fault_enable(context);
2171 	context_set_present(context);
2172 	if (!ecap_coherent(iommu->ecap))
2173 		clflush_cache_range(context, sizeof(*context));
2174 
2175 	/*
2176 	 * It's a non-present to present mapping. If hardware doesn't cache
2177 	 * non-present entry we only need to flush the write-buffer. If the
2178 	 * _does_ cache non-present entries, then it does so in the special
2179 	 * domain #0, which we have to flush:
2180 	 */
2181 	if (cap_caching_mode(iommu->cap)) {
2182 		iommu->flush.flush_context(iommu, 0,
2183 					   (((u16)bus) << 8) | devfn,
2184 					   DMA_CCMD_MASK_NOBIT,
2185 					   DMA_CCMD_DEVICE_INVL);
2186 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2187 	} else {
2188 		iommu_flush_write_buffer(iommu);
2189 	}
2190 	iommu_enable_dev_iotlb(info);
2191 
2192 	ret = 0;
2193 
2194 out_unlock:
2195 	spin_unlock(&iommu->lock);
2196 	spin_unlock_irqrestore(&device_domain_lock, flags);
2197 
2198 	return ret;
2199 }
2200 
/*
 * Argument bundle handed to domain_context_mapping_cb() through the opaque
 * pointer of pci_for_each_dma_alias(); every member is forwarded unchanged
 * to domain_context_mapping_one().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain argument for the mapping call */
	struct intel_iommu *iommu;	/* IOMMU argument for the mapping call */
	struct pasid_table *table;	/* PASID table argument for the mapping call */
};
2206 
2207 static int domain_context_mapping_cb(struct pci_dev *pdev,
2208 				     u16 alias, void *opaque)
2209 {
2210 	struct domain_context_mapping_data *data = opaque;
2211 
2212 	return domain_context_mapping_one(data->domain, data->iommu,
2213 					  data->table, PCI_BUS_NUM(alias),
2214 					  alias & 0xff);
2215 }
2216 
2217 static int
2218 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2219 {
2220 	struct domain_context_mapping_data data;
2221 	struct pasid_table *table;
2222 	struct intel_iommu *iommu;
2223 	u8 bus, devfn;
2224 
2225 	iommu = device_to_iommu(dev, &bus, &devfn);
2226 	if (!iommu)
2227 		return -ENODEV;
2228 
2229 	table = intel_pasid_get_table(dev);
2230 
2231 	if (!dev_is_pci(dev))
2232 		return domain_context_mapping_one(domain, iommu, table,
2233 						  bus, devfn);
2234 
2235 	data.domain = domain;
2236 	data.iommu = iommu;
2237 	data.table = table;
2238 
2239 	return pci_for_each_dma_alias(to_pci_dev(dev),
2240 				      &domain_context_mapping_cb, &data);
2241 }
2242 
2243 static int domain_context_mapped_cb(struct pci_dev *pdev,
2244 				    u16 alias, void *opaque)
2245 {
2246 	struct intel_iommu *iommu = opaque;
2247 
2248 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2249 }
2250 
2251 static int domain_context_mapped(struct device *dev)
2252 {
2253 	struct intel_iommu *iommu;
2254 	u8 bus, devfn;
2255 
2256 	iommu = device_to_iommu(dev, &bus, &devfn);
2257 	if (!iommu)
2258 		return -ENODEV;
2259 
2260 	if (!dev_is_pci(dev))
2261 		return device_context_mapped(iommu, bus, devfn);
2262 
2263 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2264 				       domain_context_mapped_cb, iommu);
2265 }
2266 
2267 /* Returns a number of VTD pages, but aligned to MM page size */
2268 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2269 					    size_t size)
2270 {
2271 	host_addr &= ~PAGE_MASK;
2272 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2273 }
2274 
2275 /* Return largest possible superpage level for a given mapping */
2276 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2277 					  unsigned long iov_pfn,
2278 					  unsigned long phy_pfn,
2279 					  unsigned long pages)
2280 {
2281 	int support, level = 1;
2282 	unsigned long pfnmerge;
2283 
2284 	support = domain->iommu_superpage;
2285 
2286 	/* To use a large page, the virtual *and* physical addresses
2287 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2288 	   of them will mean we have to use smaller pages. So just
2289 	   merge them and check both at once. */
2290 	pfnmerge = iov_pfn | phy_pfn;
2291 
2292 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2293 		pages >>= VTD_STRIDE_SHIFT;
2294 		if (!pages)
2295 			break;
2296 		pfnmerge >>= VTD_STRIDE_SHIFT;
2297 		level++;
2298 		support--;
2299 	}
2300 	return level;
2301 }
2302 
2303 /*
2304  * Ensure that old small page tables are removed to make room for superpage(s).
2305  * We're going to add new large pages, so make sure we don't remove their parent
2306  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2307  */
2308 static void switch_to_super_page(struct dmar_domain *domain,
2309 				 unsigned long start_pfn,
2310 				 unsigned long end_pfn, int level)
2311 {
2312 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2313 	struct dma_pte *pte = NULL;
2314 	int i;
2315 
2316 	while (start_pfn <= end_pfn) {
2317 		if (!pte)
2318 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2319 
2320 		if (dma_pte_present(pte)) {
2321 			dma_pte_free_pagetable(domain, start_pfn,
2322 					       start_pfn + lvl_pages - 1,
2323 					       level + 1);
2324 
2325 			for_each_domain_iommu(i, domain)
2326 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2327 						      start_pfn, lvl_pages,
2328 						      0, 0);
2329 		}
2330 
2331 		pte++;
2332 		start_pfn += lvl_pages;
2333 		if (first_pte_in_page(pte))
2334 			pte = NULL;
2335 	}
2336 }
2337 
/*
 * Write the PTEs that map @nr_pages VT-d pages starting at @iov_pfn onto
 * the physical pages starting at @phys_pfn, using superpages where
 * hardware_largepage_caps() allows it.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; DMA_PTE_SNP is
 * passed through as well.
 *
 * Returns 0 on success, -EINVAL when @prot grants neither read nor write,
 * and -ENOMEM when pfn_to_dma_pte() cannot provide a PTE. A PTE that is
 * already populated is reported (pr_crit + WARN) but does not stop the loop.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
{
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	struct dma_pte *pte = NULL;
	phys_addr_t pteval;
	u64 attr;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	/* Build the attribute bits shared by every PTE of this mapping. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain_use_first_level(domain)) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;

		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
			attr |= DMA_FL_PTE_ACCESS;
			if (prot & DMA_PTE_WRITE)
				attr |= DMA_FL_PTE_DIRTY;
		}
	}

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/*
		 * pte is NULL on the first pass and whenever the previous
		 * pass asked for a fresh lookup (see the end of the loop).
		 */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is a large page */
			if (largepage_lvl > 1) {
				unsigned long end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				/*
				 * Last pfn of the part of the remaining range
				 * that is aligned to this superpage size.
				 */
				end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* Limits how many times the mappings are dumped. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);

		/* Advance all cursors by the size of the page just mapped. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 *
		 * We leave clflush for the leaf pte changes to iotlb_sync_map()
		 * callback.
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages))
			pte = NULL;
	}

	return 0;
}
2436 
2437 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2438 {
2439 	unsigned long flags;
2440 	struct context_entry *context;
2441 	u16 did_old;
2442 
2443 	if (!iommu)
2444 		return;
2445 
2446 	spin_lock_irqsave(&iommu->lock, flags);
2447 	context = iommu_context_addr(iommu, bus, devfn, 0);
2448 	if (!context) {
2449 		spin_unlock_irqrestore(&iommu->lock, flags);
2450 		return;
2451 	}
2452 	did_old = context_domain_id(context);
2453 	context_clear_entry(context);
2454 	__iommu_flush_cache(iommu, context, sizeof(*context));
2455 	spin_unlock_irqrestore(&iommu->lock, flags);
2456 	iommu->flush.flush_context(iommu,
2457 				   did_old,
2458 				   (((u16)bus) << 8) | devfn,
2459 				   DMA_CCMD_MASK_NOBIT,
2460 				   DMA_CCMD_DEVICE_INVL);
2461 
2462 	if (sm_supported(iommu))
2463 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2464 
2465 	iommu->flush.flush_iotlb(iommu,
2466 				 did_old,
2467 				 0,
2468 				 0,
2469 				 DMA_TLB_DSI_FLUSH);
2470 }
2471 
2472 static inline void unlink_domain_info(struct device_domain_info *info)
2473 {
2474 	assert_spin_locked(&device_domain_lock);
2475 	list_del(&info->link);
2476 	list_del(&info->global);
2477 	if (info->dev)
2478 		dev_iommu_priv_set(info->dev, NULL);
2479 }
2480 
2481 static void domain_remove_dev_info(struct dmar_domain *domain)
2482 {
2483 	struct device_domain_info *info, *tmp;
2484 	unsigned long flags;
2485 
2486 	spin_lock_irqsave(&device_domain_lock, flags);
2487 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2488 		__dmar_remove_one_dev_info(info);
2489 	spin_unlock_irqrestore(&device_domain_lock, flags);
2490 }
2491 
2492 struct dmar_domain *find_domain(struct device *dev)
2493 {
2494 	struct device_domain_info *info;
2495 
2496 	if (unlikely(!dev || !dev->iommu))
2497 		return NULL;
2498 
2499 	if (unlikely(attach_deferred(dev)))
2500 		return NULL;
2501 
2502 	/* No lock here, assumes no domain exit in normal case */
2503 	info = get_domain_info(dev);
2504 	if (likely(info))
2505 		return info->domain;
2506 
2507 	return NULL;
2508 }
2509 
2510 static inline struct device_domain_info *
2511 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2512 {
2513 	struct device_domain_info *info;
2514 
2515 	list_for_each_entry(info, &device_domain_list, global)
2516 		if (info->segment == segment && info->bus == bus &&
2517 		    info->devfn == devfn)
2518 			return info;
2519 
2520 	return NULL;
2521 }
2522 
2523 static int domain_setup_first_level(struct intel_iommu *iommu,
2524 				    struct dmar_domain *domain,
2525 				    struct device *dev,
2526 				    u32 pasid)
2527 {
2528 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2529 	struct dma_pte *pgd = domain->pgd;
2530 	int agaw, level;
2531 
2532 	/*
2533 	 * Skip top levels of page tables for iommu which has
2534 	 * less agaw than default. Unnecessary for PT mode.
2535 	 */
2536 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2537 		pgd = phys_to_virt(dma_pte_addr(pgd));
2538 		if (!dma_pte_present(pgd))
2539 			return -ENOMEM;
2540 	}
2541 
2542 	level = agaw_to_level(agaw);
2543 	if (level != 4 && level != 5)
2544 		return -EINVAL;
2545 
2546 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2547 
2548 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2549 		flags |= PASID_FLAG_PAGE_SNOOP;
2550 
2551 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2552 					     domain->iommu_did[iommu->seq_id],
2553 					     flags);
2554 }
2555 
2556 static bool dev_is_real_dma_subdevice(struct device *dev)
2557 {
2558 	return dev && dev_is_pci(dev) &&
2559 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2560 }
2561 
/*
 * Create the device_domain_info for @dev (identified by @bus/@devfn on
 * @iommu) and attach it to @domain.
 *
 * Returns:
 *  - @domain on success;
 *  - a different domain if the device (or another info entry with the same
 *    segment/bus/devfn) is already bound to one; the new info is freed and
 *    nothing is attached;
 *  - NULL on allocation failure, on domain_attach_iommu() failure, or when
 *    the PASID table, RID2PASID entry or context mapping cannot be set up
 *    (in the latter three cases the device info is removed again).
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	/*
	 * A real-DMA sub-device is recorded under its own PCI address rather
	 * than the bus/devfn passed in by the caller.
	 */
	if (!dev_is_real_dma_subdevice(dev)) {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	} else {
		struct pci_dev *pdev = to_pci_dev(dev);

		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	}

	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	info->pasid_table = NULL;
	info->auxd_enabled = 0;
	INIT_LIST_HEAD(&info->subdevices);

	/* Record which of ATS, PASID and PRI both the IOMMU and the device support. */
	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	/* Check under device_domain_lock whether the device already has a domain. */
	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	if (!found) {
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
						       info->devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	/* iommu->lock nests inside device_domain_lock here. */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	/* Publish the new info on the per-domain and the global list. */
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev_iommu_priv_set(dev, info);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}

		/* Setup the PASID entry for requests without PASID: */
		spin_lock_irqsave(&iommu->lock, flags);
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, PASID_RID2PASID);
		else if (domain_use_first_level(domain))
			ret = domain_setup_first_level(iommu, domain, dev,
					PASID_RID2PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, PASID_RID2PASID);
		spin_unlock_irqrestore(&iommu->lock, flags);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}
	}

	/* Finally program the context entry (or entries) for the device. */
	if (dev && domain_context_mapping(domain, dev)) {
		dev_err(dev, "Domain context map failed\n");
		dmar_remove_one_dev_info(dev);
		return NULL;
	}

	return domain;
}
2692 
2693 static int iommu_domain_identity_map(struct dmar_domain *domain,
2694 				     unsigned long first_vpfn,
2695 				     unsigned long last_vpfn)
2696 {
2697 	/*
2698 	 * RMRR range might have overlap with physical memory range,
2699 	 * clear it first
2700 	 */
2701 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2702 
2703 	return __domain_mapping(domain, first_vpfn,
2704 				first_vpfn, last_vpfn - first_vpfn + 1,
2705 				DMA_PTE_READ|DMA_PTE_WRITE);
2706 }
2707 
2708 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2709 
2710 static int __init si_domain_init(int hw)
2711 {
2712 	struct dmar_rmrr_unit *rmrr;
2713 	struct device *dev;
2714 	int i, nid, ret;
2715 
2716 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2717 	if (!si_domain)
2718 		return -EFAULT;
2719 
2720 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2721 		domain_exit(si_domain);
2722 		return -EFAULT;
2723 	}
2724 
2725 	if (hw)
2726 		return 0;
2727 
2728 	for_each_online_node(nid) {
2729 		unsigned long start_pfn, end_pfn;
2730 		int i;
2731 
2732 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2733 			ret = iommu_domain_identity_map(si_domain,
2734 					mm_to_dma_pfn(start_pfn),
2735 					mm_to_dma_pfn(end_pfn));
2736 			if (ret)
2737 				return ret;
2738 		}
2739 	}
2740 
2741 	/*
2742 	 * Identity map the RMRRs so that devices with RMRRs could also use
2743 	 * the si_domain.
2744 	 */
2745 	for_each_rmrr_units(rmrr) {
2746 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2747 					  i, dev) {
2748 			unsigned long long start = rmrr->base_address;
2749 			unsigned long long end = rmrr->end_address;
2750 
2751 			if (WARN_ON(end < start ||
2752 				    end >> agaw_to_width(si_domain->agaw)))
2753 				continue;
2754 
2755 			ret = iommu_domain_identity_map(si_domain,
2756 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2757 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2758 			if (ret)
2759 				return ret;
2760 		}
2761 	}
2762 
2763 	return 0;
2764 }
2765 
2766 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2767 {
2768 	struct dmar_domain *ndomain;
2769 	struct intel_iommu *iommu;
2770 	u8 bus, devfn;
2771 
2772 	iommu = device_to_iommu(dev, &bus, &devfn);
2773 	if (!iommu)
2774 		return -ENODEV;
2775 
2776 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2777 	if (ndomain != domain)
2778 		return -EBUSY;
2779 
2780 	return 0;
2781 }
2782 
2783 static bool device_has_rmrr(struct device *dev)
2784 {
2785 	struct dmar_rmrr_unit *rmrr;
2786 	struct device *tmp;
2787 	int i;
2788 
2789 	rcu_read_lock();
2790 	for_each_rmrr_units(rmrr) {
2791 		/*
2792 		 * Return TRUE if this RMRR contains the device that
2793 		 * is passed in.
2794 		 */
2795 		for_each_active_dev_scope(rmrr->devices,
2796 					  rmrr->devices_cnt, i, tmp)
2797 			if (tmp == dev ||
2798 			    is_downstream_to_pci_bridge(dev, tmp)) {
2799 				rcu_read_unlock();
2800 				return true;
2801 			}
2802 	}
2803 	rcu_read_unlock();
2804 	return false;
2805 }
2806 
2807 /**
2808  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2809  * is relaxable (ie. is allowed to be not enforced under some conditions)
2810  * @dev: device handle
2811  *
2812  * We assume that PCI USB devices with RMRRs have them largely
2813  * for historical reasons and that the RMRR space is not actively used post
2814  * boot.  This exclusion may change if vendors begin to abuse it.
2815  *
2816  * The same exception is made for graphics devices, with the requirement that
2817  * any use of the RMRR regions will be torn down before assigning the device
2818  * to a guest.
2819  *
2820  * Return: true if the RMRR is relaxable, false otherwise
2821  */
2822 static bool device_rmrr_is_relaxable(struct device *dev)
2823 {
2824 	struct pci_dev *pdev;
2825 
2826 	if (!dev_is_pci(dev))
2827 		return false;
2828 
2829 	pdev = to_pci_dev(dev);
2830 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2831 		return true;
2832 	else
2833 		return false;
2834 }
2835 
2836 /*
2837  * There are a couple cases where we need to restrict the functionality of
2838  * devices associated with RMRRs.  The first is when evaluating a device for
2839  * identity mapping because problems exist when devices are moved in and out
2840  * of domains and their respective RMRR information is lost.  This means that
2841  * a device with associated RMRRs will never be in a "passthrough" domain.
2842  * The second is use of the device through the IOMMU API.  This interface
2843  * expects to have full control of the IOVA space for the device.  We cannot
2844  * satisfy both the requirement that RMRR access is maintained and have an
2845  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2846  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2847  * We therefore prevent devices associated with an RMRR from participating in
2848  * the IOMMU API, which eliminates them from device assignment.
2849  *
2850  * In both cases, devices which have relaxable RMRRs are not concerned by this
2851  * restriction. See device_rmrr_is_relaxable comment.
2852  */
2853 static bool device_is_rmrr_locked(struct device *dev)
2854 {
2855 	if (!device_has_rmrr(dev))
2856 		return false;
2857 
2858 	if (device_rmrr_is_relaxable(dev))
2859 		return false;
2860 
2861 	return true;
2862 }
2863 
2864 /*
2865  * Return the required default domain type for a specific device.
2866  *
2867  * @dev: the device in query
2868  * @startup: true if this is during early boot
2869  *
2870  * Returns:
2871  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2872  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2873  *  - 0: both identity and dynamic domains work for this device
2874  */
2875 static int device_def_domain_type(struct device *dev)
2876 {
2877 	if (dev_is_pci(dev)) {
2878 		struct pci_dev *pdev = to_pci_dev(dev);
2879 
2880 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2881 			return IOMMU_DOMAIN_IDENTITY;
2882 
2883 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2884 			return IOMMU_DOMAIN_IDENTITY;
2885 	}
2886 
2887 	return 0;
2888 }
2889 
2890 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2891 {
2892 	/*
2893 	 * Start from the sane iommu hardware state.
2894 	 * If the queued invalidation is already initialized by us
2895 	 * (for example, while enabling interrupt-remapping) then
2896 	 * we got the things already rolling from a sane state.
2897 	 */
2898 	if (!iommu->qi) {
2899 		/*
2900 		 * Clear any previous faults.
2901 		 */
2902 		dmar_fault(-1, iommu);
2903 		/*
2904 		 * Disable queued invalidation if supported and already enabled
2905 		 * before OS handover.
2906 		 */
2907 		dmar_disable_qi(iommu);
2908 	}
2909 
2910 	if (dmar_enable_qi(iommu)) {
2911 		/*
2912 		 * Queued Invalidate not enabled, use Register Based Invalidate
2913 		 */
2914 		iommu->flush.flush_context = __iommu_flush_context;
2915 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2916 		pr_info("%s: Using Register based invalidation\n",
2917 			iommu->name);
2918 	} else {
2919 		iommu->flush.flush_context = qi_flush_context;
2920 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2921 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2922 	}
2923 }
2924 
/*
 * Copy the context table(s) referenced by the old root entry @old_re of
 * @bus into newly allocated pages and store the new table pointers in @tbl.
 *
 * @ext selects the extended layout, in which a bus has two context tables
 * (lower for devfn 0x00-0x7f, upper for devfn 0x80-0xff) and @tbl holds two
 * slots per bus; otherwise there is one table and one slot per bus.
 *
 * Each present entry has its domain id reserved in iommu->domain_ids, its
 * PASID enable cleared, and is marked as copied.
 *
 * Returns 0 on success (including "nothing to copy") and -ENOMEM when
 * mapping the old table or allocating the new one fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	/* Work on a private copy of the old root entry. */
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/*
		 * idx wraps to 0 at the start of every old context table:
		 * devfn 0 always, and devfn 0x80 as well in extended mode.
		 */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/*
					 * No LCTP, try UCTP.
					 * NOTE(review): pos is still 0 on this
					 * path, so the upper table copy looks
					 * like it is stored in the lower slot
					 * tbl[tbl_idx] at the end - confirm.
					 */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		/* Keep the old kernel's domain id from being handed out again. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	/* Store the last table that was being filled when the loop ended. */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
3026 
3027 static int copy_translation_tables(struct intel_iommu *iommu)
3028 {
3029 	struct context_entry **ctxt_tbls;
3030 	struct root_entry *old_rt;
3031 	phys_addr_t old_rt_phys;
3032 	int ctxt_table_entries;
3033 	unsigned long flags;
3034 	u64 rtaddr_reg;
3035 	int bus, ret;
3036 	bool new_ext, ext;
3037 
3038 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3039 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3040 	new_ext    = !!ecap_ecs(iommu->ecap);
3041 
3042 	/*
3043 	 * The RTT bit can only be changed when translation is disabled,
3044 	 * but disabling translation means to open a window for data
3045 	 * corruption. So bail out and don't copy anything if we would
3046 	 * have to change the bit.
3047 	 */
3048 	if (new_ext != ext)
3049 		return -EINVAL;
3050 
3051 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3052 	if (!old_rt_phys)
3053 		return -EINVAL;
3054 
3055 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3056 	if (!old_rt)
3057 		return -ENOMEM;
3058 
3059 	/* This is too big for the stack - allocate it from slab */
3060 	ctxt_table_entries = ext ? 512 : 256;
3061 	ret = -ENOMEM;
3062 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3063 	if (!ctxt_tbls)
3064 		goto out_unmap;
3065 
3066 	for (bus = 0; bus < 256; bus++) {
3067 		ret = copy_context_table(iommu, &old_rt[bus],
3068 					 ctxt_tbls, bus, ext);
3069 		if (ret) {
3070 			pr_err("%s: Failed to copy context table for bus %d\n",
3071 				iommu->name, bus);
3072 			continue;
3073 		}
3074 	}
3075 
3076 	spin_lock_irqsave(&iommu->lock, flags);
3077 
3078 	/* Context tables are copied, now write them to the root_entry table */
3079 	for (bus = 0; bus < 256; bus++) {
3080 		int idx = ext ? bus * 2 : bus;
3081 		u64 val;
3082 
3083 		if (ctxt_tbls[idx]) {
3084 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3085 			iommu->root_entry[bus].lo = val;
3086 		}
3087 
3088 		if (!ext || !ctxt_tbls[idx + 1])
3089 			continue;
3090 
3091 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3092 		iommu->root_entry[bus].hi = val;
3093 	}
3094 
3095 	spin_unlock_irqrestore(&iommu->lock, flags);
3096 
3097 	kfree(ctxt_tbls);
3098 
3099 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3100 
3101 	ret = 0;
3102 
3103 out_unmap:
3104 	memunmap(old_rt);
3105 
3106 	return ret;
3107 }
3108 
3109 #ifdef CONFIG_INTEL_IOMMU_SVM
3110 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3111 {
3112 	struct intel_iommu *iommu = data;
3113 	ioasid_t ioasid;
3114 
3115 	if (!iommu)
3116 		return INVALID_IOASID;
3117 	/*
3118 	 * VT-d virtual command interface always uses the full 20 bit
3119 	 * PASID range. Host can partition guest PASID range based on
3120 	 * policies but it is out of guest's control.
3121 	 */
3122 	if (min < PASID_MIN || max > intel_pasid_max_id)
3123 		return INVALID_IOASID;
3124 
3125 	if (vcmd_alloc_pasid(iommu, &ioasid))
3126 		return INVALID_IOASID;
3127 
3128 	return ioasid;
3129 }
3130 
3131 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3132 {
3133 	struct intel_iommu *iommu = data;
3134 
3135 	if (!iommu)
3136 		return;
3137 	/*
3138 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3139 	 * We can only free the PASID when all the devices are unbound.
3140 	 */
3141 	if (ioasid_find(NULL, ioasid, NULL)) {
3142 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3143 		return;
3144 	}
3145 	vcmd_free_pasid(iommu, ioasid);
3146 }
3147 
3148 static void register_pasid_allocator(struct intel_iommu *iommu)
3149 {
3150 	/*
3151 	 * If we are running in the host, no need for custom allocator
3152 	 * in that PASIDs are allocated from the host system-wide.
3153 	 */
3154 	if (!cap_caching_mode(iommu->cap))
3155 		return;
3156 
3157 	if (!sm_supported(iommu)) {
3158 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3159 		return;
3160 	}
3161 
3162 	/*
3163 	 * Register a custom PASID allocator if we are running in a guest,
3164 	 * guest PASID must be obtained via virtual command interface.
3165 	 * There can be multiple vIOMMUs in each guest but only one allocator
3166 	 * is active. All vIOMMU allocators will eventually be calling the same
3167 	 * host allocator.
3168 	 */
3169 	if (!vccap_pasid(iommu->vccap))
3170 		return;
3171 
3172 	pr_info("Register custom PASID allocator\n");
3173 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3174 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3175 	iommu->pasid_allocator.pdata = (void *)iommu;
3176 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3177 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3178 		/*
3179 		 * Disable scalable mode on this IOMMU if there
3180 		 * is no custom allocator. Mixing SM capable vIOMMU
3181 		 * and non-SM vIOMMU are not supported.
3182 		 */
3183 		intel_iommu_sm = 0;
3184 	}
3185 }
3186 #endif
3187 
3188 static int __init init_dmars(void)
3189 {
3190 	struct dmar_drhd_unit *drhd;
3191 	struct intel_iommu *iommu;
3192 	int ret;
3193 
3194 	/*
3195 	 * for each drhd
3196 	 *    allocate root
3197 	 *    initialize and program root entry to not present
3198 	 * endfor
3199 	 */
3200 	for_each_drhd_unit(drhd) {
3201 		/*
3202 		 * lock not needed as this is only incremented in the single
3203 		 * threaded kernel __init code path all other access are read
3204 		 * only
3205 		 */
3206 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3207 			g_num_of_iommus++;
3208 			continue;
3209 		}
3210 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3211 	}
3212 
3213 	/* Preallocate enough resources for IOMMU hot-addition */
3214 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3215 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3216 
3217 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3218 			GFP_KERNEL);
3219 	if (!g_iommus) {
3220 		pr_err("Allocating global iommu array failed\n");
3221 		ret = -ENOMEM;
3222 		goto error;
3223 	}
3224 
3225 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3226 	if (ret)
3227 		goto free_iommu;
3228 
3229 	for_each_iommu(iommu, drhd) {
3230 		if (drhd->ignored) {
3231 			iommu_disable_translation(iommu);
3232 			continue;
3233 		}
3234 
3235 		/*
3236 		 * Find the max pasid size of all IOMMU's in the system.
3237 		 * We need to ensure the system pasid table is no bigger
3238 		 * than the smallest supported.
3239 		 */
3240 		if (pasid_supported(iommu)) {
3241 			u32 temp = 2 << ecap_pss(iommu->ecap);
3242 
3243 			intel_pasid_max_id = min_t(u32, temp,
3244 						   intel_pasid_max_id);
3245 		}
3246 
3247 		g_iommus[iommu->seq_id] = iommu;
3248 
3249 		intel_iommu_init_qi(iommu);
3250 
3251 		ret = iommu_init_domains(iommu);
3252 		if (ret)
3253 			goto free_iommu;
3254 
3255 		init_translation_status(iommu);
3256 
3257 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3258 			iommu_disable_translation(iommu);
3259 			clear_translation_pre_enabled(iommu);
3260 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3261 				iommu->name);
3262 		}
3263 
3264 		/*
3265 		 * TBD:
3266 		 * we could share the same root & context tables
3267 		 * among all IOMMU's. Need to Split it later.
3268 		 */
3269 		ret = iommu_alloc_root_entry(iommu);
3270 		if (ret)
3271 			goto free_iommu;
3272 
3273 		if (translation_pre_enabled(iommu)) {
3274 			pr_info("Translation already enabled - trying to copy translation structures\n");
3275 
3276 			ret = copy_translation_tables(iommu);
3277 			if (ret) {
3278 				/*
3279 				 * We found the IOMMU with translation
3280 				 * enabled - but failed to copy over the
3281 				 * old root-entry table. Try to proceed
3282 				 * by disabling translation now and
3283 				 * allocating a clean root-entry table.
3284 				 * This might cause DMAR faults, but
3285 				 * probably the dump will still succeed.
3286 				 */
3287 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3288 				       iommu->name);
3289 				iommu_disable_translation(iommu);
3290 				clear_translation_pre_enabled(iommu);
3291 			} else {
3292 				pr_info("Copied translation tables from previous kernel for %s\n",
3293 					iommu->name);
3294 			}
3295 		}
3296 
3297 		if (!ecap_pass_through(iommu->ecap))
3298 			hw_pass_through = 0;
3299 		intel_svm_check(iommu);
3300 	}
3301 
3302 	/*
3303 	 * Now that qi is enabled on all iommus, set the root entry and flush
3304 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3305 	 * flush_context function will loop forever and the boot hangs.
3306 	 */
3307 	for_each_active_iommu(iommu, drhd) {
3308 		iommu_flush_write_buffer(iommu);
3309 #ifdef CONFIG_INTEL_IOMMU_SVM
3310 		register_pasid_allocator(iommu);
3311 #endif
3312 		iommu_set_root_entry(iommu);
3313 	}
3314 
3315 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3316 	dmar_map_gfx = 0;
3317 #endif
3318 
3319 	if (!dmar_map_gfx)
3320 		iommu_identity_mapping |= IDENTMAP_GFX;
3321 
3322 	check_tylersburg_isoch();
3323 
3324 	ret = si_domain_init(hw_pass_through);
3325 	if (ret)
3326 		goto free_iommu;
3327 
3328 	/*
3329 	 * for each drhd
3330 	 *   enable fault log
3331 	 *   global invalidate context cache
3332 	 *   global invalidate iotlb
3333 	 *   enable translation
3334 	 */
3335 	for_each_iommu(iommu, drhd) {
3336 		if (drhd->ignored) {
3337 			/*
3338 			 * we always have to disable PMRs or DMA may fail on
3339 			 * this device
3340 			 */
3341 			if (force_on)
3342 				iommu_disable_protect_mem_regions(iommu);
3343 			continue;
3344 		}
3345 
3346 		iommu_flush_write_buffer(iommu);
3347 
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3350 			/*
3351 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3352 			 * could cause possible lock race condition.
3353 			 */
3354 			up_write(&dmar_global_lock);
3355 			ret = intel_svm_enable_prq(iommu);
3356 			down_write(&dmar_global_lock);
3357 			if (ret)
3358 				goto free_iommu;
3359 		}
3360 #endif
3361 		ret = dmar_set_interrupt(iommu);
3362 		if (ret)
3363 			goto free_iommu;
3364 	}
3365 
3366 	return 0;
3367 
3368 free_iommu:
3369 	for_each_active_iommu(iommu, drhd) {
3370 		disable_dmar_iommu(iommu);
3371 		free_dmar_iommu(iommu);
3372 	}
3373 
3374 	kfree(g_iommus);
3375 
3376 error:
3377 	return ret;
3378 }
3379 
3380 static inline int iommu_domain_cache_init(void)
3381 {
3382 	int ret = 0;
3383 
3384 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3385 					 sizeof(struct dmar_domain),
3386 					 0,
3387 					 SLAB_HWCACHE_ALIGN,
3388 
3389 					 NULL);
3390 	if (!iommu_domain_cache) {
3391 		pr_err("Couldn't create iommu_domain cache\n");
3392 		ret = -ENOMEM;
3393 	}
3394 
3395 	return ret;
3396 }
3397 
3398 static inline int iommu_devinfo_cache_init(void)
3399 {
3400 	int ret = 0;
3401 
3402 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3403 					 sizeof(struct device_domain_info),
3404 					 0,
3405 					 SLAB_HWCACHE_ALIGN,
3406 					 NULL);
3407 	if (!iommu_devinfo_cache) {
3408 		pr_err("Couldn't create devinfo cache\n");
3409 		ret = -ENOMEM;
3410 	}
3411 
3412 	return ret;
3413 }
3414 
3415 static int __init iommu_init_mempool(void)
3416 {
3417 	int ret;
3418 	ret = iova_cache_get();
3419 	if (ret)
3420 		return ret;
3421 
3422 	ret = iommu_domain_cache_init();
3423 	if (ret)
3424 		goto domain_error;
3425 
3426 	ret = iommu_devinfo_cache_init();
3427 	if (!ret)
3428 		return ret;
3429 
3430 	kmem_cache_destroy(iommu_domain_cache);
3431 domain_error:
3432 	iova_cache_put();
3433 
3434 	return -ENOMEM;
3435 }
3436 
3437 static void __init iommu_exit_mempool(void)
3438 {
3439 	kmem_cache_destroy(iommu_devinfo_cache);
3440 	kmem_cache_destroy(iommu_domain_cache);
3441 	iova_cache_put();
3442 }
3443 
3444 static void __init init_no_remapping_devices(void)
3445 {
3446 	struct dmar_drhd_unit *drhd;
3447 	struct device *dev;
3448 	int i;
3449 
3450 	for_each_drhd_unit(drhd) {
3451 		if (!drhd->include_all) {
3452 			for_each_active_dev_scope(drhd->devices,
3453 						  drhd->devices_cnt, i, dev)
3454 				break;
3455 			/* ignore DMAR unit if no devices exist */
3456 			if (i == drhd->devices_cnt)
3457 				drhd->ignored = 1;
3458 		}
3459 	}
3460 
3461 	for_each_active_drhd_unit(drhd) {
3462 		if (drhd->include_all)
3463 			continue;
3464 
3465 		for_each_active_dev_scope(drhd->devices,
3466 					  drhd->devices_cnt, i, dev)
3467 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3468 				break;
3469 		if (i < drhd->devices_cnt)
3470 			continue;
3471 
3472 		/* This IOMMU has *only* gfx devices. Either bypass it or
3473 		   set the gfx_mapped flag, as appropriate */
3474 		drhd->gfx_dedicated = 1;
3475 		if (!dmar_map_gfx)
3476 			drhd->ignored = 1;
3477 	}
3478 }
3479 
3480 #ifdef CONFIG_SUSPEND
3481 static int init_iommu_hw(void)
3482 {
3483 	struct dmar_drhd_unit *drhd;
3484 	struct intel_iommu *iommu = NULL;
3485 
3486 	for_each_active_iommu(iommu, drhd)
3487 		if (iommu->qi)
3488 			dmar_reenable_qi(iommu);
3489 
3490 	for_each_iommu(iommu, drhd) {
3491 		if (drhd->ignored) {
3492 			/*
3493 			 * we always have to disable PMRs or DMA may fail on
3494 			 * this device
3495 			 */
3496 			if (force_on)
3497 				iommu_disable_protect_mem_regions(iommu);
3498 			continue;
3499 		}
3500 
3501 		iommu_flush_write_buffer(iommu);
3502 		iommu_set_root_entry(iommu);
3503 		iommu_enable_translation(iommu);
3504 		iommu_disable_protect_mem_regions(iommu);
3505 	}
3506 
3507 	return 0;
3508 }
3509 
3510 static void iommu_flush_all(void)
3511 {
3512 	struct dmar_drhd_unit *drhd;
3513 	struct intel_iommu *iommu;
3514 
3515 	for_each_active_iommu(iommu, drhd) {
3516 		iommu->flush.flush_context(iommu, 0, 0, 0,
3517 					   DMA_CCMD_GLOBAL_INVL);
3518 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3519 					 DMA_TLB_GLOBAL_FLUSH);
3520 	}
3521 }
3522 
3523 static int iommu_suspend(void)
3524 {
3525 	struct dmar_drhd_unit *drhd;
3526 	struct intel_iommu *iommu = NULL;
3527 	unsigned long flag;
3528 
3529 	for_each_active_iommu(iommu, drhd) {
3530 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3531 					     GFP_KERNEL);
3532 		if (!iommu->iommu_state)
3533 			goto nomem;
3534 	}
3535 
3536 	iommu_flush_all();
3537 
3538 	for_each_active_iommu(iommu, drhd) {
3539 		iommu_disable_translation(iommu);
3540 
3541 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3542 
3543 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3544 			readl(iommu->reg + DMAR_FECTL_REG);
3545 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3546 			readl(iommu->reg + DMAR_FEDATA_REG);
3547 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3548 			readl(iommu->reg + DMAR_FEADDR_REG);
3549 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3550 			readl(iommu->reg + DMAR_FEUADDR_REG);
3551 
3552 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3553 	}
3554 	return 0;
3555 
3556 nomem:
3557 	for_each_active_iommu(iommu, drhd)
3558 		kfree(iommu->iommu_state);
3559 
3560 	return -ENOMEM;
3561 }
3562 
3563 static void iommu_resume(void)
3564 {
3565 	struct dmar_drhd_unit *drhd;
3566 	struct intel_iommu *iommu = NULL;
3567 	unsigned long flag;
3568 
3569 	if (init_iommu_hw()) {
3570 		if (force_on)
3571 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3572 		else
3573 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3574 		return;
3575 	}
3576 
3577 	for_each_active_iommu(iommu, drhd) {
3578 
3579 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3580 
3581 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3582 			iommu->reg + DMAR_FECTL_REG);
3583 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3584 			iommu->reg + DMAR_FEDATA_REG);
3585 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3586 			iommu->reg + DMAR_FEADDR_REG);
3587 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3588 			iommu->reg + DMAR_FEUADDR_REG);
3589 
3590 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3591 	}
3592 
3593 	for_each_active_iommu(iommu, drhd)
3594 		kfree(iommu->iommu_state);
3595 }
3596 
3597 static struct syscore_ops iommu_syscore_ops = {
3598 	.resume		= iommu_resume,
3599 	.suspend	= iommu_suspend,
3600 };
3601 
3602 static void __init init_iommu_pm_ops(void)
3603 {
3604 	register_syscore_ops(&iommu_syscore_ops);
3605 }
3606 
3607 #else
/* Suspend support is compiled out: there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif	/* CONFIG_SUSPEND */
3610 
3611 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3612 {
3613 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3614 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3615 	    rmrr->end_address <= rmrr->base_address ||
3616 	    arch_rmrr_sanity_check(rmrr))
3617 		return -EINVAL;
3618 
3619 	return 0;
3620 }
3621 
3622 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3623 {
3624 	struct acpi_dmar_reserved_memory *rmrr;
3625 	struct dmar_rmrr_unit *rmrru;
3626 
3627 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3628 	if (rmrr_sanity_check(rmrr)) {
3629 		pr_warn(FW_BUG
3630 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3631 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3632 			   rmrr->base_address, rmrr->end_address,
3633 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3634 			   dmi_get_system_info(DMI_BIOS_VERSION),
3635 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3636 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3637 	}
3638 
3639 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3640 	if (!rmrru)
3641 		goto out;
3642 
3643 	rmrru->hdr = header;
3644 
3645 	rmrru->base_address = rmrr->base_address;
3646 	rmrru->end_address = rmrr->end_address;
3647 
3648 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3649 				((void *)rmrr) + rmrr->header.length,
3650 				&rmrru->devices_cnt);
3651 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3652 		goto free_rmrru;
3653 
3654 	list_add(&rmrru->list, &dmar_rmrr_units);
3655 
3656 	return 0;
3657 free_rmrru:
3658 	kfree(rmrru);
3659 out:
3660 	return -ENOMEM;
3661 }
3662 
3663 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3664 {
3665 	struct dmar_atsr_unit *atsru;
3666 	struct acpi_dmar_atsr *tmp;
3667 
3668 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3669 				dmar_rcu_check()) {
3670 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3671 		if (atsr->segment != tmp->segment)
3672 			continue;
3673 		if (atsr->header.length != tmp->header.length)
3674 			continue;
3675 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3676 			return atsru;
3677 	}
3678 
3679 	return NULL;
3680 }
3681 
3682 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3683 {
3684 	struct acpi_dmar_atsr *atsr;
3685 	struct dmar_atsr_unit *atsru;
3686 
3687 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3688 		return 0;
3689 
3690 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3691 	atsru = dmar_find_atsr(atsr);
3692 	if (atsru)
3693 		return 0;
3694 
3695 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3696 	if (!atsru)
3697 		return -ENOMEM;
3698 
3699 	/*
3700 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3701 	 * copy the memory content because the memory buffer will be freed
3702 	 * on return.
3703 	 */
3704 	atsru->hdr = (void *)(atsru + 1);
3705 	memcpy(atsru->hdr, hdr, hdr->length);
3706 	atsru->include_all = atsr->flags & 0x1;
3707 	if (!atsru->include_all) {
3708 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3709 				(void *)atsr + atsr->header.length,
3710 				&atsru->devices_cnt);
3711 		if (atsru->devices_cnt && atsru->devices == NULL) {
3712 			kfree(atsru);
3713 			return -ENOMEM;
3714 		}
3715 	}
3716 
3717 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3718 
3719 	return 0;
3720 }
3721 
3722 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3723 {
3724 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3725 	kfree(atsru);
3726 }
3727 
3728 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3729 {
3730 	struct acpi_dmar_atsr *atsr;
3731 	struct dmar_atsr_unit *atsru;
3732 
3733 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3734 	atsru = dmar_find_atsr(atsr);
3735 	if (atsru) {
3736 		list_del_rcu(&atsru->list);
3737 		synchronize_rcu();
3738 		intel_iommu_free_atsr(atsru);
3739 	}
3740 
3741 	return 0;
3742 }
3743 
3744 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3745 {
3746 	int i;
3747 	struct device *dev;
3748 	struct acpi_dmar_atsr *atsr;
3749 	struct dmar_atsr_unit *atsru;
3750 
3751 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3752 	atsru = dmar_find_atsr(atsr);
3753 	if (!atsru)
3754 		return 0;
3755 
3756 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3757 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3758 					  i, dev)
3759 			return -EBUSY;
3760 	}
3761 
3762 	return 0;
3763 }
3764 
3765 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3766 {
3767 	struct dmar_satc_unit *satcu;
3768 	struct acpi_dmar_satc *tmp;
3769 
3770 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3771 				dmar_rcu_check()) {
3772 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3773 		if (satc->segment != tmp->segment)
3774 			continue;
3775 		if (satc->header.length != tmp->header.length)
3776 			continue;
3777 		if (memcmp(satc, tmp, satc->header.length) == 0)
3778 			return satcu;
3779 	}
3780 
3781 	return NULL;
3782 }
3783 
3784 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3785 {
3786 	struct acpi_dmar_satc *satc;
3787 	struct dmar_satc_unit *satcu;
3788 
3789 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3790 		return 0;
3791 
3792 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3793 	satcu = dmar_find_satc(satc);
3794 	if (satcu)
3795 		return 0;
3796 
3797 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3798 	if (!satcu)
3799 		return -ENOMEM;
3800 
3801 	satcu->hdr = (void *)(satcu + 1);
3802 	memcpy(satcu->hdr, hdr, hdr->length);
3803 	satcu->atc_required = satc->flags & 0x1;
3804 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3805 					      (void *)satc + satc->header.length,
3806 					      &satcu->devices_cnt);
3807 	if (satcu->devices_cnt && !satcu->devices) {
3808 		kfree(satcu);
3809 		return -ENOMEM;
3810 	}
3811 	list_add_rcu(&satcu->list, &dmar_satc_units);
3812 
3813 	return 0;
3814 }
3815 
3816 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3817 {
3818 	int sp, ret;
3819 	struct intel_iommu *iommu = dmaru->iommu;
3820 
3821 	if (g_iommus[iommu->seq_id])
3822 		return 0;
3823 
3824 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3825 	if (ret)
3826 		goto out;
3827 
3828 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3829 		pr_warn("%s: Doesn't support hardware pass through.\n",
3830 			iommu->name);
3831 		return -ENXIO;
3832 	}
3833 	if (!ecap_sc_support(iommu->ecap) &&
3834 	    domain_update_iommu_snooping(iommu)) {
3835 		pr_warn("%s: Doesn't support snooping.\n",
3836 			iommu->name);
3837 		return -ENXIO;
3838 	}
3839 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3840 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3841 		pr_warn("%s: Doesn't support large page.\n",
3842 			iommu->name);
3843 		return -ENXIO;
3844 	}
3845 
3846 	/*
3847 	 * Disable translation if already enabled prior to OS handover.
3848 	 */
3849 	if (iommu->gcmd & DMA_GCMD_TE)
3850 		iommu_disable_translation(iommu);
3851 
3852 	g_iommus[iommu->seq_id] = iommu;
3853 	ret = iommu_init_domains(iommu);
3854 	if (ret == 0)
3855 		ret = iommu_alloc_root_entry(iommu);
3856 	if (ret)
3857 		goto out;
3858 
3859 	intel_svm_check(iommu);
3860 
3861 	if (dmaru->ignored) {
3862 		/*
3863 		 * we always have to disable PMRs or DMA may fail on this device
3864 		 */
3865 		if (force_on)
3866 			iommu_disable_protect_mem_regions(iommu);
3867 		return 0;
3868 	}
3869 
3870 	intel_iommu_init_qi(iommu);
3871 	iommu_flush_write_buffer(iommu);
3872 
3873 #ifdef CONFIG_INTEL_IOMMU_SVM
3874 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3875 		ret = intel_svm_enable_prq(iommu);
3876 		if (ret)
3877 			goto disable_iommu;
3878 	}
3879 #endif
3880 	ret = dmar_set_interrupt(iommu);
3881 	if (ret)
3882 		goto disable_iommu;
3883 
3884 	iommu_set_root_entry(iommu);
3885 	iommu_enable_translation(iommu);
3886 
3887 	iommu_disable_protect_mem_regions(iommu);
3888 	return 0;
3889 
3890 disable_iommu:
3891 	disable_dmar_iommu(iommu);
3892 out:
3893 	free_dmar_iommu(iommu);
3894 	return ret;
3895 }
3896 
3897 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3898 {
3899 	int ret = 0;
3900 	struct intel_iommu *iommu = dmaru->iommu;
3901 
3902 	if (!intel_iommu_enabled)
3903 		return 0;
3904 	if (iommu == NULL)
3905 		return -EINVAL;
3906 
3907 	if (insert) {
3908 		ret = intel_iommu_add(dmaru);
3909 	} else {
3910 		disable_dmar_iommu(iommu);
3911 		free_dmar_iommu(iommu);
3912 	}
3913 
3914 	return ret;
3915 }
3916 
3917 static void intel_iommu_free_dmars(void)
3918 {
3919 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3920 	struct dmar_atsr_unit *atsru, *atsr_n;
3921 	struct dmar_satc_unit *satcu, *satc_n;
3922 
3923 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3924 		list_del(&rmrru->list);
3925 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3926 		kfree(rmrru);
3927 	}
3928 
3929 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3930 		list_del(&atsru->list);
3931 		intel_iommu_free_atsr(atsru);
3932 	}
3933 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3934 		list_del(&satcu->list);
3935 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3936 		kfree(satcu);
3937 	}
3938 }
3939 
3940 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3941 {
3942 	int i, ret = 1;
3943 	struct pci_bus *bus;
3944 	struct pci_dev *bridge = NULL;
3945 	struct device *tmp;
3946 	struct acpi_dmar_atsr *atsr;
3947 	struct dmar_atsr_unit *atsru;
3948 
3949 	dev = pci_physfn(dev);
3950 	for (bus = dev->bus; bus; bus = bus->parent) {
3951 		bridge = bus->self;
3952 		/* If it's an integrated device, allow ATS */
3953 		if (!bridge)
3954 			return 1;
3955 		/* Connected via non-PCIe: no ATS */
3956 		if (!pci_is_pcie(bridge) ||
3957 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3958 			return 0;
3959 		/* If we found the root port, look it up in the ATSR */
3960 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3961 			break;
3962 	}
3963 
3964 	rcu_read_lock();
3965 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3966 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3967 		if (atsr->segment != pci_domain_nr(dev->bus))
3968 			continue;
3969 
3970 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3971 			if (tmp == &bridge->dev)
3972 				goto out;
3973 
3974 		if (atsru->include_all)
3975 			goto out;
3976 	}
3977 	ret = 0;
3978 out:
3979 	rcu_read_unlock();
3980 
3981 	return ret;
3982 }
3983 
3984 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3985 {
3986 	int ret;
3987 	struct dmar_rmrr_unit *rmrru;
3988 	struct dmar_atsr_unit *atsru;
3989 	struct dmar_satc_unit *satcu;
3990 	struct acpi_dmar_atsr *atsr;
3991 	struct acpi_dmar_reserved_memory *rmrr;
3992 	struct acpi_dmar_satc *satc;
3993 
3994 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3995 		return 0;
3996 
3997 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3998 		rmrr = container_of(rmrru->hdr,
3999 				    struct acpi_dmar_reserved_memory, header);
4000 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4001 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4002 				((void *)rmrr) + rmrr->header.length,
4003 				rmrr->segment, rmrru->devices,
4004 				rmrru->devices_cnt);
4005 			if (ret < 0)
4006 				return ret;
4007 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4008 			dmar_remove_dev_scope(info, rmrr->segment,
4009 				rmrru->devices, rmrru->devices_cnt);
4010 		}
4011 	}
4012 
4013 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4014 		if (atsru->include_all)
4015 			continue;
4016 
4017 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4018 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4019 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4020 					(void *)atsr + atsr->header.length,
4021 					atsr->segment, atsru->devices,
4022 					atsru->devices_cnt);
4023 			if (ret > 0)
4024 				break;
4025 			else if (ret < 0)
4026 				return ret;
4027 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4028 			if (dmar_remove_dev_scope(info, atsr->segment,
4029 					atsru->devices, atsru->devices_cnt))
4030 				break;
4031 		}
4032 	}
4033 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4034 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4035 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4036 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4037 					(void *)satc + satc->header.length,
4038 					satc->segment, satcu->devices,
4039 					satcu->devices_cnt);
4040 			if (ret > 0)
4041 				break;
4042 			else if (ret < 0)
4043 				return ret;
4044 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4045 			if (dmar_remove_dev_scope(info, satc->segment,
4046 					satcu->devices, satcu->devices_cnt))
4047 				break;
4048 		}
4049 	}
4050 
4051 	return 0;
4052 }
4053 
4054 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4055 				       unsigned long val, void *v)
4056 {
4057 	struct memory_notify *mhp = v;
4058 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4059 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4060 			mhp->nr_pages - 1);
4061 
4062 	switch (val) {
4063 	case MEM_GOING_ONLINE:
4064 		if (iommu_domain_identity_map(si_domain,
4065 					      start_vpfn, last_vpfn)) {
4066 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4067 				start_vpfn, last_vpfn);
4068 			return NOTIFY_BAD;
4069 		}
4070 		break;
4071 
4072 	case MEM_OFFLINE:
4073 	case MEM_CANCEL_ONLINE:
4074 		{
4075 			struct dmar_drhd_unit *drhd;
4076 			struct intel_iommu *iommu;
4077 			struct page *freelist;
4078 
4079 			freelist = domain_unmap(si_domain,
4080 						start_vpfn, last_vpfn,
4081 						NULL);
4082 
4083 			rcu_read_lock();
4084 			for_each_active_iommu(iommu, drhd)
4085 				iommu_flush_iotlb_psi(iommu, si_domain,
4086 					start_vpfn, mhp->nr_pages,
4087 					!freelist, 0);
4088 			rcu_read_unlock();
4089 			dma_free_pagelist(freelist);
4090 		}
4091 		break;
4092 	}
4093 
4094 	return NOTIFY_OK;
4095 }
4096 
4097 static struct notifier_block intel_iommu_memory_nb = {
4098 	.notifier_call = intel_iommu_memory_notifier,
4099 	.priority = 0
4100 };
4101 
4102 static void intel_disable_iommus(void)
4103 {
4104 	struct intel_iommu *iommu = NULL;
4105 	struct dmar_drhd_unit *drhd;
4106 
4107 	for_each_iommu(iommu, drhd)
4108 		iommu_disable_translation(iommu);
4109 }
4110 
4111 void intel_iommu_shutdown(void)
4112 {
4113 	struct dmar_drhd_unit *drhd;
4114 	struct intel_iommu *iommu = NULL;
4115 
4116 	if (no_iommu || dmar_disabled)
4117 		return;
4118 
4119 	down_write(&dmar_global_lock);
4120 
4121 	/* Disable PMRs explicitly here. */
4122 	for_each_iommu(iommu, drhd)
4123 		iommu_disable_protect_mem_regions(iommu);
4124 
4125 	/* Make sure the IOMMUs are switched off */
4126 	intel_disable_iommus();
4127 
4128 	up_write(&dmar_global_lock);
4129 }
4130 
4131 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4132 {
4133 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4134 
4135 	return container_of(iommu_dev, struct intel_iommu, iommu);
4136 }
4137 
4138 static ssize_t intel_iommu_show_version(struct device *dev,
4139 					struct device_attribute *attr,
4140 					char *buf)
4141 {
4142 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4143 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4144 	return sprintf(buf, "%d:%d\n",
4145 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4146 }
4147 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4148 
4149 static ssize_t intel_iommu_show_address(struct device *dev,
4150 					struct device_attribute *attr,
4151 					char *buf)
4152 {
4153 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4154 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4155 }
4156 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4157 
4158 static ssize_t intel_iommu_show_cap(struct device *dev,
4159 				    struct device_attribute *attr,
4160 				    char *buf)
4161 {
4162 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4163 	return sprintf(buf, "%llx\n", iommu->cap);
4164 }
4165 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4166 
4167 static ssize_t intel_iommu_show_ecap(struct device *dev,
4168 				    struct device_attribute *attr,
4169 				    char *buf)
4170 {
4171 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4172 	return sprintf(buf, "%llx\n", iommu->ecap);
4173 }
4174 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4175 
4176 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4177 				      struct device_attribute *attr,
4178 				      char *buf)
4179 {
4180 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4181 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4182 }
4183 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4184 
4185 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4186 					   struct device_attribute *attr,
4187 					   char *buf)
4188 {
4189 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4190 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4191 						  cap_ndoms(iommu->cap)));
4192 }
4193 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4194 
4195 static struct attribute *intel_iommu_attrs[] = {
4196 	&dev_attr_version.attr,
4197 	&dev_attr_address.attr,
4198 	&dev_attr_cap.attr,
4199 	&dev_attr_ecap.attr,
4200 	&dev_attr_domains_supported.attr,
4201 	&dev_attr_domains_used.attr,
4202 	NULL,
4203 };
4204 
4205 static struct attribute_group intel_iommu_group = {
4206 	.name = "intel-iommu",
4207 	.attrs = intel_iommu_attrs,
4208 };
4209 
4210 const struct attribute_group *intel_iommu_groups[] = {
4211 	&intel_iommu_group,
4212 	NULL,
4213 };
4214 
4215 static inline bool has_external_pci(void)
4216 {
4217 	struct pci_dev *pdev = NULL;
4218 
4219 	for_each_pci_dev(pdev)
4220 		if (pdev->external_facing)
4221 			return true;
4222 
4223 	return false;
4224 }
4225 
4226 static int __init platform_optin_force_iommu(void)
4227 {
4228 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4229 		return 0;
4230 
4231 	if (no_iommu || dmar_disabled)
4232 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4233 
4234 	/*
4235 	 * If Intel-IOMMU is disabled by default, we will apply identity
4236 	 * map for all devices except those marked as being untrusted.
4237 	 */
4238 	if (dmar_disabled)
4239 		iommu_set_default_passthrough(false);
4240 
4241 	dmar_disabled = 0;
4242 	no_iommu = 0;
4243 
4244 	return 1;
4245 }
4246 
/*
 * Probe the physical nodes behind the ACPI namespace devices that appear
 * in the device scope of each active IOMMU.
 *
 * Every physical node that does not yet belong to an IOMMU group gets
 * intel_iommu_ops installed on its bus and is passed to
 * iommu_probe_device().
 *
 * Returns 0 on success, or the first non-zero value returned by
 * iommu_probe_device().
 */
static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct iommu_group *group;
			struct acpi_device *adev;

			/* Only scope entries that live on the ACPI bus matter here. */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			/* The physical node list is walked under its mutex. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				group = iommu_group_get(pn->dev);
				if (group) {
					/* Already grouped: drop the reference and move on. */
					iommu_group_put(group);
					continue;
				}

				pn->dev->bus->iommu_ops = &intel_iommu_ops;
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			/* Propagate the failure only after the mutex is released. */
			if (ret)
				return ret;
		}
	}

	return 0;
}
4289 
4290 int __init intel_iommu_init(void)
4291 {
4292 	int ret = -ENODEV;
4293 	struct dmar_drhd_unit *drhd;
4294 	struct intel_iommu *iommu;
4295 
4296 	/*
4297 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4298 	 * opt in, so enforce that.
4299 	 */
4300 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4301 		    platform_optin_force_iommu();
4302 
4303 	if (iommu_init_mempool()) {
4304 		if (force_on)
4305 			panic("tboot: Failed to initialize iommu memory\n");
4306 		return -ENOMEM;
4307 	}
4308 
4309 	down_write(&dmar_global_lock);
4310 	if (dmar_table_init()) {
4311 		if (force_on)
4312 			panic("tboot: Failed to initialize DMAR table\n");
4313 		goto out_free_dmar;
4314 	}
4315 
4316 	if (dmar_dev_scope_init() < 0) {
4317 		if (force_on)
4318 			panic("tboot: Failed to initialize DMAR device scope\n");
4319 		goto out_free_dmar;
4320 	}
4321 
4322 	up_write(&dmar_global_lock);
4323 
4324 	/*
4325 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4326 	 * complain later when we register it under the lock.
4327 	 */
4328 	dmar_register_bus_notifier();
4329 
4330 	down_write(&dmar_global_lock);
4331 
4332 	if (!no_iommu)
4333 		intel_iommu_debugfs_init();
4334 
4335 	if (no_iommu || dmar_disabled) {
4336 		/*
4337 		 * We exit the function here to ensure IOMMU's remapping and
4338 		 * mempool aren't setup, which means that the IOMMU's PMRs
4339 		 * won't be disabled via the call to init_dmars(). So disable
4340 		 * it explicitly here. The PMRs were setup by tboot prior to
4341 		 * calling SENTER, but the kernel is expected to reset/tear
4342 		 * down the PMRs.
4343 		 */
4344 		if (intel_iommu_tboot_noforce) {
4345 			for_each_iommu(iommu, drhd)
4346 				iommu_disable_protect_mem_regions(iommu);
4347 		}
4348 
4349 		/*
4350 		 * Make sure the IOMMUs are switched off, even when we
4351 		 * boot into a kexec kernel and the previous kernel left
4352 		 * them enabled
4353 		 */
4354 		intel_disable_iommus();
4355 		goto out_free_dmar;
4356 	}
4357 
4358 	if (list_empty(&dmar_rmrr_units))
4359 		pr_info("No RMRR found\n");
4360 
4361 	if (list_empty(&dmar_atsr_units))
4362 		pr_info("No ATSR found\n");
4363 
4364 	if (list_empty(&dmar_satc_units))
4365 		pr_info("No SATC found\n");
4366 
4367 	if (dmar_map_gfx)
4368 		intel_iommu_gfx_mapped = 1;
4369 
4370 	init_no_remapping_devices();
4371 
4372 	ret = init_dmars();
4373 	if (ret) {
4374 		if (force_on)
4375 			panic("tboot: Failed to initialize DMARs\n");
4376 		pr_err("Initialization failed\n");
4377 		goto out_free_dmar;
4378 	}
4379 	up_write(&dmar_global_lock);
4380 
4381 	init_iommu_pm_ops();
4382 
4383 	down_read(&dmar_global_lock);
4384 	for_each_active_iommu(iommu, drhd) {
4385 		/*
4386 		 * The flush queue implementation does not perform
4387 		 * page-selective invalidations that are required for efficient
4388 		 * TLB flushes in virtual environments.  The benefit of batching
4389 		 * is likely to be much lower than the overhead of synchronizing
4390 		 * the virtual and physical IOMMU page-tables.
4391 		 */
4392 		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
4393 			pr_warn("IOMMU batching is disabled due to virtualization");
4394 			intel_iommu_strict = 1;
4395 		}
4396 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4397 				       intel_iommu_groups,
4398 				       "%s", iommu->name);
4399 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4400 	}
4401 	up_read(&dmar_global_lock);
4402 
4403 	iommu_set_dma_strict(intel_iommu_strict);
4404 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4405 	if (si_domain && !hw_pass_through)
4406 		register_memory_notifier(&intel_iommu_memory_nb);
4407 
4408 	down_read(&dmar_global_lock);
4409 	if (probe_acpi_namespace_devices())
4410 		pr_warn("ACPI name space devices didn't probe correctly\n");
4411 
4412 	/* Finally, we enable the DMA remapping hardware. */
4413 	for_each_iommu(iommu, drhd) {
4414 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4415 			iommu_enable_translation(iommu);
4416 
4417 		iommu_disable_protect_mem_regions(iommu);
4418 	}
4419 	up_read(&dmar_global_lock);
4420 
4421 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4422 
4423 	intel_iommu_enabled = 1;
4424 
4425 	return 0;
4426 
4427 out_free_dmar:
4428 	intel_iommu_free_dmars();
4429 	up_write(&dmar_global_lock);
4430 	iommu_exit_mempool();
4431 	return ret;
4432 }
4433 
4434 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4435 {
4436 	struct intel_iommu *iommu = opaque;
4437 
4438 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4439 	return 0;
4440 }
4441 
4442 /*
4443  * NB - intel-iommu lacks any sort of reference counting for the users of
4444  * dependent devices.  If multiple endpoints have intersecting dependent
4445  * devices, unbinding the driver from any one of them will possibly leave
4446  * the others unable to operate.
4447  */
4448 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4449 {
4450 	if (!iommu || !dev || !dev_is_pci(dev))
4451 		return;
4452 
4453 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4454 }
4455 
/*
 * Tear down the translation state described by @info and free it.
 *
 * Must be called with device_domain_lock held (asserted below).  A NULL
 * @info triggers a warning and is otherwise ignored.
 */
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;
	domain = info->domain;

	if (info->dev) {
		/* In scalable mode, PCI devices also have a RID2PASID entry to remove. */
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID, false);

		iommu_disable_dev_iotlb(info);
		/* Context entries are left untouched for real-DMA sub-devices. */
		if (!dev_is_real_dma_subdevice(info->dev))
			domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	/* Detaching the domain from the IOMMU is done under iommu->lock. */
	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
4489 
4490 static void dmar_remove_one_dev_info(struct device *dev)
4491 {
4492 	struct device_domain_info *info;
4493 	unsigned long flags;
4494 
4495 	spin_lock_irqsave(&device_domain_lock, flags);
4496 	info = get_domain_info(dev);
4497 	if (info)
4498 		__dmar_remove_one_dev_info(info);
4499 	spin_unlock_irqrestore(&device_domain_lock, flags);
4500 }
4501 
4502 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4503 {
4504 	int adjust_width;
4505 
4506 	/* calculate AGAW */
4507 	domain->gaw = guest_width;
4508 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4509 	domain->agaw = width_to_agaw(adjust_width);
4510 
4511 	domain->iommu_coherency = 0;
4512 	domain->iommu_snooping = 0;
4513 	domain->iommu_superpage = 0;
4514 	domain->max_addr = 0;
4515 
4516 	/* always allocate the top pgd */
4517 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4518 	if (!domain->pgd)
4519 		return -ENOMEM;
4520 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4521 	return 0;
4522 }
4523 
4524 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4525 {
4526 	struct dmar_domain *dmar_domain;
4527 	struct iommu_domain *domain;
4528 
4529 	switch (type) {
4530 	case IOMMU_DOMAIN_DMA:
4531 	case IOMMU_DOMAIN_UNMANAGED:
4532 		dmar_domain = alloc_domain(0);
4533 		if (!dmar_domain) {
4534 			pr_err("Can't allocate dmar_domain\n");
4535 			return NULL;
4536 		}
4537 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4538 			pr_err("Domain initialization failed\n");
4539 			domain_exit(dmar_domain);
4540 			return NULL;
4541 		}
4542 
4543 		if (type == IOMMU_DOMAIN_DMA &&
4544 		    iommu_get_dma_cookie(&dmar_domain->domain))
4545 			return NULL;
4546 
4547 		domain = &dmar_domain->domain;
4548 		domain->geometry.aperture_start = 0;
4549 		domain->geometry.aperture_end   =
4550 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4551 		domain->geometry.force_aperture = true;
4552 
4553 		return domain;
4554 	case IOMMU_DOMAIN_IDENTITY:
4555 		return &si_domain->domain;
4556 	default:
4557 		return NULL;
4558 	}
4559 
4560 	return NULL;
4561 }
4562 
4563 static void intel_iommu_domain_free(struct iommu_domain *domain)
4564 {
4565 	if (domain != &si_domain->domain)
4566 		domain_exit(to_dmar_domain(domain));
4567 }
4568 
4569 /*
4570  * Check whether a @domain could be attached to the @dev through the
4571  * aux-domain attach/detach APIs.
4572  */
4573 static inline bool
4574 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4575 {
4576 	struct device_domain_info *info = get_domain_info(dev);
4577 
4578 	return info && info->auxd_enabled &&
4579 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4580 }
4581 
4582 static inline struct subdev_domain_info *
4583 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4584 {
4585 	struct subdev_domain_info *sinfo;
4586 
4587 	if (!list_empty(&domain->subdevices)) {
4588 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4589 			if (sinfo->pdev == dev)
4590 				return sinfo;
4591 		}
4592 	}
4593 
4594 	return NULL;
4595 }
4596 
4597 static int auxiliary_link_device(struct dmar_domain *domain,
4598 				 struct device *dev)
4599 {
4600 	struct device_domain_info *info = get_domain_info(dev);
4601 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4602 
4603 	assert_spin_locked(&device_domain_lock);
4604 	if (WARN_ON(!info))
4605 		return -EINVAL;
4606 
4607 	if (!sinfo) {
4608 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4609 		sinfo->domain = domain;
4610 		sinfo->pdev = dev;
4611 		list_add(&sinfo->link_phys, &info->subdevices);
4612 		list_add(&sinfo->link_domain, &domain->subdevices);
4613 	}
4614 
4615 	return ++sinfo->users;
4616 }
4617 
4618 static int auxiliary_unlink_device(struct dmar_domain *domain,
4619 				   struct device *dev)
4620 {
4621 	struct device_domain_info *info = get_domain_info(dev);
4622 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4623 	int ret;
4624 
4625 	assert_spin_locked(&device_domain_lock);
4626 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4627 		return -EINVAL;
4628 
4629 	ret = --sinfo->users;
4630 	if (!ret) {
4631 		list_del(&sinfo->link_phys);
4632 		list_del(&sinfo->link_domain);
4633 		kfree(sinfo);
4634 	}
4635 
4636 	return ret;
4637 }
4638 
/*
 * Attach @dev to @domain as an auxiliary domain.
 *
 * Allocates the domain's default PASID if it does not have one yet, links
 * the device to the domain and, for the first link of this device to this
 * domain, attaches the domain to the IOMMU and sets up the PASID entry.
 *
 * Returns 0 on success, -ENODEV if no IOMMU is found for @dev or the
 * PASID cannot be allocated, otherwise the value returned by the step
 * that failed.
 */
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		u32 pasid;

		/* No private data needed for the default pasid */
		pasid = ioasid_alloc(NULL, PASID_MIN,
				     pci_max_pasids(to_pci_dev(dev)) - 1,
				     NULL);
		if (pasid == INVALID_IOASID) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	ret = auxiliary_link_device(domain, dev);
	if (ret <= 0)
		goto link_failed;

	/*
	 * Subdevices from the same physical device can be attached to the
	 * same domain. For such cases, only the first subdevice attachment
	 * needs to go through the full steps in this function. So if ret >
	 * 1, just goto out.
	 */
	if (ret > 1)
		goto out;

	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	if (domain_use_first_level(domain))
		ret = domain_setup_first_level(iommu, domain, dev,
					       domain->default_pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev,
						     domain->default_pasid);
	if (ret)
		goto table_failed;

	spin_unlock(&iommu->lock);
out:
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

	/* Error unwinding: undo the steps in reverse order of setup. */
table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	auxiliary_unlink_device(domain, dev);
link_failed:
	spin_unlock_irqrestore(&device_domain_lock, flags);
	/* Drop the default PASID once no subdevice is linked to the domain. */
	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
		ioasid_put(domain->default_pasid);

	return ret;
}
4715 
/*
 * Detach @dev from the auxiliary @domain.
 *
 * Does nothing unless is_aux_domain() accepts the pair.  When the last
 * link between the device and the domain is dropped, the PASID entry is
 * torn down and the domain is detached from the IOMMU.  The default PASID
 * is put once the domain has no subdevices left.
 */
static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * NOTE(review): info is dereferenced without a NULL check; this relies
	 * on the is_aux_domain() test above, which ran before the lock was
	 * taken - confirm the info cannot go away in between.
	 */
	info = get_domain_info(dev);
	iommu = info->iommu;

	/* A return of 0 means the last user of this link is gone. */
	if (!auxiliary_unlink_device(domain, dev)) {
		spin_lock(&iommu->lock);
		intel_pasid_tear_down_entry(iommu, dev,
					    domain->default_pasid, false);
		domain_detach_iommu(domain, iommu);
		spin_unlock(&iommu->lock);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
		ioasid_put(domain->default_pasid);
}
4743 
4744 static int prepare_domain_attach_device(struct iommu_domain *domain,
4745 					struct device *dev)
4746 {
4747 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4748 	struct intel_iommu *iommu;
4749 	int addr_width;
4750 
4751 	iommu = device_to_iommu(dev, NULL, NULL);
4752 	if (!iommu)
4753 		return -ENODEV;
4754 
4755 	/* check if this iommu agaw is sufficient for max mapped address */
4756 	addr_width = agaw_to_width(iommu->agaw);
4757 	if (addr_width > cap_mgaw(iommu->cap))
4758 		addr_width = cap_mgaw(iommu->cap);
4759 
4760 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4761 		dev_err(dev, "%s: iommu width (%d) is not "
4762 		        "sufficient for the mapped address (%llx)\n",
4763 		        __func__, addr_width, dmar_domain->max_addr);
4764 		return -EFAULT;
4765 	}
4766 	dmar_domain->gaw = addr_width;
4767 
4768 	/*
4769 	 * Knock out extra levels of page tables if necessary
4770 	 */
4771 	while (iommu->agaw < dmar_domain->agaw) {
4772 		struct dma_pte *pte;
4773 
4774 		pte = dmar_domain->pgd;
4775 		if (dma_pte_present(pte)) {
4776 			dmar_domain->pgd = (struct dma_pte *)
4777 				phys_to_virt(dma_pte_addr(pte));
4778 			free_pgtable_page(pte);
4779 		}
4780 		dmar_domain->agaw--;
4781 	}
4782 
4783 	return 0;
4784 }
4785 
4786 static int intel_iommu_attach_device(struct iommu_domain *domain,
4787 				     struct device *dev)
4788 {
4789 	int ret;
4790 
4791 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4792 	    device_is_rmrr_locked(dev)) {
4793 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4794 		return -EPERM;
4795 	}
4796 
4797 	if (is_aux_domain(dev, domain))
4798 		return -EPERM;
4799 
4800 	/* normally dev is not mapped */
4801 	if (unlikely(domain_context_mapped(dev))) {
4802 		struct dmar_domain *old_domain;
4803 
4804 		old_domain = find_domain(dev);
4805 		if (old_domain)
4806 			dmar_remove_one_dev_info(dev);
4807 	}
4808 
4809 	ret = prepare_domain_attach_device(domain, dev);
4810 	if (ret)
4811 		return ret;
4812 
4813 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4814 }
4815 
4816 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4817 					 struct device *dev)
4818 {
4819 	int ret;
4820 
4821 	if (!is_aux_domain(dev, domain))
4822 		return -EPERM;
4823 
4824 	ret = prepare_domain_attach_device(domain, dev);
4825 	if (ret)
4826 		return ret;
4827 
4828 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4829 }
4830 
/*
 * Detach @dev by removing its device-domain info.  @domain is not used by
 * this function.
 */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}
4836 
/* Detach @dev from the auxiliary @domain; see aux_domain_remove_dev(). */
static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
4842 
4843 #ifdef CONFIG_INTEL_IOMMU_SVM
4844 /*
4845  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4846  * VT-d granularity. Invalidation is typically included in the unmap operation
4847  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4848  * owns the first level page tables. Invalidations of translation caches in the
4849  * guest are trapped and passed down to the host.
4850  *
4851  * vIOMMU in the guest will only expose first level page tables, therefore
4852  * we do not support IOTLB granularity for request without PASID (second level).
4853  *
4854  * For example, to find the VT-d granularity encoding for IOTLB
4855  * type and page selective granularity within PASID:
4856  * X: indexed by iommu cache type
4857  * Y: indexed by enum iommu_inv_granularity
4858  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4859  */
4860 
/*
 * Lookup table: rows are cache types, columns are generic invalidation
 * granularities.  Each cell holds the VT-d granularity to use, or -EINVAL
 * when the combination is not supported.
 */
static const int
inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/*
	 * PASID based IOTLB invalidation: PASID selective (per PASID),
	 * page selective (address granularity)
	 */
	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs */
	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
	/* PASID cache */
	{-EINVAL, -EINVAL, -EINVAL}
};
4873 
/*
 * Translate a (cache type, generic granularity) pair into the VT-d
 * granularity, or -EINVAL for an unsupported combination.
 *
 * Neither index is range-checked here; callers must pass values within
 * the table bounds.
 */
static inline int to_vtd_granularity(int type, int granu)
{
	return inv_type_granu_table[type][granu];
}
4878 
4879 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4880 {
4881 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4882 
4883 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4884 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
4885 	 * granu size in contiguous memory.
4886 	 */
4887 	return order_base_2(nr_pages);
4888 }
4889 
/*
 * Pass a cache invalidation request described by @inv_info down to the
 * IOMMU serving @dev.
 *
 * For every cache type set in inv_info->cache, the generic granularity is
 * converted to the VT-d one and the matching PASID-based IOTLB and/or
 * device-TLB flush is issued.  Only nesting-mode domains are accepted.
 *
 * Returns 0 on success; -EINVAL for NULL @inv_info/@domain, a domain not
 * in nesting mode, a device without domain info or an unsupported cache
 * type; -ENODEV if @dev is NULL, not a PCI device or has no IOMMU.
 */
static int
intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
			   struct iommu_cache_invalidate_info *inv_info)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int cache_type;
	u8 bus, devfn;
	u16 did, sid;
	int ret = 0;
	u64 size = 0;

	if (!inv_info || !dmar_domain)
		return -EINVAL;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
		return -EINVAL;

	/* Lock order: device_domain_lock first, then iommu->lock. */
	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);
	info = get_domain_info(dev);
	if (!info) {
		ret = -EINVAL;
		goto out_unlock;
	}
	did = dmar_domain->iommu_did[iommu->seq_id];
	sid = PCI_DEVID(bus, devfn);

	/* Size is only valid in address selective invalidation */
	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
				   inv_info->granu.addr_info.nb_granules);

	for_each_set_bit(cache_type,
			 (unsigned long *)&inv_info->cache,
			 IOMMU_CACHE_INV_TYPE_NR) {
		int granu = 0;
		u64 pasid = 0;
		u64 addr = 0;

		granu = to_vtd_granularity(cache_type, inv_info->granularity);
		if (granu == -EINVAL) {
			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
					   cache_type, inv_info->granularity);
			/* NOTE(review): the loop stops here but ret is left unchanged. */
			break;
		}

		/*
		 * PASID is stored in different locations based on the
		 * granularity.
		 */
		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
			pasid = inv_info->granu.pasid_info.pasid;
		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
			pasid = inv_info->granu.addr_info.pasid;

		switch (BIT(cache_type)) {
		case IOMMU_CACHE_INV_TYPE_IOTLB:
			/* HW will ignore LSB bits based on address mask */
			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
			    size &&
			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
				/* Misalignment is only reported; the flush still goes ahead. */
				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
						   inv_info->granu.addr_info.addr, size);
			}

			/*
			 * If granu is PASID-selective, address is ignored.
			 * We use npages = -1 to indicate that.
			 */
			qi_flush_piotlb(iommu, did, pasid,
					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);

			if (!info->ats_enabled)
				break;
			/*
			 * Always flush device IOTLB if ATS is enabled. vIOMMU
			 * in the guest may assume IOTLB flush is inclusive,
			 * which is more efficient.
			 */
			fallthrough;
		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
			/*
			 * PASID based device TLB invalidation does not support
			 * IOMMU_INV_GRANU_PASID granularity but only supports
			 * IOMMU_INV_GRANU_ADDR.
			 * The equivalent of that is we set the size to be the
			 * entire range of 64 bit. User only provides PASID info
			 * without address info. So we set addr to 0.
			 */
			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
				size = 64 - VTD_PAGE_SHIFT;
				addr = 0;
			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
				addr = inv_info->granu.addr_info.addr;
			}

			if (info->ats_enabled)
				qi_flush_dev_iotlb_pasid(iommu, sid,
						info->pfsid, pasid,
						info->ats_qdep, addr,
						size);
			else
				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
			break;
		default:
			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
					    cache_type);
			ret = -EINVAL;
		}
	}
out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
5020 #endif
5021 
5022 static int intel_iommu_map(struct iommu_domain *domain,
5023 			   unsigned long iova, phys_addr_t hpa,
5024 			   size_t size, int iommu_prot, gfp_t gfp)
5025 {
5026 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5027 	u64 max_addr;
5028 	int prot = 0;
5029 
5030 	if (iommu_prot & IOMMU_READ)
5031 		prot |= DMA_PTE_READ;
5032 	if (iommu_prot & IOMMU_WRITE)
5033 		prot |= DMA_PTE_WRITE;
5034 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5035 		prot |= DMA_PTE_SNP;
5036 
5037 	max_addr = iova + size;
5038 	if (dmar_domain->max_addr < max_addr) {
5039 		u64 end;
5040 
5041 		/* check if minimum agaw is sufficient for mapped address */
5042 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5043 		if (end < max_addr) {
5044 			pr_err("%s: iommu width (%d) is not "
5045 			       "sufficient for the mapped address (%llx)\n",
5046 			       __func__, dmar_domain->gaw, max_addr);
5047 			return -EFAULT;
5048 		}
5049 		dmar_domain->max_addr = max_addr;
5050 	}
5051 	/* Round up size to next multiple of PAGE_SIZE, if it and
5052 	   the low bits of hpa would take us onto the next page */
5053 	size = aligned_nrpages(hpa, size);
5054 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5055 				hpa >> VTD_PAGE_SHIFT, size, prot);
5056 }
5057 
5058 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5059 				unsigned long iova, size_t size,
5060 				struct iommu_iotlb_gather *gather)
5061 {
5062 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5063 	unsigned long start_pfn, last_pfn;
5064 	int level = 0;
5065 
5066 	/* Cope with horrid API which requires us to unmap more than the
5067 	   size argument if it happens to be a large-page mapping. */
5068 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5069 
5070 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5071 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5072 
5073 	start_pfn = iova >> VTD_PAGE_SHIFT;
5074 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5075 
5076 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5077 					last_pfn, gather->freelist);
5078 
5079 	if (dmar_domain->max_addr == iova + size)
5080 		dmar_domain->max_addr = iova;
5081 
5082 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5083 
5084 	return size;
5085 }
5086 
5087 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5088 				 struct iommu_iotlb_gather *gather)
5089 {
5090 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5091 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5092 	size_t size = gather->end - gather->start;
5093 	unsigned long start_pfn;
5094 	unsigned long nrpages;
5095 	int iommu_id;
5096 
5097 	nrpages = aligned_nrpages(gather->start, size);
5098 	start_pfn = mm_to_dma_pfn(iova_pfn);
5099 
5100 	for_each_domain_iommu(iommu_id, dmar_domain)
5101 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5102 				      start_pfn, nrpages, !gather->freelist, 0);
5103 
5104 	dma_free_pagelist(gather->freelist);
5105 }
5106 
5107 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5108 					    dma_addr_t iova)
5109 {
5110 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5111 	struct dma_pte *pte;
5112 	int level = 0;
5113 	u64 phys = 0;
5114 
5115 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5116 	if (pte && dma_pte_present(pte))
5117 		phys = dma_pte_addr(pte) +
5118 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5119 						VTD_PAGE_SHIFT) - 1));
5120 
5121 	return phys;
5122 }
5123 
5124 static bool intel_iommu_capable(enum iommu_cap cap)
5125 {
5126 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5127 		return domain_update_iommu_snooping(NULL) == 1;
5128 	if (cap == IOMMU_CAP_INTR_REMAP)
5129 		return irq_remapping_enabled == 1;
5130 
5131 	return false;
5132 }
5133 
5134 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5135 {
5136 	struct intel_iommu *iommu;
5137 
5138 	iommu = device_to_iommu(dev, NULL, NULL);
5139 	if (!iommu)
5140 		return ERR_PTR(-ENODEV);
5141 
5142 	if (translation_pre_enabled(iommu))
5143 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5144 
5145 	return &iommu->iommu;
5146 }
5147 
5148 static void intel_iommu_release_device(struct device *dev)
5149 {
5150 	struct intel_iommu *iommu;
5151 
5152 	iommu = device_to_iommu(dev, NULL, NULL);
5153 	if (!iommu)
5154 		return;
5155 
5156 	dmar_remove_one_dev_info(dev);
5157 
5158 	set_dma_ops(dev, NULL);
5159 }
5160 
5161 static void intel_iommu_probe_finalize(struct device *dev)
5162 {
5163 	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5164 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5165 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5166 
5167 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5168 		iommu_setup_dma_ops(dev, base,
5169 				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5170 	else
5171 		set_dma_ops(dev, NULL);
5172 }
5173 
/*
 * Append the reserved regions that apply to @device to @head:
 *  - one direct-mapped region for each RMRR whose device scope contains
 *    @device or a PCI bridge upstream of it,
 *  - with CONFIG_INTEL_IOMMU_FLOPPY_WA, the first 16MB for ISA bridges,
 *  - the IOAPIC range as an MSI region.
 *
 * Allocation failures are not reported; the affected region is simply
 * left out.
 */
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	/* The RMRR list and its device scopes are walked under dmar_global_lock. */
	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			/* end_address is inclusive, hence the +1. */
			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			/* Only leaves the inner loop; the next RMRR unit is still tried. */
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						   IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
5230 
/*
 * Enable PASID for @dev on @iommu.
 *
 * Sets CONTEXT_PASIDE in the device's context entry if it is not already
 * set (followed by a device-selective context-cache flush) and calls
 * iommu_enable_dev_iotlb() if PASID is not yet enabled on the device.
 *
 * Returns 0 on success, or -EINVAL if the device has no domain, no domain
 * info, no PASID support, or no context entry.
 */
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	/* Lock order: device_domain_lock first, then iommu->lock. */
	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		/* Order the context-entry write before the flush request below. */
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
5281 
/* Select the IOMMU group for @dev: PCI grouping for PCI devices, generic otherwise. */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev)
			       : generic_device_group(dev);
}
5288 
5289 static int intel_iommu_enable_auxd(struct device *dev)
5290 {
5291 	struct device_domain_info *info;
5292 	struct intel_iommu *iommu;
5293 	unsigned long flags;
5294 	int ret;
5295 
5296 	iommu = device_to_iommu(dev, NULL, NULL);
5297 	if (!iommu || dmar_disabled)
5298 		return -EINVAL;
5299 
5300 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5301 		return -EINVAL;
5302 
5303 	ret = intel_iommu_enable_pasid(iommu, dev);
5304 	if (ret)
5305 		return -ENODEV;
5306 
5307 	spin_lock_irqsave(&device_domain_lock, flags);
5308 	info = get_domain_info(dev);
5309 	info->auxd_enabled = 1;
5310 	spin_unlock_irqrestore(&device_domain_lock, flags);
5311 
5312 	return 0;
5313 }
5314 
5315 static int intel_iommu_disable_auxd(struct device *dev)
5316 {
5317 	struct device_domain_info *info;
5318 	unsigned long flags;
5319 
5320 	spin_lock_irqsave(&device_domain_lock, flags);
5321 	info = get_domain_info(dev);
5322 	if (!WARN_ON(!info))
5323 		info->auxd_enabled = 0;
5324 	spin_unlock_irqrestore(&device_domain_lock, flags);
5325 
5326 	return 0;
5327 }
5328 
5329 /*
5330  * A PCI express designated vendor specific extended capability is defined
5331  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5332  * for system software and tools to detect endpoint devices supporting the
5333  * Intel scalable IO virtualization without host driver dependency.
5334  *
5335  * Returns the address of the matching extended capability structure within
5336  * the device's PCI configuration space or 0 if the device does not support
5337  * it.
5338  */
5339 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5340 {
5341 	int pos;
5342 	u16 vendor, id;
5343 
5344 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5345 	while (pos) {
5346 		pci_read_config_word(pdev, pos + 4, &vendor);
5347 		pci_read_config_word(pdev, pos + 8, &id);
5348 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5349 			return pos;
5350 
5351 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5352 	}
5353 
5354 	return 0;
5355 }
5356 
5357 static bool
5358 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5359 {
5360 	struct device_domain_info *info = get_domain_info(dev);
5361 
5362 	if (feat == IOMMU_DEV_FEAT_AUX) {
5363 		int ret;
5364 
5365 		if (!dev_is_pci(dev) || dmar_disabled ||
5366 		    !scalable_mode_support() || !pasid_mode_support())
5367 			return false;
5368 
5369 		ret = pci_pasid_features(to_pci_dev(dev));
5370 		if (ret < 0)
5371 			return false;
5372 
5373 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5374 	}
5375 
5376 	if (feat == IOMMU_DEV_FEAT_IOPF)
5377 		return info && info->pri_supported;
5378 
5379 	if (feat == IOMMU_DEV_FEAT_SVA)
5380 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5381 			info->pasid_supported && info->pri_supported &&
5382 			info->ats_supported;
5383 
5384 	return false;
5385 }
5386 
5387 static int
5388 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5389 {
5390 	if (feat == IOMMU_DEV_FEAT_AUX)
5391 		return intel_iommu_enable_auxd(dev);
5392 
5393 	if (feat == IOMMU_DEV_FEAT_IOPF)
5394 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5395 
5396 	if (feat == IOMMU_DEV_FEAT_SVA) {
5397 		struct device_domain_info *info = get_domain_info(dev);
5398 
5399 		if (!info)
5400 			return -EINVAL;
5401 
5402 		if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5403 			return -EINVAL;
5404 
5405 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5406 			return 0;
5407 	}
5408 
5409 	return -ENODEV;
5410 }
5411 
5412 static int
5413 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5414 {
5415 	if (feat == IOMMU_DEV_FEAT_AUX)
5416 		return intel_iommu_disable_auxd(dev);
5417 
5418 	return -ENODEV;
5419 }
5420 
5421 static bool
5422 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5423 {
5424 	struct device_domain_info *info = get_domain_info(dev);
5425 
5426 	if (feat == IOMMU_DEV_FEAT_AUX)
5427 		return scalable_mode_support() && info && info->auxd_enabled;
5428 
5429 	return false;
5430 }
5431 
5432 static int
5433 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5434 {
5435 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5436 
5437 	return dmar_domain->default_pasid > 0 ?
5438 			dmar_domain->default_pasid : -EINVAL;
5439 }
5440 
5441 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5442 					   struct device *dev)
5443 {
5444 	return attach_deferred(dev);
5445 }
5446 
5447 static int
5448 intel_iommu_enable_nesting(struct iommu_domain *domain)
5449 {
5450 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5451 	unsigned long flags;
5452 	int ret = -ENODEV;
5453 
5454 	spin_lock_irqsave(&device_domain_lock, flags);
5455 	if (nested_mode_support() && list_empty(&dmar_domain->devices)) {
5456 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5457 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5458 		ret = 0;
5459 	}
5460 	spin_unlock_irqrestore(&device_domain_lock, flags);
5461 
5462 	return ret;
5463 }
5464 
5465 /*
5466  * Check that the device does not live on an external facing PCI port that is
5467  * marked as untrusted. Such devices should not be able to apply quirks and
5468  * thus not be able to bypass the IOMMU restrictions.
5469  */
5470 static bool risky_device(struct pci_dev *pdev)
5471 {
5472 	if (pdev->untrusted) {
5473 		pci_info(pdev,
5474 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5475 			 pdev->vendor, pdev->device);
5476 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5477 		return true;
5478 	}
5479 	return false;
5480 }
5481 
/*
 * Run domain_flush_cache() over the page-table entries that cover
 * @clf_pages pages starting at page frame @clf_pfn in @domain.
 *
 * The range is walked as runs of consecutive PTEs; each run is handed to
 * domain_flush_cache() in one call. The walk stops early, with a WARN, if
 * no PTE is found or the remaining range is smaller than what a single
 * entry at the current level covers.
 */
static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
			     unsigned long clf_pages)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned long lvl_pages = 0;
	int level = 0;

	while (clf_pages > 0) {
		if (!pte) {
			/*
			 * Start of a new run: look up the PTE for clf_pfn.
			 * level is passed by reference starting at 0;
			 * presumably pfn_to_dma_pte() writes back the level
			 * of the entry it found (confirm against that helper).
			 */
			level = 0;
			pte = pfn_to_dma_pte(domain, clf_pfn, &level);
			if (WARN_ON(!pte))
				return;
			first_pte = pte;
			/* Pages advanced per PTE while walking this run. */
			lvl_pages = lvl_to_nr_pages(level);
		}

		/*
		 * A zero step would never terminate and a step larger than
		 * the remainder would underflow clf_pages below.
		 */
		if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
			return;

		clf_pages -= lvl_pages;
		clf_pfn += lvl_pages;
		pte++;

		/*
		 * Close the run when the range is exhausted, when
		 * first_pte_in_page() reports true for the advanced pointer
		 * (presumably: it crossed into another page-table page), or
		 * when the remainder no longer fills one entry at a level
		 * above 1.
		 */
		if (!clf_pages || first_pte_in_page(pte) ||
		    (level > 1 && clf_pages < lvl_pages)) {
			/* Length is the byte span from first_pte up to pte. */
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			/* Force a fresh lookup on the next iteration. */
			pte = NULL;
		}
	}
}
5514 
5515 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5516 				       unsigned long iova, size_t size)
5517 {
5518 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5519 	unsigned long pages = aligned_nrpages(iova, size);
5520 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5521 	struct intel_iommu *iommu;
5522 	int iommu_id;
5523 
5524 	if (!dmar_domain->iommu_coherency)
5525 		clflush_sync_map(dmar_domain, pfn, pages);
5526 
5527 	for_each_domain_iommu(iommu_id, dmar_domain) {
5528 		iommu = g_iommus[iommu_id];
5529 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5530 	}
5531 }
5532 
/*
 * IOMMU operations table of this driver. It has external linkage, so it is
 * visible outside this file.
 *
 * Most callbacks are implemented in this file; put_resv_regions uses the
 * generic helper. The cache-invalidate, SVA bind/unbind/get-pasid and
 * page-response callbacks are only populated when CONFIG_INTEL_IOMMU_SVM is
 * enabled.
 */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.enable_nesting		= intel_iommu_enable_nesting,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
	.unmap			= intel_iommu_unmap,
	.flush_iotlb_all        = intel_flush_iotlb_all,
	.iotlb_sync		= intel_iommu_tlb_sync,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
};
5572 
5573 static void quirk_iommu_igfx(struct pci_dev *dev)
5574 {
5575 	if (risky_device(dev))
5576 		return;
5577 
5578 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5579 	dmar_map_gfx = 0;
5580 }
5581 
/*
 * Each entry below registers quirk_iommu_igfx as a PCI header fixup for one
 * Intel device ID.
 *
 * G4x/GM45 integrated gfx dmar support is totally busted.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell integrated graphics malfunctions with DMAR enabled. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5616 
5617 static void quirk_iommu_rwbf(struct pci_dev *dev)
5618 {
5619 	if (risky_device(dev))
5620 		return;
5621 
5622 	/*
5623 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5624 	 * but needs it. Same seems to hold for the desktop versions.
5625 	 */
5626 	pci_info(dev, "Forcing write-buffer flush capability\n");
5627 	rwbf_quirk = 1;
5628 }
5629 
/*
 * Register quirk_iommu_rwbf as a PCI header fixup for these Intel device
 * IDs (the same IDs as the G4x/GM45 entries of quirk_iommu_igfx).
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5637 
/*
 * GGC is the config-space offset of the 16-bit register read by
 * quirk_calpella_no_shadow_gtt(). The remaining constants are values of the
 * 4-bit field in bits 11:8 of that register (GGC_MEMORY_SIZE_MASK).
 * GGC_MEMORY_VT_ENABLED is bit 11: when it is clear the quirk concludes that
 * the BIOS allocated no shadow GTT.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5647 
5648 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5649 {
5650 	unsigned short ggc;
5651 
5652 	if (risky_device(dev))
5653 		return;
5654 
5655 	if (pci_read_config_word(dev, GGC, &ggc))
5656 		return;
5657 
5658 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5659 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5660 		dmar_map_gfx = 0;
5661 	} else if (dmar_map_gfx) {
5662 		/* we have to ensure the gfx device is idle before we flush */
5663 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5664 		intel_iommu_strict = 1;
5665        }
5666 }
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5671 
5672 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5673 {
5674 	unsigned short ver;
5675 
5676 	if (!IS_GFX_DEVICE(dev))
5677 		return;
5678 
5679 	ver = (dev->device >> 8) & 0xff;
5680 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5681 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5682 	    ver != 0x9a)
5683 		return;
5684 
5685 	if (risky_device(dev))
5686 		return;
5687 
5688 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5689 	iommu_skip_te_disable = 1;
5690 }
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5692 
5693 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5694    ISOCH DMAR unit for the Azalia sound device, but not give it any
5695    TLB entries, which causes it to deadlock. Check for that.  We do
5696    this in a function called from init_dmars(), instead of in a PCI
5697    quirk, because we don't want to print the obnoxious "BIOS broken"
5698    message if VT-d is actually disabled.
5699 */
5700 static void __init check_tylersburg_isoch(void)
5701 {
5702 	struct pci_dev *pdev;
5703 	uint32_t vtisochctrl;
5704 
5705 	/* If there's no Azalia in the system anyway, forget it. */
5706 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5707 	if (!pdev)
5708 		return;
5709 
5710 	if (risky_device(pdev)) {
5711 		pci_dev_put(pdev);
5712 		return;
5713 	}
5714 
5715 	pci_dev_put(pdev);
5716 
5717 	/* System Management Registers. Might be hidden, in which case
5718 	   we can't do the sanity check. But that's OK, because the
5719 	   known-broken BIOSes _don't_ actually hide it, so far. */
5720 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5721 	if (!pdev)
5722 		return;
5723 
5724 	if (risky_device(pdev)) {
5725 		pci_dev_put(pdev);
5726 		return;
5727 	}
5728 
5729 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5730 		pci_dev_put(pdev);
5731 		return;
5732 	}
5733 
5734 	pci_dev_put(pdev);
5735 
5736 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5737 	if (vtisochctrl & 1)
5738 		return;
5739 
5740 	/* Drop all bits other than the number of TLB entries */
5741 	vtisochctrl &= 0x1c;
5742 
5743 	/* If we have the recommended number of TLB entries (16), fine. */
5744 	if (vtisochctrl == 0x10)
5745 		return;
5746 
5747 	/* Zero TLB entries? You get to ride the short bus to school. */
5748 	if (!vtisochctrl) {
5749 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5750 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5751 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5752 		     dmi_get_system_info(DMI_BIOS_VERSION),
5753 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5754 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5755 		return;
5756 	}
5757 
5758 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5759 	       vtisochctrl);
5760 }
5761