xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 69868c3b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 
48 #include "../irq_remapping.h"
49 #include "../iommu-sva-lib.h"
50 #include "pasid.h"
51 #include "cap_audit.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
289 
290 /*
291  * This domain is a statically identity mapping domain.
292  *	1. This domain creats a static 1:1 mapping to all usable memory.
293  * 	2. It maps to each iommu if successful.
294  *	3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 struct dmar_satc_unit {
321 	struct list_head list;		/* list of SATC units */
322 	struct acpi_dmar_header *hdr;	/* ACPI header */
323 	struct dmar_dev_scope *devices;	/* target devices */
324 	struct intel_iommu *iommu;	/* the corresponding iommu */
325 	int devices_cnt;		/* target device count */
326 	u8 atc_required:1;		/* ATS is required */
327 };
328 
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331 static LIST_HEAD(dmar_satc_units);
332 
333 #define for_each_rmrr_units(rmrr) \
334 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 
336 /* bitmap for indexing intel_iommus */
337 static int g_num_of_iommus;
338 
339 static void domain_exit(struct dmar_domain *domain);
340 static void domain_remove_dev_info(struct dmar_domain *domain);
341 static void dmar_remove_one_dev_info(struct device *dev);
342 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
343 static int intel_iommu_attach_device(struct iommu_domain *domain,
344 				     struct device *dev);
345 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
346 					    dma_addr_t iova);
347 
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
350 #else
351 int dmar_disabled = 1;
352 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
353 
354 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
355 int intel_iommu_sm = 1;
356 #else
357 int intel_iommu_sm;
358 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
359 
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
362 
363 static int dmar_map_gfx = 1;
364 static int intel_iommu_strict;
365 static int intel_iommu_superpage = 1;
366 static int iommu_identity_mapping;
367 static int iommu_skip_te_disable;
368 
369 #define IDENTMAP_GFX		2
370 #define IDENTMAP_AZALIA		4
371 
372 int intel_iommu_gfx_mapped;
373 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
374 
375 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
376 struct device_domain_info *get_domain_info(struct device *dev)
377 {
378 	struct device_domain_info *info;
379 
380 	if (!dev)
381 		return NULL;
382 
383 	info = dev_iommu_priv_get(dev);
384 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
385 		return NULL;
386 
387 	return info;
388 }
389 
390 DEFINE_SPINLOCK(device_domain_lock);
391 static LIST_HEAD(device_domain_list);
392 
393 /*
394  * Iterate over elements in device_domain_list and call the specified
395  * callback @fn against each element.
396  */
397 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
398 				     void *data), void *data)
399 {
400 	int ret = 0;
401 	unsigned long flags;
402 	struct device_domain_info *info;
403 
404 	spin_lock_irqsave(&device_domain_lock, flags);
405 	list_for_each_entry(info, &device_domain_list, global) {
406 		ret = fn(info, data);
407 		if (ret) {
408 			spin_unlock_irqrestore(&device_domain_lock, flags);
409 			return ret;
410 		}
411 	}
412 	spin_unlock_irqrestore(&device_domain_lock, flags);
413 
414 	return 0;
415 }
416 
417 const struct iommu_ops intel_iommu_ops;
418 
419 static bool translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
422 }
423 
424 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
425 {
426 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
427 }
428 
429 static void init_translation_status(struct intel_iommu *iommu)
430 {
431 	u32 gsts;
432 
433 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
434 	if (gsts & DMA_GSTS_TES)
435 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
436 }
437 
438 static int __init intel_iommu_setup(char *str)
439 {
440 	if (!str)
441 		return -EINVAL;
442 	while (*str) {
443 		if (!strncmp(str, "on", 2)) {
444 			dmar_disabled = 0;
445 			pr_info("IOMMU enabled\n");
446 		} else if (!strncmp(str, "off", 3)) {
447 			dmar_disabled = 1;
448 			no_platform_optin = 1;
449 			pr_info("IOMMU disabled\n");
450 		} else if (!strncmp(str, "igfx_off", 8)) {
451 			dmar_map_gfx = 0;
452 			pr_info("Disable GFX device mapping\n");
453 		} else if (!strncmp(str, "forcedac", 8)) {
454 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
455 			iommu_dma_forcedac = true;
456 		} else if (!strncmp(str, "strict", 6)) {
457 			pr_info("Disable batched IOTLB flush\n");
458 			intel_iommu_strict = 1;
459 		} else if (!strncmp(str, "sp_off", 6)) {
460 			pr_info("Disable supported super page\n");
461 			intel_iommu_superpage = 0;
462 		} else if (!strncmp(str, "sm_on", 5)) {
463 			pr_info("Intel-IOMMU: scalable mode supported\n");
464 			intel_iommu_sm = 1;
465 		} else if (!strncmp(str, "tboot_noforce", 13)) {
466 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 			intel_iommu_tboot_noforce = 1;
468 		}
469 
470 		str += strcspn(str, ",");
471 		while (*str == ',')
472 			str++;
473 	}
474 	return 0;
475 }
476 __setup("intel_iommu=", intel_iommu_setup);
477 
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480 
481 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 {
483 	struct dmar_domain **domains;
484 	int idx = did >> 8;
485 
486 	domains = iommu->domains[idx];
487 	if (!domains)
488 		return NULL;
489 
490 	return domains[did & 0xff];
491 }
492 
493 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494 			     struct dmar_domain *domain)
495 {
496 	struct dmar_domain **domains;
497 	int idx = did >> 8;
498 
499 	if (!iommu->domains[idx]) {
500 		size_t size = 256 * sizeof(struct dmar_domain *);
501 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 	}
503 
504 	domains = iommu->domains[idx];
505 	if (WARN_ON(!domains))
506 		return;
507 	else
508 		domains[did & 0xff] = domain;
509 }
510 
511 void *alloc_pgtable_page(int node)
512 {
513 	struct page *page;
514 	void *vaddr = NULL;
515 
516 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 	if (page)
518 		vaddr = page_address(page);
519 	return vaddr;
520 }
521 
522 void free_pgtable_page(void *vaddr)
523 {
524 	free_page((unsigned long)vaddr);
525 }
526 
527 static inline void *alloc_domain_mem(void)
528 {
529 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 }
531 
532 static void free_domain_mem(void *vaddr)
533 {
534 	kmem_cache_free(iommu_domain_cache, vaddr);
535 }
536 
537 static inline void * alloc_devinfo_mem(void)
538 {
539 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 }
541 
542 static inline void free_devinfo_mem(void *vaddr)
543 {
544 	kmem_cache_free(iommu_devinfo_cache, vaddr);
545 }
546 
547 static inline int domain_type_is_si(struct dmar_domain *domain)
548 {
549 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 }
551 
552 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 {
554 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 }
556 
557 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 				       unsigned long pfn)
559 {
560 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561 
562 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 }
564 
565 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566 {
567 	unsigned long sagaw;
568 	int agaw;
569 
570 	sagaw = cap_sagaw(iommu->cap);
571 	for (agaw = width_to_agaw(max_gaw);
572 	     agaw >= 0; agaw--) {
573 		if (test_bit(agaw, &sagaw))
574 			break;
575 	}
576 
577 	return agaw;
578 }
579 
580 /*
581  * Calculate max SAGAW for each iommu.
582  */
583 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 {
585 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586 }
587 
588 /*
589  * calculate agaw for each iommu.
590  * "SAGAW" may be different across iommus, use a default agaw, and
591  * get a supported less agaw for iommus that don't support the default agaw.
592  */
593 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 {
595 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 }
597 
598 /* This functionin only returns single iommu in a domain */
599 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600 {
601 	int iommu_id;
602 
603 	/* si_domain and vm domain should not get here. */
604 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 		return NULL;
606 
607 	for_each_domain_iommu(iommu_id, domain)
608 		break;
609 
610 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 		return NULL;
612 
613 	return g_iommus[iommu_id];
614 }
615 
616 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617 {
618 	return sm_supported(iommu) ?
619 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 }
621 
622 static void domain_update_iommu_coherency(struct dmar_domain *domain)
623 {
624 	struct dmar_drhd_unit *drhd;
625 	struct intel_iommu *iommu;
626 	bool found = false;
627 	int i;
628 
629 	domain->iommu_coherency = true;
630 
631 	for_each_domain_iommu(i, domain) {
632 		found = true;
633 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
634 			domain->iommu_coherency = false;
635 			break;
636 		}
637 	}
638 	if (found)
639 		return;
640 
641 	/* No hardware attached; use lowest common denominator */
642 	rcu_read_lock();
643 	for_each_active_iommu(iommu, drhd) {
644 		if (!iommu_paging_structure_coherency(iommu)) {
645 			domain->iommu_coherency = false;
646 			break;
647 		}
648 	}
649 	rcu_read_unlock();
650 }
651 
652 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
653 {
654 	struct dmar_drhd_unit *drhd;
655 	struct intel_iommu *iommu;
656 	bool ret = true;
657 
658 	rcu_read_lock();
659 	for_each_active_iommu(iommu, drhd) {
660 		if (iommu != skip) {
661 			/*
662 			 * If the hardware is operating in the scalable mode,
663 			 * the snooping control is always supported since we
664 			 * always set PASID-table-entry.PGSNP bit if the domain
665 			 * is managed outside (UNMANAGED).
666 			 */
667 			if (!sm_supported(iommu) &&
668 			    !ecap_sc_support(iommu->ecap)) {
669 				ret = false;
670 				break;
671 			}
672 		}
673 	}
674 	rcu_read_unlock();
675 
676 	return ret;
677 }
678 
679 static int domain_update_iommu_superpage(struct dmar_domain *domain,
680 					 struct intel_iommu *skip)
681 {
682 	struct dmar_drhd_unit *drhd;
683 	struct intel_iommu *iommu;
684 	int mask = 0x3;
685 
686 	if (!intel_iommu_superpage)
687 		return 0;
688 
689 	/* set iommu_superpage to the smallest common denominator */
690 	rcu_read_lock();
691 	for_each_active_iommu(iommu, drhd) {
692 		if (iommu != skip) {
693 			if (domain && domain_use_first_level(domain)) {
694 				if (!cap_fl1gp_support(iommu->cap))
695 					mask = 0x1;
696 			} else {
697 				mask &= cap_super_page_val(iommu->cap);
698 			}
699 
700 			if (!mask)
701 				break;
702 		}
703 	}
704 	rcu_read_unlock();
705 
706 	return fls(mask);
707 }
708 
709 static int domain_update_device_node(struct dmar_domain *domain)
710 {
711 	struct device_domain_info *info;
712 	int nid = NUMA_NO_NODE;
713 
714 	assert_spin_locked(&device_domain_lock);
715 
716 	if (list_empty(&domain->devices))
717 		return NUMA_NO_NODE;
718 
719 	list_for_each_entry(info, &domain->devices, link) {
720 		if (!info->dev)
721 			continue;
722 
723 		/*
724 		 * There could possibly be multiple device numa nodes as devices
725 		 * within the same domain may sit behind different IOMMUs. There
726 		 * isn't perfect answer in such situation, so we select first
727 		 * come first served policy.
728 		 */
729 		nid = dev_to_node(info->dev);
730 		if (nid != NUMA_NO_NODE)
731 			break;
732 	}
733 
734 	return nid;
735 }
736 
737 static void domain_update_iotlb(struct dmar_domain *domain);
738 
739 /* Some capabilities may be different across iommus */
740 static void domain_update_iommu_cap(struct dmar_domain *domain)
741 {
742 	domain_update_iommu_coherency(domain);
743 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
744 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
745 
746 	/*
747 	 * If RHSA is missing, we should default to the device numa domain
748 	 * as fall back.
749 	 */
750 	if (domain->nid == NUMA_NO_NODE)
751 		domain->nid = domain_update_device_node(domain);
752 
753 	/*
754 	 * First-level translation restricts the input-address to a
755 	 * canonical address (i.e., address bits 63:N have the same
756 	 * value as address bit [N-1], where N is 48-bits with 4-level
757 	 * paging and 57-bits with 5-level paging). Hence, skip bit
758 	 * [N-1].
759 	 */
760 	if (domain_use_first_level(domain))
761 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
762 	else
763 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
764 
765 	domain_update_iotlb(domain);
766 }
767 
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 					 u8 devfn, int alloc)
770 {
771 	struct root_entry *root = &iommu->root_entry[bus];
772 	struct context_entry *context;
773 	u64 *entry;
774 
775 	entry = &root->lo;
776 	if (sm_supported(iommu)) {
777 		if (devfn >= 0x80) {
778 			devfn -= 0x80;
779 			entry = &root->hi;
780 		}
781 		devfn *= 2;
782 	}
783 	if (*entry & 1)
784 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 	else {
786 		unsigned long phy_addr;
787 		if (!alloc)
788 			return NULL;
789 
790 		context = alloc_pgtable_page(iommu->node);
791 		if (!context)
792 			return NULL;
793 
794 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 		phy_addr = virt_to_phys((void *)context);
796 		*entry = phy_addr | 1;
797 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
798 	}
799 	return &context[devfn];
800 }
801 
802 static bool attach_deferred(struct device *dev)
803 {
804 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805 }
806 
807 /**
808  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809  *				 sub-hierarchy of a candidate PCI-PCI bridge
810  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811  * @bridge: the candidate PCI-PCI bridge
812  *
813  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814  */
815 static bool
816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817 {
818 	struct pci_dev *pdev, *pbridge;
819 
820 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 		return false;
822 
823 	pdev = to_pci_dev(dev);
824 	pbridge = to_pci_dev(bridge);
825 
826 	if (pbridge->subordinate &&
827 	    pbridge->subordinate->number <= pdev->bus->number &&
828 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 		return true;
830 
831 	return false;
832 }
833 
834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835 {
836 	struct dmar_drhd_unit *drhd;
837 	u32 vtbar;
838 	int rc;
839 
840 	/* We know that this device on this chipset has its own IOMMU.
841 	 * If we find it under a different IOMMU, then the BIOS is lying
842 	 * to us. Hope that the IOMMU for this device is actually
843 	 * disabled, and it needs no translation...
844 	 */
845 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 	if (rc) {
847 		/* "can't" happen */
848 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 		return false;
850 	}
851 	vtbar &= 0xffff0000;
852 
853 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
854 	drhd = dmar_find_matched_drhd_unit(pdev);
855 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 		return true;
859 	}
860 
861 	return false;
862 }
863 
864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865 {
866 	if (!iommu || iommu->drhd->ignored)
867 		return true;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pdev = to_pci_dev(dev);
871 
872 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 		    quirk_ioat_snb_local_iommu(pdev))
875 			return true;
876 	}
877 
878 	return false;
879 }
880 
881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882 {
883 	struct dmar_drhd_unit *drhd = NULL;
884 	struct pci_dev *pdev = NULL;
885 	struct intel_iommu *iommu;
886 	struct device *tmp;
887 	u16 segment = 0;
888 	int i;
889 
890 	if (!dev)
891 		return NULL;
892 
893 	if (dev_is_pci(dev)) {
894 		struct pci_dev *pf_pdev;
895 
896 		pdev = pci_real_dma_dev(to_pci_dev(dev));
897 
898 		/* VFs aren't listed in scope tables; we need to look up
899 		 * the PF instead to find the IOMMU. */
900 		pf_pdev = pci_physfn(pdev);
901 		dev = &pf_pdev->dev;
902 		segment = pci_domain_nr(pdev->bus);
903 	} else if (has_acpi_companion(dev))
904 		dev = &ACPI_COMPANION(dev)->dev;
905 
906 	rcu_read_lock();
907 	for_each_iommu(iommu, drhd) {
908 		if (pdev && segment != drhd->segment)
909 			continue;
910 
911 		for_each_active_dev_scope(drhd->devices,
912 					  drhd->devices_cnt, i, tmp) {
913 			if (tmp == dev) {
914 				/* For a VF use its original BDF# not that of the PF
915 				 * which we used for the IOMMU lookup. Strictly speaking
916 				 * we could do this for all PCI devices; we only need to
917 				 * get the BDF# from the scope table for ACPI matches. */
918 				if (pdev && pdev->is_virtfn)
919 					goto got_pdev;
920 
921 				if (bus && devfn) {
922 					*bus = drhd->devices[i].bus;
923 					*devfn = drhd->devices[i].devfn;
924 				}
925 				goto out;
926 			}
927 
928 			if (is_downstream_to_pci_bridge(dev, tmp))
929 				goto got_pdev;
930 		}
931 
932 		if (pdev && drhd->include_all) {
933 		got_pdev:
934 			if (bus && devfn) {
935 				*bus = pdev->bus->number;
936 				*devfn = pdev->devfn;
937 			}
938 			goto out;
939 		}
940 	}
941 	iommu = NULL;
942  out:
943 	if (iommu_is_dummy(iommu, dev))
944 		iommu = NULL;
945 
946 	rcu_read_unlock();
947 
948 	return iommu;
949 }
950 
951 static void domain_flush_cache(struct dmar_domain *domain,
952 			       void *addr, int size)
953 {
954 	if (!domain->iommu_coherency)
955 		clflush_cache_range(addr, size);
956 }
957 
958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959 {
960 	struct context_entry *context;
961 	int ret = 0;
962 	unsigned long flags;
963 
964 	spin_lock_irqsave(&iommu->lock, flags);
965 	context = iommu_context_addr(iommu, bus, devfn, 0);
966 	if (context)
967 		ret = context_present(context);
968 	spin_unlock_irqrestore(&iommu->lock, flags);
969 	return ret;
970 }
971 
972 static void free_context_table(struct intel_iommu *iommu)
973 {
974 	int i;
975 	unsigned long flags;
976 	struct context_entry *context;
977 
978 	spin_lock_irqsave(&iommu->lock, flags);
979 	if (!iommu->root_entry) {
980 		goto out;
981 	}
982 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 		context = iommu_context_addr(iommu, i, 0, 0);
984 		if (context)
985 			free_pgtable_page(context);
986 
987 		if (!sm_supported(iommu))
988 			continue;
989 
990 		context = iommu_context_addr(iommu, i, 0x80, 0);
991 		if (context)
992 			free_pgtable_page(context);
993 
994 	}
995 	free_pgtable_page(iommu->root_entry);
996 	iommu->root_entry = NULL;
997 out:
998 	spin_unlock_irqrestore(&iommu->lock, flags);
999 }
1000 
1001 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002 				      unsigned long pfn, int *target_level)
1003 {
1004 	struct dma_pte *parent, *pte;
1005 	int level = agaw_to_level(domain->agaw);
1006 	int offset;
1007 
1008 	BUG_ON(!domain->pgd);
1009 
1010 	if (!domain_pfn_supported(domain, pfn))
1011 		/* Address beyond IOMMU's addressing capabilities. */
1012 		return NULL;
1013 
1014 	parent = domain->pgd;
1015 
1016 	while (1) {
1017 		void *tmp_page;
1018 
1019 		offset = pfn_level_offset(pfn, level);
1020 		pte = &parent[offset];
1021 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022 			break;
1023 		if (level == *target_level)
1024 			break;
1025 
1026 		if (!dma_pte_present(pte)) {
1027 			uint64_t pteval;
1028 
1029 			tmp_page = alloc_pgtable_page(domain->nid);
1030 
1031 			if (!tmp_page)
1032 				return NULL;
1033 
1034 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036 			if (domain_use_first_level(domain)) {
1037 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1038 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1039 					pteval |= DMA_FL_PTE_ACCESS;
1040 			}
1041 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1042 				/* Someone else set it while we were thinking; use theirs. */
1043 				free_pgtable_page(tmp_page);
1044 			else
1045 				domain_flush_cache(domain, pte, sizeof(*pte));
1046 		}
1047 		if (level == 1)
1048 			break;
1049 
1050 		parent = phys_to_virt(dma_pte_addr(pte));
1051 		level--;
1052 	}
1053 
1054 	if (!*target_level)
1055 		*target_level = level;
1056 
1057 	return pte;
1058 }
1059 
1060 /* return address's pte at specific level */
1061 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1062 					 unsigned long pfn,
1063 					 int level, int *large_page)
1064 {
1065 	struct dma_pte *parent, *pte;
1066 	int total = agaw_to_level(domain->agaw);
1067 	int offset;
1068 
1069 	parent = domain->pgd;
1070 	while (level <= total) {
1071 		offset = pfn_level_offset(pfn, total);
1072 		pte = &parent[offset];
1073 		if (level == total)
1074 			return pte;
1075 
1076 		if (!dma_pte_present(pte)) {
1077 			*large_page = total;
1078 			break;
1079 		}
1080 
1081 		if (dma_pte_superpage(pte)) {
1082 			*large_page = total;
1083 			return pte;
1084 		}
1085 
1086 		parent = phys_to_virt(dma_pte_addr(pte));
1087 		total--;
1088 	}
1089 	return NULL;
1090 }
1091 
1092 /* clear last level pte, a tlb flush should be followed */
1093 static void dma_pte_clear_range(struct dmar_domain *domain,
1094 				unsigned long start_pfn,
1095 				unsigned long last_pfn)
1096 {
1097 	unsigned int large_page;
1098 	struct dma_pte *first_pte, *pte;
1099 
1100 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1101 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1102 	BUG_ON(start_pfn > last_pfn);
1103 
1104 	/* we don't need lock here; nobody else touches the iova range */
1105 	do {
1106 		large_page = 1;
1107 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1108 		if (!pte) {
1109 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1110 			continue;
1111 		}
1112 		do {
1113 			dma_clear_pte(pte);
1114 			start_pfn += lvl_to_nr_pages(large_page);
1115 			pte++;
1116 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1117 
1118 		domain_flush_cache(domain, first_pte,
1119 				   (void *)pte - (void *)first_pte);
1120 
1121 	} while (start_pfn && start_pfn <= last_pfn);
1122 }
1123 
1124 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1125 			       int retain_level, struct dma_pte *pte,
1126 			       unsigned long pfn, unsigned long start_pfn,
1127 			       unsigned long last_pfn)
1128 {
1129 	pfn = max(start_pfn, pfn);
1130 	pte = &pte[pfn_level_offset(pfn, level)];
1131 
1132 	do {
1133 		unsigned long level_pfn;
1134 		struct dma_pte *level_pte;
1135 
1136 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1137 			goto next;
1138 
1139 		level_pfn = pfn & level_mask(level);
1140 		level_pte = phys_to_virt(dma_pte_addr(pte));
1141 
1142 		if (level > 2) {
1143 			dma_pte_free_level(domain, level - 1, retain_level,
1144 					   level_pte, level_pfn, start_pfn,
1145 					   last_pfn);
1146 		}
1147 
1148 		/*
1149 		 * Free the page table if we're below the level we want to
1150 		 * retain and the range covers the entire table.
1151 		 */
1152 		if (level < retain_level && !(start_pfn > level_pfn ||
1153 		      last_pfn < level_pfn + level_size(level) - 1)) {
1154 			dma_clear_pte(pte);
1155 			domain_flush_cache(domain, pte, sizeof(*pte));
1156 			free_pgtable_page(level_pte);
1157 		}
1158 next:
1159 		pfn += level_size(level);
1160 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161 }
1162 
1163 /*
1164  * clear last level (leaf) ptes and free page table pages below the
1165  * level we wish to keep intact.
1166  */
1167 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168 				   unsigned long start_pfn,
1169 				   unsigned long last_pfn,
1170 				   int retain_level)
1171 {
1172 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 	BUG_ON(start_pfn > last_pfn);
1175 
1176 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1177 
1178 	/* We don't need lock here; nobody else touches the iova range */
1179 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1180 			   domain->pgd, 0, start_pfn, last_pfn);
1181 
1182 	/* free pgd */
1183 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184 		free_pgtable_page(domain->pgd);
1185 		domain->pgd = NULL;
1186 	}
1187 }
1188 
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196 					    int level, struct dma_pte *pte,
1197 					    struct page *freelist)
1198 {
1199 	struct page *pg;
1200 
1201 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202 	pg->freelist = freelist;
1203 	freelist = pg;
1204 
1205 	if (level == 1)
1206 		return freelist;
1207 
1208 	pte = page_address(pg);
1209 	do {
1210 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211 			freelist = dma_pte_list_pagetables(domain, level - 1,
1212 							   pte, freelist);
1213 		pte++;
1214 	} while (!first_pte_in_page(pte));
1215 
1216 	return freelist;
1217 }
1218 
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 					struct dma_pte *pte, unsigned long pfn,
1221 					unsigned long start_pfn,
1222 					unsigned long last_pfn,
1223 					struct page *freelist)
1224 {
1225 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226 
1227 	pfn = max(start_pfn, pfn);
1228 	pte = &pte[pfn_level_offset(pfn, level)];
1229 
1230 	do {
1231 		unsigned long level_pfn;
1232 
1233 		if (!dma_pte_present(pte))
1234 			goto next;
1235 
1236 		level_pfn = pfn & level_mask(level);
1237 
1238 		/* If range covers entire pagetable, free it */
1239 		if (start_pfn <= level_pfn &&
1240 		    last_pfn >= level_pfn + level_size(level) - 1) {
1241 			/* These suborbinate page tables are going away entirely. Don't
1242 			   bother to clear them; we're just going to *free* them. */
1243 			if (level > 1 && !dma_pte_superpage(pte))
1244 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245 
1246 			dma_clear_pte(pte);
1247 			if (!first_pte)
1248 				first_pte = pte;
1249 			last_pte = pte;
1250 		} else if (level > 1) {
1251 			/* Recurse down into a level that isn't *entirely* obsolete */
1252 			freelist = dma_pte_clear_level(domain, level - 1,
1253 						       phys_to_virt(dma_pte_addr(pte)),
1254 						       level_pfn, start_pfn, last_pfn,
1255 						       freelist);
1256 		}
1257 next:
1258 		pfn += level_size(level);
1259 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260 
1261 	if (first_pte)
1262 		domain_flush_cache(domain, first_pte,
1263 				   (void *)++last_pte - (void *)first_pte);
1264 
1265 	return freelist;
1266 }
1267 
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272 				 unsigned long start_pfn,
1273 				 unsigned long last_pfn,
1274 				 struct page *freelist)
1275 {
1276 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278 	BUG_ON(start_pfn > last_pfn);
1279 
1280 	/* we don't need lock here; nobody else touches the iova range */
1281 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282 				       domain->pgd, 0, start_pfn, last_pfn,
1283 				       freelist);
1284 
1285 	/* free pgd */
1286 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287 		struct page *pgd_page = virt_to_page(domain->pgd);
1288 		pgd_page->freelist = freelist;
1289 		freelist = pgd_page;
1290 
1291 		domain->pgd = NULL;
1292 	}
1293 
1294 	return freelist;
1295 }
1296 
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299 	struct page *pg;
1300 
1301 	while ((pg = freelist)) {
1302 		freelist = pg->freelist;
1303 		free_pgtable_page(page_address(pg));
1304 	}
1305 }
1306 
1307 /* iommu handling */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310 	struct root_entry *root;
1311 	unsigned long flags;
1312 
1313 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314 	if (!root) {
1315 		pr_err("Allocating root entry for %s failed\n",
1316 			iommu->name);
1317 		return -ENOMEM;
1318 	}
1319 
1320 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1321 
1322 	spin_lock_irqsave(&iommu->lock, flags);
1323 	iommu->root_entry = root;
1324 	spin_unlock_irqrestore(&iommu->lock, flags);
1325 
1326 	return 0;
1327 }
1328 
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331 	u64 addr;
1332 	u32 sts;
1333 	unsigned long flag;
1334 
1335 	addr = virt_to_phys(iommu->root_entry);
1336 	if (sm_supported(iommu))
1337 		addr |= DMA_RTADDR_SMT;
1338 
1339 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341 
1342 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343 
1344 	/* Make sure hardware complete it */
1345 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 		      readl, (sts & DMA_GSTS_RTPS), sts);
1347 
1348 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 
1350 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1351 	if (sm_supported(iommu))
1352 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1353 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1354 }
1355 
1356 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1357 {
1358 	u32 val;
1359 	unsigned long flag;
1360 
1361 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1362 		return;
1363 
1364 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1365 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1366 
1367 	/* Make sure hardware complete it */
1368 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1369 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1370 
1371 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1372 }
1373 
1374 /* return value determine if we need a write buffer flush */
1375 static void __iommu_flush_context(struct intel_iommu *iommu,
1376 				  u16 did, u16 source_id, u8 function_mask,
1377 				  u64 type)
1378 {
1379 	u64 val = 0;
1380 	unsigned long flag;
1381 
1382 	switch (type) {
1383 	case DMA_CCMD_GLOBAL_INVL:
1384 		val = DMA_CCMD_GLOBAL_INVL;
1385 		break;
1386 	case DMA_CCMD_DOMAIN_INVL:
1387 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1388 		break;
1389 	case DMA_CCMD_DEVICE_INVL:
1390 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1391 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1392 		break;
1393 	default:
1394 		BUG();
1395 	}
1396 	val |= DMA_CCMD_ICC;
1397 
1398 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1399 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1400 
1401 	/* Make sure hardware complete it */
1402 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1403 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1404 
1405 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406 }
1407 
1408 /* return value determine if we need a write buffer flush */
1409 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1410 				u64 addr, unsigned int size_order, u64 type)
1411 {
1412 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1413 	u64 val = 0, val_iva = 0;
1414 	unsigned long flag;
1415 
1416 	switch (type) {
1417 	case DMA_TLB_GLOBAL_FLUSH:
1418 		/* global flush doesn't need set IVA_REG */
1419 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1420 		break;
1421 	case DMA_TLB_DSI_FLUSH:
1422 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1423 		break;
1424 	case DMA_TLB_PSI_FLUSH:
1425 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1426 		/* IH bit is passed in as part of address */
1427 		val_iva = size_order | addr;
1428 		break;
1429 	default:
1430 		BUG();
1431 	}
1432 	/* Note: set drain read/write */
1433 #if 0
1434 	/*
1435 	 * This is probably to be super secure.. Looks like we can
1436 	 * ignore it without any impact.
1437 	 */
1438 	if (cap_read_drain(iommu->cap))
1439 		val |= DMA_TLB_READ_DRAIN;
1440 #endif
1441 	if (cap_write_drain(iommu->cap))
1442 		val |= DMA_TLB_WRITE_DRAIN;
1443 
1444 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1445 	/* Note: Only uses first TLB reg currently */
1446 	if (val_iva)
1447 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1448 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1449 
1450 	/* Make sure hardware complete it */
1451 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1452 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1453 
1454 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1455 
1456 	/* check IOTLB invalidation granularity */
1457 	if (DMA_TLB_IAIG(val) == 0)
1458 		pr_err("Flush IOTLB failed\n");
1459 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1460 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1461 			(unsigned long long)DMA_TLB_IIRG(type),
1462 			(unsigned long long)DMA_TLB_IAIG(val));
1463 }
1464 
1465 static struct device_domain_info *
1466 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1467 			 u8 bus, u8 devfn)
1468 {
1469 	struct device_domain_info *info;
1470 
1471 	assert_spin_locked(&device_domain_lock);
1472 
1473 	if (!iommu->qi)
1474 		return NULL;
1475 
1476 	list_for_each_entry(info, &domain->devices, link)
1477 		if (info->iommu == iommu && info->bus == bus &&
1478 		    info->devfn == devfn) {
1479 			if (info->ats_supported && info->dev)
1480 				return info;
1481 			break;
1482 		}
1483 
1484 	return NULL;
1485 }
1486 
1487 static void domain_update_iotlb(struct dmar_domain *domain)
1488 {
1489 	struct device_domain_info *info;
1490 	bool has_iotlb_device = false;
1491 
1492 	assert_spin_locked(&device_domain_lock);
1493 
1494 	list_for_each_entry(info, &domain->devices, link)
1495 		if (info->ats_enabled) {
1496 			has_iotlb_device = true;
1497 			break;
1498 		}
1499 
1500 	if (!has_iotlb_device) {
1501 		struct subdev_domain_info *sinfo;
1502 
1503 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1504 			info = get_domain_info(sinfo->pdev);
1505 			if (info && info->ats_enabled) {
1506 				has_iotlb_device = true;
1507 				break;
1508 			}
1509 		}
1510 	}
1511 
1512 	domain->has_iotlb_device = has_iotlb_device;
1513 }
1514 
1515 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1516 {
1517 	struct pci_dev *pdev;
1518 
1519 	assert_spin_locked(&device_domain_lock);
1520 
1521 	if (!info || !dev_is_pci(info->dev))
1522 		return;
1523 
1524 	pdev = to_pci_dev(info->dev);
1525 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1526 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1527 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1528 	 * reserved, which should be set to 0.
1529 	 */
1530 	if (!ecap_dit(info->iommu->ecap))
1531 		info->pfsid = 0;
1532 	else {
1533 		struct pci_dev *pf_pdev;
1534 
1535 		/* pdev will be returned if device is not a vf */
1536 		pf_pdev = pci_physfn(pdev);
1537 		info->pfsid = pci_dev_id(pf_pdev);
1538 	}
1539 
1540 #ifdef CONFIG_INTEL_IOMMU_SVM
1541 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1542 	   the device if you enable PASID support after ATS support is
1543 	   undefined. So always enable PASID support on devices which
1544 	   have it, even if we can't yet know if we're ever going to
1545 	   use it. */
1546 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1547 		info->pasid_enabled = 1;
1548 
1549 	if (info->pri_supported &&
1550 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1551 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1552 		info->pri_enabled = 1;
1553 #endif
1554 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1555 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1556 		info->ats_enabled = 1;
1557 		domain_update_iotlb(info->domain);
1558 		info->ats_qdep = pci_ats_queue_depth(pdev);
1559 	}
1560 }
1561 
1562 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1563 {
1564 	struct pci_dev *pdev;
1565 
1566 	assert_spin_locked(&device_domain_lock);
1567 
1568 	if (!dev_is_pci(info->dev))
1569 		return;
1570 
1571 	pdev = to_pci_dev(info->dev);
1572 
1573 	if (info->ats_enabled) {
1574 		pci_disable_ats(pdev);
1575 		info->ats_enabled = 0;
1576 		domain_update_iotlb(info->domain);
1577 	}
1578 #ifdef CONFIG_INTEL_IOMMU_SVM
1579 	if (info->pri_enabled) {
1580 		pci_disable_pri(pdev);
1581 		info->pri_enabled = 0;
1582 	}
1583 	if (info->pasid_enabled) {
1584 		pci_disable_pasid(pdev);
1585 		info->pasid_enabled = 0;
1586 	}
1587 #endif
1588 }
1589 
1590 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1591 				    u64 addr, unsigned int mask)
1592 {
1593 	u16 sid, qdep;
1594 
1595 	if (!info || !info->ats_enabled)
1596 		return;
1597 
1598 	sid = info->bus << 8 | info->devfn;
1599 	qdep = info->ats_qdep;
1600 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1601 			   qdep, addr, mask);
1602 }
1603 
1604 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1605 				  u64 addr, unsigned mask)
1606 {
1607 	unsigned long flags;
1608 	struct device_domain_info *info;
1609 	struct subdev_domain_info *sinfo;
1610 
1611 	if (!domain->has_iotlb_device)
1612 		return;
1613 
1614 	spin_lock_irqsave(&device_domain_lock, flags);
1615 	list_for_each_entry(info, &domain->devices, link)
1616 		__iommu_flush_dev_iotlb(info, addr, mask);
1617 
1618 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1619 		info = get_domain_info(sinfo->pdev);
1620 		__iommu_flush_dev_iotlb(info, addr, mask);
1621 	}
1622 	spin_unlock_irqrestore(&device_domain_lock, flags);
1623 }
1624 
1625 static void domain_flush_piotlb(struct intel_iommu *iommu,
1626 				struct dmar_domain *domain,
1627 				u64 addr, unsigned long npages, bool ih)
1628 {
1629 	u16 did = domain->iommu_did[iommu->seq_id];
1630 
1631 	if (domain->default_pasid)
1632 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1633 				addr, npages, ih);
1634 
1635 	if (!list_empty(&domain->devices))
1636 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1637 }
1638 
1639 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1640 				  struct dmar_domain *domain,
1641 				  unsigned long pfn, unsigned int pages,
1642 				  int ih, int map)
1643 {
1644 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1645 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1646 	u16 did = domain->iommu_did[iommu->seq_id];
1647 
1648 	BUG_ON(pages == 0);
1649 
1650 	if (ih)
1651 		ih = 1 << 6;
1652 
1653 	if (domain_use_first_level(domain)) {
1654 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1655 	} else {
1656 		/*
1657 		 * Fallback to domain selective flush if no PSI support or
1658 		 * the size is too big. PSI requires page size to be 2 ^ x,
1659 		 * and the base address is naturally aligned to the size.
1660 		 */
1661 		if (!cap_pgsel_inv(iommu->cap) ||
1662 		    mask > cap_max_amask_val(iommu->cap))
1663 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 							DMA_TLB_DSI_FLUSH);
1665 		else
1666 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1667 							DMA_TLB_PSI_FLUSH);
1668 	}
1669 
1670 	/*
1671 	 * In caching mode, changes of pages from non-present to present require
1672 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1673 	 */
1674 	if (!cap_caching_mode(iommu->cap) || !map)
1675 		iommu_flush_dev_iotlb(domain, addr, mask);
1676 }
1677 
1678 /* Notification for newly created mappings */
1679 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1680 					struct dmar_domain *domain,
1681 					unsigned long pfn, unsigned int pages)
1682 {
1683 	/*
1684 	 * It's a non-present to present mapping. Only flush if caching mode
1685 	 * and second level.
1686 	 */
1687 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1688 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1689 	else
1690 		iommu_flush_write_buffer(iommu);
1691 }
1692 
1693 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1694 {
1695 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1696 	int idx;
1697 
1698 	for_each_domain_iommu(idx, dmar_domain) {
1699 		struct intel_iommu *iommu = g_iommus[idx];
1700 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1701 
1702 		if (domain_use_first_level(dmar_domain))
1703 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1704 		else
1705 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1706 						 DMA_TLB_DSI_FLUSH);
1707 
1708 		if (!cap_caching_mode(iommu->cap))
1709 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1710 					      0, MAX_AGAW_PFN_WIDTH);
1711 	}
1712 }
1713 
1714 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1715 {
1716 	u32 pmen;
1717 	unsigned long flags;
1718 
1719 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1720 		return;
1721 
1722 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1723 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1724 	pmen &= ~DMA_PMEN_EPM;
1725 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1726 
1727 	/* wait for the protected region status bit to clear */
1728 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1729 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1730 
1731 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1732 }
1733 
1734 static void iommu_enable_translation(struct intel_iommu *iommu)
1735 {
1736 	u32 sts;
1737 	unsigned long flags;
1738 
1739 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1740 	iommu->gcmd |= DMA_GCMD_TE;
1741 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1742 
1743 	/* Make sure hardware complete it */
1744 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1745 		      readl, (sts & DMA_GSTS_TES), sts);
1746 
1747 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1748 }
1749 
1750 static void iommu_disable_translation(struct intel_iommu *iommu)
1751 {
1752 	u32 sts;
1753 	unsigned long flag;
1754 
1755 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1756 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1757 		return;
1758 
1759 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1760 	iommu->gcmd &= ~DMA_GCMD_TE;
1761 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1762 
1763 	/* Make sure hardware complete it */
1764 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1765 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1766 
1767 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1768 }
1769 
1770 static int iommu_init_domains(struct intel_iommu *iommu)
1771 {
1772 	u32 ndomains, nlongs;
1773 	size_t size;
1774 
1775 	ndomains = cap_ndoms(iommu->cap);
1776 	pr_debug("%s: Number of Domains supported <%d>\n",
1777 		 iommu->name, ndomains);
1778 	nlongs = BITS_TO_LONGS(ndomains);
1779 
1780 	spin_lock_init(&iommu->lock);
1781 
1782 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1783 	if (!iommu->domain_ids) {
1784 		pr_err("%s: Allocating domain id array failed\n",
1785 		       iommu->name);
1786 		return -ENOMEM;
1787 	}
1788 
1789 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1790 	iommu->domains = kzalloc(size, GFP_KERNEL);
1791 
1792 	if (iommu->domains) {
1793 		size = 256 * sizeof(struct dmar_domain *);
1794 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1795 	}
1796 
1797 	if (!iommu->domains || !iommu->domains[0]) {
1798 		pr_err("%s: Allocating domain array failed\n",
1799 		       iommu->name);
1800 		kfree(iommu->domain_ids);
1801 		kfree(iommu->domains);
1802 		iommu->domain_ids = NULL;
1803 		iommu->domains    = NULL;
1804 		return -ENOMEM;
1805 	}
1806 
1807 	/*
1808 	 * If Caching mode is set, then invalid translations are tagged
1809 	 * with domain-id 0, hence we need to pre-allocate it. We also
1810 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1811 	 * make sure it is not used for a real domain.
1812 	 */
1813 	set_bit(0, iommu->domain_ids);
1814 
1815 	/*
1816 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1817 	 * entry for first-level or pass-through translation modes should
1818 	 * be programmed with a domain id different from those used for
1819 	 * second-level or nested translation. We reserve a domain id for
1820 	 * this purpose.
1821 	 */
1822 	if (sm_supported(iommu))
1823 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1824 
1825 	return 0;
1826 }
1827 
1828 static void disable_dmar_iommu(struct intel_iommu *iommu)
1829 {
1830 	struct device_domain_info *info, *tmp;
1831 	unsigned long flags;
1832 
1833 	if (!iommu->domains || !iommu->domain_ids)
1834 		return;
1835 
1836 	spin_lock_irqsave(&device_domain_lock, flags);
1837 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1838 		if (info->iommu != iommu)
1839 			continue;
1840 
1841 		if (!info->dev || !info->domain)
1842 			continue;
1843 
1844 		__dmar_remove_one_dev_info(info);
1845 	}
1846 	spin_unlock_irqrestore(&device_domain_lock, flags);
1847 
1848 	if (iommu->gcmd & DMA_GCMD_TE)
1849 		iommu_disable_translation(iommu);
1850 }
1851 
1852 static void free_dmar_iommu(struct intel_iommu *iommu)
1853 {
1854 	if ((iommu->domains) && (iommu->domain_ids)) {
1855 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1856 		int i;
1857 
1858 		for (i = 0; i < elems; i++)
1859 			kfree(iommu->domains[i]);
1860 		kfree(iommu->domains);
1861 		kfree(iommu->domain_ids);
1862 		iommu->domains = NULL;
1863 		iommu->domain_ids = NULL;
1864 	}
1865 
1866 	g_iommus[iommu->seq_id] = NULL;
1867 
1868 	/* free context mapping */
1869 	free_context_table(iommu);
1870 
1871 #ifdef CONFIG_INTEL_IOMMU_SVM
1872 	if (pasid_supported(iommu)) {
1873 		if (ecap_prs(iommu->ecap))
1874 			intel_svm_finish_prq(iommu);
1875 	}
1876 	if (vccap_pasid(iommu->vccap))
1877 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1878 
1879 #endif
1880 }
1881 
1882 /*
1883  * Check and return whether first level is used by default for
1884  * DMA translation.
1885  */
1886 static bool first_level_by_default(void)
1887 {
1888 	return scalable_mode_support() && intel_cap_flts_sanity();
1889 }
1890 
1891 static struct dmar_domain *alloc_domain(int flags)
1892 {
1893 	struct dmar_domain *domain;
1894 
1895 	domain = alloc_domain_mem();
1896 	if (!domain)
1897 		return NULL;
1898 
1899 	memset(domain, 0, sizeof(*domain));
1900 	domain->nid = NUMA_NO_NODE;
1901 	domain->flags = flags;
1902 	if (first_level_by_default())
1903 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1904 	domain->has_iotlb_device = false;
1905 	INIT_LIST_HEAD(&domain->devices);
1906 	INIT_LIST_HEAD(&domain->subdevices);
1907 
1908 	return domain;
1909 }
1910 
1911 /* Must be called with iommu->lock */
1912 static int domain_attach_iommu(struct dmar_domain *domain,
1913 			       struct intel_iommu *iommu)
1914 {
1915 	unsigned long ndomains;
1916 	int num;
1917 
1918 	assert_spin_locked(&device_domain_lock);
1919 	assert_spin_locked(&iommu->lock);
1920 
1921 	domain->iommu_refcnt[iommu->seq_id] += 1;
1922 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1923 		ndomains = cap_ndoms(iommu->cap);
1924 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1925 
1926 		if (num >= ndomains) {
1927 			pr_err("%s: No free domain ids\n", iommu->name);
1928 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1929 			return -ENOSPC;
1930 		}
1931 
1932 		set_bit(num, iommu->domain_ids);
1933 		set_iommu_domain(iommu, num, domain);
1934 
1935 		domain->iommu_did[iommu->seq_id] = num;
1936 		domain->nid			 = iommu->node;
1937 
1938 		domain_update_iommu_cap(domain);
1939 	}
1940 
1941 	return 0;
1942 }
1943 
1944 static void domain_detach_iommu(struct dmar_domain *domain,
1945 				struct intel_iommu *iommu)
1946 {
1947 	int num;
1948 
1949 	assert_spin_locked(&device_domain_lock);
1950 	assert_spin_locked(&iommu->lock);
1951 
1952 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1953 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1954 		num = domain->iommu_did[iommu->seq_id];
1955 		clear_bit(num, iommu->domain_ids);
1956 		set_iommu_domain(iommu, num, NULL);
1957 
1958 		domain_update_iommu_cap(domain);
1959 		domain->iommu_did[iommu->seq_id] = 0;
1960 	}
1961 }
1962 
1963 static inline int guestwidth_to_adjustwidth(int gaw)
1964 {
1965 	int agaw;
1966 	int r = (gaw - 12) % 9;
1967 
1968 	if (r == 0)
1969 		agaw = gaw;
1970 	else
1971 		agaw = gaw + 9 - r;
1972 	if (agaw > 64)
1973 		agaw = 64;
1974 	return agaw;
1975 }
1976 
1977 static void domain_exit(struct dmar_domain *domain)
1978 {
1979 
1980 	/* Remove associated devices and clear attached or cached domains */
1981 	domain_remove_dev_info(domain);
1982 
1983 	/* destroy iovas */
1984 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1985 		iommu_put_dma_cookie(&domain->domain);
1986 
1987 	if (domain->pgd) {
1988 		struct page *freelist;
1989 
1990 		freelist = domain_unmap(domain, 0,
1991 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1992 		dma_free_pagelist(freelist);
1993 	}
1994 
1995 	free_domain_mem(domain);
1996 }
1997 
1998 /*
1999  * Get the PASID directory size for scalable mode context entry.
2000  * Value of X in the PDTS field of a scalable mode context entry
2001  * indicates PASID directory with 2^(X + 7) entries.
2002  */
2003 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2004 {
2005 	int pds, max_pde;
2006 
2007 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2008 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2009 	if (pds < 7)
2010 		return 0;
2011 
2012 	return pds - 7;
2013 }
2014 
2015 /*
2016  * Set the RID_PASID field of a scalable mode context entry. The
2017  * IOMMU hardware will use the PASID value set in this field for
2018  * DMA translations of DMA requests without PASID.
2019  */
2020 static inline void
2021 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2022 {
2023 	context->hi |= pasid & ((1 << 20) - 1);
2024 }
2025 
2026 /*
2027  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2028  * entry.
2029  */
2030 static inline void context_set_sm_dte(struct context_entry *context)
2031 {
2032 	context->lo |= (1 << 2);
2033 }
2034 
2035 /*
2036  * Set the PRE(Page Request Enable) field of a scalable mode context
2037  * entry.
2038  */
2039 static inline void context_set_sm_pre(struct context_entry *context)
2040 {
2041 	context->lo |= (1 << 4);
2042 }
2043 
2044 /* Convert value to context PASID directory size field coding. */
2045 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2046 
2047 static int domain_context_mapping_one(struct dmar_domain *domain,
2048 				      struct intel_iommu *iommu,
2049 				      struct pasid_table *table,
2050 				      u8 bus, u8 devfn)
2051 {
2052 	u16 did = domain->iommu_did[iommu->seq_id];
2053 	int translation = CONTEXT_TT_MULTI_LEVEL;
2054 	struct device_domain_info *info = NULL;
2055 	struct context_entry *context;
2056 	unsigned long flags;
2057 	int ret;
2058 
2059 	WARN_ON(did == 0);
2060 
2061 	if (hw_pass_through && domain_type_is_si(domain))
2062 		translation = CONTEXT_TT_PASS_THROUGH;
2063 
2064 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2065 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2066 
2067 	BUG_ON(!domain->pgd);
2068 
2069 	spin_lock_irqsave(&device_domain_lock, flags);
2070 	spin_lock(&iommu->lock);
2071 
2072 	ret = -ENOMEM;
2073 	context = iommu_context_addr(iommu, bus, devfn, 1);
2074 	if (!context)
2075 		goto out_unlock;
2076 
2077 	ret = 0;
2078 	if (context_present(context))
2079 		goto out_unlock;
2080 
2081 	/*
2082 	 * For kdump cases, old valid entries may be cached due to the
2083 	 * in-flight DMA and copied pgtable, but there is no unmapping
2084 	 * behaviour for them, thus we need an explicit cache flush for
2085 	 * the newly-mapped device. For kdump, at this point, the device
2086 	 * is supposed to finish reset at its driver probe stage, so no
2087 	 * in-flight DMA will exist, and we don't need to worry anymore
2088 	 * hereafter.
2089 	 */
2090 	if (context_copied(context)) {
2091 		u16 did_old = context_domain_id(context);
2092 
2093 		if (did_old < cap_ndoms(iommu->cap)) {
2094 			iommu->flush.flush_context(iommu, did_old,
2095 						   (((u16)bus) << 8) | devfn,
2096 						   DMA_CCMD_MASK_NOBIT,
2097 						   DMA_CCMD_DEVICE_INVL);
2098 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2099 						 DMA_TLB_DSI_FLUSH);
2100 		}
2101 	}
2102 
2103 	context_clear_entry(context);
2104 
2105 	if (sm_supported(iommu)) {
2106 		unsigned long pds;
2107 
2108 		WARN_ON(!table);
2109 
2110 		/* Setup the PASID DIR pointer: */
2111 		pds = context_get_sm_pds(table);
2112 		context->lo = (u64)virt_to_phys(table->table) |
2113 				context_pdts(pds);
2114 
2115 		/* Setup the RID_PASID field: */
2116 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2117 
2118 		/*
2119 		 * Setup the Device-TLB enable bit and Page request
2120 		 * Enable bit:
2121 		 */
2122 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2123 		if (info && info->ats_supported)
2124 			context_set_sm_dte(context);
2125 		if (info && info->pri_supported)
2126 			context_set_sm_pre(context);
2127 	} else {
2128 		struct dma_pte *pgd = domain->pgd;
2129 		int agaw;
2130 
2131 		context_set_domain_id(context, did);
2132 
2133 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2134 			/*
2135 			 * Skip top levels of page tables for iommu which has
2136 			 * less agaw than default. Unnecessary for PT mode.
2137 			 */
2138 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2139 				ret = -ENOMEM;
2140 				pgd = phys_to_virt(dma_pte_addr(pgd));
2141 				if (!dma_pte_present(pgd))
2142 					goto out_unlock;
2143 			}
2144 
2145 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 			if (info && info->ats_supported)
2147 				translation = CONTEXT_TT_DEV_IOTLB;
2148 			else
2149 				translation = CONTEXT_TT_MULTI_LEVEL;
2150 
2151 			context_set_address_root(context, virt_to_phys(pgd));
2152 			context_set_address_width(context, agaw);
2153 		} else {
2154 			/*
2155 			 * In pass through mode, AW must be programmed to
2156 			 * indicate the largest AGAW value supported by
2157 			 * hardware. And ASR is ignored by hardware.
2158 			 */
2159 			context_set_address_width(context, iommu->msagaw);
2160 		}
2161 
2162 		context_set_translation_type(context, translation);
2163 	}
2164 
2165 	context_set_fault_enable(context);
2166 	context_set_present(context);
2167 	if (!ecap_coherent(iommu->ecap))
2168 		clflush_cache_range(context, sizeof(*context));
2169 
2170 	/*
2171 	 * It's a non-present to present mapping. If hardware doesn't cache
2172 	 * non-present entry we only need to flush the write-buffer. If the
2173 	 * _does_ cache non-present entries, then it does so in the special
2174 	 * domain #0, which we have to flush:
2175 	 */
2176 	if (cap_caching_mode(iommu->cap)) {
2177 		iommu->flush.flush_context(iommu, 0,
2178 					   (((u16)bus) << 8) | devfn,
2179 					   DMA_CCMD_MASK_NOBIT,
2180 					   DMA_CCMD_DEVICE_INVL);
2181 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2182 	} else {
2183 		iommu_flush_write_buffer(iommu);
2184 	}
2185 	iommu_enable_dev_iotlb(info);
2186 
2187 	ret = 0;
2188 
2189 out_unlock:
2190 	spin_unlock(&iommu->lock);
2191 	spin_unlock_irqrestore(&device_domain_lock, flags);
2192 
2193 	return ret;
2194 }
2195 
2196 struct domain_context_mapping_data {
2197 	struct dmar_domain *domain;
2198 	struct intel_iommu *iommu;
2199 	struct pasid_table *table;
2200 };
2201 
2202 static int domain_context_mapping_cb(struct pci_dev *pdev,
2203 				     u16 alias, void *opaque)
2204 {
2205 	struct domain_context_mapping_data *data = opaque;
2206 
2207 	return domain_context_mapping_one(data->domain, data->iommu,
2208 					  data->table, PCI_BUS_NUM(alias),
2209 					  alias & 0xff);
2210 }
2211 
2212 static int
2213 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2214 {
2215 	struct domain_context_mapping_data data;
2216 	struct pasid_table *table;
2217 	struct intel_iommu *iommu;
2218 	u8 bus, devfn;
2219 
2220 	iommu = device_to_iommu(dev, &bus, &devfn);
2221 	if (!iommu)
2222 		return -ENODEV;
2223 
2224 	table = intel_pasid_get_table(dev);
2225 
2226 	if (!dev_is_pci(dev))
2227 		return domain_context_mapping_one(domain, iommu, table,
2228 						  bus, devfn);
2229 
2230 	data.domain = domain;
2231 	data.iommu = iommu;
2232 	data.table = table;
2233 
2234 	return pci_for_each_dma_alias(to_pci_dev(dev),
2235 				      &domain_context_mapping_cb, &data);
2236 }
2237 
2238 static int domain_context_mapped_cb(struct pci_dev *pdev,
2239 				    u16 alias, void *opaque)
2240 {
2241 	struct intel_iommu *iommu = opaque;
2242 
2243 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2244 }
2245 
2246 static int domain_context_mapped(struct device *dev)
2247 {
2248 	struct intel_iommu *iommu;
2249 	u8 bus, devfn;
2250 
2251 	iommu = device_to_iommu(dev, &bus, &devfn);
2252 	if (!iommu)
2253 		return -ENODEV;
2254 
2255 	if (!dev_is_pci(dev))
2256 		return device_context_mapped(iommu, bus, devfn);
2257 
2258 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2259 				       domain_context_mapped_cb, iommu);
2260 }
2261 
2262 /* Returns a number of VTD pages, but aligned to MM page size */
2263 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2264 					    size_t size)
2265 {
2266 	host_addr &= ~PAGE_MASK;
2267 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2268 }
2269 
2270 /* Return largest possible superpage level for a given mapping */
2271 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2272 					  unsigned long iov_pfn,
2273 					  unsigned long phy_pfn,
2274 					  unsigned long pages)
2275 {
2276 	int support, level = 1;
2277 	unsigned long pfnmerge;
2278 
2279 	support = domain->iommu_superpage;
2280 
2281 	/* To use a large page, the virtual *and* physical addresses
2282 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2283 	   of them will mean we have to use smaller pages. So just
2284 	   merge them and check both at once. */
2285 	pfnmerge = iov_pfn | phy_pfn;
2286 
2287 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2288 		pages >>= VTD_STRIDE_SHIFT;
2289 		if (!pages)
2290 			break;
2291 		pfnmerge >>= VTD_STRIDE_SHIFT;
2292 		level++;
2293 		support--;
2294 	}
2295 	return level;
2296 }
2297 
2298 /*
2299  * Ensure that old small page tables are removed to make room for superpage(s).
2300  * We're going to add new large pages, so make sure we don't remove their parent
2301  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2302  */
2303 static void switch_to_super_page(struct dmar_domain *domain,
2304 				 unsigned long start_pfn,
2305 				 unsigned long end_pfn, int level)
2306 {
2307 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2308 	struct dma_pte *pte = NULL;
2309 	int i;
2310 
2311 	while (start_pfn <= end_pfn) {
2312 		if (!pte)
2313 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2314 
2315 		if (dma_pte_present(pte)) {
2316 			dma_pte_free_pagetable(domain, start_pfn,
2317 					       start_pfn + lvl_pages - 1,
2318 					       level + 1);
2319 
2320 			for_each_domain_iommu(i, domain)
2321 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2322 						      start_pfn, lvl_pages,
2323 						      0, 0);
2324 		}
2325 
2326 		pte++;
2327 		start_pfn += lvl_pages;
2328 		if (first_pte_in_page(pte))
2329 			pte = NULL;
2330 	}
2331 }
2332 
2333 static int
2334 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2335 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2336 {
2337 	unsigned int largepage_lvl = 0;
2338 	unsigned long lvl_pages = 0;
2339 	struct dma_pte *pte = NULL;
2340 	phys_addr_t pteval;
2341 	u64 attr;
2342 
2343 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2344 
2345 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2346 		return -EINVAL;
2347 
2348 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2349 	attr |= DMA_FL_PTE_PRESENT;
2350 	if (domain_use_first_level(domain)) {
2351 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2352 
2353 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2354 			attr |= DMA_FL_PTE_ACCESS;
2355 			if (prot & DMA_PTE_WRITE)
2356 				attr |= DMA_FL_PTE_DIRTY;
2357 		}
2358 	}
2359 
2360 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2361 
2362 	while (nr_pages > 0) {
2363 		uint64_t tmp;
2364 
2365 		if (!pte) {
2366 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2367 					phys_pfn, nr_pages);
2368 
2369 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2370 			if (!pte)
2371 				return -ENOMEM;
2372 			/* It is large page*/
2373 			if (largepage_lvl > 1) {
2374 				unsigned long end_pfn;
2375 
2376 				pteval |= DMA_PTE_LARGE_PAGE;
2377 				end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2378 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2379 			} else {
2380 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2381 			}
2382 
2383 		}
2384 		/* We don't need lock here, nobody else
2385 		 * touches the iova range
2386 		 */
2387 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2388 		if (tmp) {
2389 			static int dumps = 5;
2390 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2391 				iov_pfn, tmp, (unsigned long long)pteval);
2392 			if (dumps) {
2393 				dumps--;
2394 				debug_dma_dump_mappings(NULL);
2395 			}
2396 			WARN_ON(1);
2397 		}
2398 
2399 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2400 
2401 		BUG_ON(nr_pages < lvl_pages);
2402 
2403 		nr_pages -= lvl_pages;
2404 		iov_pfn += lvl_pages;
2405 		phys_pfn += lvl_pages;
2406 		pteval += lvl_pages * VTD_PAGE_SIZE;
2407 
2408 		/* If the next PTE would be the first in a new page, then we
2409 		 * need to flush the cache on the entries we've just written.
2410 		 * And then we'll need to recalculate 'pte', so clear it and
2411 		 * let it get set again in the if (!pte) block above.
2412 		 *
2413 		 * If we're done (!nr_pages) we need to flush the cache too.
2414 		 *
2415 		 * Also if we've been setting superpages, we may need to
2416 		 * recalculate 'pte' and switch back to smaller pages for the
2417 		 * end of the mapping, if the trailing size is not enough to
2418 		 * use another superpage (i.e. nr_pages < lvl_pages).
2419 		 *
2420 		 * We leave clflush for the leaf pte changes to iotlb_sync_map()
2421 		 * callback.
2422 		 */
2423 		pte++;
2424 		if (!nr_pages || first_pte_in_page(pte) ||
2425 		    (largepage_lvl > 1 && nr_pages < lvl_pages))
2426 			pte = NULL;
2427 	}
2428 
2429 	return 0;
2430 }
2431 
2432 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2433 {
2434 	struct intel_iommu *iommu = info->iommu;
2435 	struct context_entry *context;
2436 	unsigned long flags;
2437 	u16 did_old;
2438 
2439 	if (!iommu)
2440 		return;
2441 
2442 	spin_lock_irqsave(&iommu->lock, flags);
2443 	context = iommu_context_addr(iommu, bus, devfn, 0);
2444 	if (!context) {
2445 		spin_unlock_irqrestore(&iommu->lock, flags);
2446 		return;
2447 	}
2448 
2449 	if (sm_supported(iommu)) {
2450 		if (hw_pass_through && domain_type_is_si(info->domain))
2451 			did_old = FLPT_DEFAULT_DID;
2452 		else
2453 			did_old = info->domain->iommu_did[iommu->seq_id];
2454 	} else {
2455 		did_old = context_domain_id(context);
2456 	}
2457 
2458 	context_clear_entry(context);
2459 	__iommu_flush_cache(iommu, context, sizeof(*context));
2460 	spin_unlock_irqrestore(&iommu->lock, flags);
2461 	iommu->flush.flush_context(iommu,
2462 				   did_old,
2463 				   (((u16)bus) << 8) | devfn,
2464 				   DMA_CCMD_MASK_NOBIT,
2465 				   DMA_CCMD_DEVICE_INVL);
2466 
2467 	if (sm_supported(iommu))
2468 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2469 
2470 	iommu->flush.flush_iotlb(iommu,
2471 				 did_old,
2472 				 0,
2473 				 0,
2474 				 DMA_TLB_DSI_FLUSH);
2475 
2476 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2477 }
2478 
2479 static inline void unlink_domain_info(struct device_domain_info *info)
2480 {
2481 	assert_spin_locked(&device_domain_lock);
2482 	list_del(&info->link);
2483 	list_del(&info->global);
2484 	if (info->dev)
2485 		dev_iommu_priv_set(info->dev, NULL);
2486 }
2487 
2488 static void domain_remove_dev_info(struct dmar_domain *domain)
2489 {
2490 	struct device_domain_info *info, *tmp;
2491 	unsigned long flags;
2492 
2493 	spin_lock_irqsave(&device_domain_lock, flags);
2494 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2495 		__dmar_remove_one_dev_info(info);
2496 	spin_unlock_irqrestore(&device_domain_lock, flags);
2497 }
2498 
2499 struct dmar_domain *find_domain(struct device *dev)
2500 {
2501 	struct device_domain_info *info;
2502 
2503 	if (unlikely(!dev || !dev->iommu))
2504 		return NULL;
2505 
2506 	if (unlikely(attach_deferred(dev)))
2507 		return NULL;
2508 
2509 	/* No lock here, assumes no domain exit in normal case */
2510 	info = get_domain_info(dev);
2511 	if (likely(info))
2512 		return info->domain;
2513 
2514 	return NULL;
2515 }
2516 
2517 static inline struct device_domain_info *
2518 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2519 {
2520 	struct device_domain_info *info;
2521 
2522 	list_for_each_entry(info, &device_domain_list, global)
2523 		if (info->segment == segment && info->bus == bus &&
2524 		    info->devfn == devfn)
2525 			return info;
2526 
2527 	return NULL;
2528 }
2529 
2530 static int domain_setup_first_level(struct intel_iommu *iommu,
2531 				    struct dmar_domain *domain,
2532 				    struct device *dev,
2533 				    u32 pasid)
2534 {
2535 	struct dma_pte *pgd = domain->pgd;
2536 	int agaw, level;
2537 	int flags = 0;
2538 
2539 	/*
2540 	 * Skip top levels of page tables for iommu which has
2541 	 * less agaw than default. Unnecessary for PT mode.
2542 	 */
2543 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2544 		pgd = phys_to_virt(dma_pte_addr(pgd));
2545 		if (!dma_pte_present(pgd))
2546 			return -ENOMEM;
2547 	}
2548 
2549 	level = agaw_to_level(agaw);
2550 	if (level != 4 && level != 5)
2551 		return -EINVAL;
2552 
2553 	if (pasid != PASID_RID2PASID)
2554 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2555 	if (level == 5)
2556 		flags |= PASID_FLAG_FL5LP;
2557 
2558 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2559 		flags |= PASID_FLAG_PAGE_SNOOP;
2560 
2561 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2562 					     domain->iommu_did[iommu->seq_id],
2563 					     flags);
2564 }
2565 
2566 static bool dev_is_real_dma_subdevice(struct device *dev)
2567 {
2568 	return dev && dev_is_pci(dev) &&
2569 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2570 }
2571 
2572 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2573 						    int bus, int devfn,
2574 						    struct device *dev,
2575 						    struct dmar_domain *domain)
2576 {
2577 	struct dmar_domain *found = NULL;
2578 	struct device_domain_info *info;
2579 	unsigned long flags;
2580 	int ret;
2581 
2582 	info = alloc_devinfo_mem();
2583 	if (!info)
2584 		return NULL;
2585 
2586 	if (!dev_is_real_dma_subdevice(dev)) {
2587 		info->bus = bus;
2588 		info->devfn = devfn;
2589 		info->segment = iommu->segment;
2590 	} else {
2591 		struct pci_dev *pdev = to_pci_dev(dev);
2592 
2593 		info->bus = pdev->bus->number;
2594 		info->devfn = pdev->devfn;
2595 		info->segment = pci_domain_nr(pdev->bus);
2596 	}
2597 
2598 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2599 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2600 	info->ats_qdep = 0;
2601 	info->dev = dev;
2602 	info->domain = domain;
2603 	info->iommu = iommu;
2604 	info->pasid_table = NULL;
2605 	info->auxd_enabled = 0;
2606 	INIT_LIST_HEAD(&info->subdevices);
2607 
2608 	if (dev && dev_is_pci(dev)) {
2609 		struct pci_dev *pdev = to_pci_dev(info->dev);
2610 
2611 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2612 		    pci_ats_supported(pdev) &&
2613 		    dmar_find_matched_atsr_unit(pdev))
2614 			info->ats_supported = 1;
2615 
2616 		if (sm_supported(iommu)) {
2617 			if (pasid_supported(iommu)) {
2618 				int features = pci_pasid_features(pdev);
2619 				if (features >= 0)
2620 					info->pasid_supported = features | 1;
2621 			}
2622 
2623 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2624 			    pci_pri_supported(pdev))
2625 				info->pri_supported = 1;
2626 		}
2627 	}
2628 
2629 	spin_lock_irqsave(&device_domain_lock, flags);
2630 	if (dev)
2631 		found = find_domain(dev);
2632 
2633 	if (!found) {
2634 		struct device_domain_info *info2;
2635 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2636 						       info->devfn);
2637 		if (info2) {
2638 			found      = info2->domain;
2639 			info2->dev = dev;
2640 		}
2641 	}
2642 
2643 	if (found) {
2644 		spin_unlock_irqrestore(&device_domain_lock, flags);
2645 		free_devinfo_mem(info);
2646 		/* Caller must free the original domain */
2647 		return found;
2648 	}
2649 
2650 	spin_lock(&iommu->lock);
2651 	ret = domain_attach_iommu(domain, iommu);
2652 	spin_unlock(&iommu->lock);
2653 
2654 	if (ret) {
2655 		spin_unlock_irqrestore(&device_domain_lock, flags);
2656 		free_devinfo_mem(info);
2657 		return NULL;
2658 	}
2659 
2660 	list_add(&info->link, &domain->devices);
2661 	list_add(&info->global, &device_domain_list);
2662 	if (dev)
2663 		dev_iommu_priv_set(dev, info);
2664 	spin_unlock_irqrestore(&device_domain_lock, flags);
2665 
2666 	/* PASID table is mandatory for a PCI device in scalable mode. */
2667 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2668 		ret = intel_pasid_alloc_table(dev);
2669 		if (ret) {
2670 			dev_err(dev, "PASID table allocation failed\n");
2671 			dmar_remove_one_dev_info(dev);
2672 			return NULL;
2673 		}
2674 
2675 		/* Setup the PASID entry for requests without PASID: */
2676 		spin_lock_irqsave(&iommu->lock, flags);
2677 		if (hw_pass_through && domain_type_is_si(domain))
2678 			ret = intel_pasid_setup_pass_through(iommu, domain,
2679 					dev, PASID_RID2PASID);
2680 		else if (domain_use_first_level(domain))
2681 			ret = domain_setup_first_level(iommu, domain, dev,
2682 					PASID_RID2PASID);
2683 		else
2684 			ret = intel_pasid_setup_second_level(iommu, domain,
2685 					dev, PASID_RID2PASID);
2686 		spin_unlock_irqrestore(&iommu->lock, flags);
2687 		if (ret) {
2688 			dev_err(dev, "Setup RID2PASID failed\n");
2689 			dmar_remove_one_dev_info(dev);
2690 			return NULL;
2691 		}
2692 	}
2693 
2694 	if (dev && domain_context_mapping(domain, dev)) {
2695 		dev_err(dev, "Domain context map failed\n");
2696 		dmar_remove_one_dev_info(dev);
2697 		return NULL;
2698 	}
2699 
2700 	return domain;
2701 }
2702 
2703 static int iommu_domain_identity_map(struct dmar_domain *domain,
2704 				     unsigned long first_vpfn,
2705 				     unsigned long last_vpfn)
2706 {
2707 	/*
2708 	 * RMRR range might have overlap with physical memory range,
2709 	 * clear it first
2710 	 */
2711 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2712 
2713 	return __domain_mapping(domain, first_vpfn,
2714 				first_vpfn, last_vpfn - first_vpfn + 1,
2715 				DMA_PTE_READ|DMA_PTE_WRITE);
2716 }
2717 
2718 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2719 
2720 static int __init si_domain_init(int hw)
2721 {
2722 	struct dmar_rmrr_unit *rmrr;
2723 	struct device *dev;
2724 	int i, nid, ret;
2725 
2726 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2727 	if (!si_domain)
2728 		return -EFAULT;
2729 
2730 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2731 		domain_exit(si_domain);
2732 		return -EFAULT;
2733 	}
2734 
2735 	if (hw)
2736 		return 0;
2737 
2738 	for_each_online_node(nid) {
2739 		unsigned long start_pfn, end_pfn;
2740 		int i;
2741 
2742 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2743 			ret = iommu_domain_identity_map(si_domain,
2744 					mm_to_dma_pfn(start_pfn),
2745 					mm_to_dma_pfn(end_pfn));
2746 			if (ret)
2747 				return ret;
2748 		}
2749 	}
2750 
2751 	/*
2752 	 * Identity map the RMRRs so that devices with RMRRs could also use
2753 	 * the si_domain.
2754 	 */
2755 	for_each_rmrr_units(rmrr) {
2756 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2757 					  i, dev) {
2758 			unsigned long long start = rmrr->base_address;
2759 			unsigned long long end = rmrr->end_address;
2760 
2761 			if (WARN_ON(end < start ||
2762 				    end >> agaw_to_width(si_domain->agaw)))
2763 				continue;
2764 
2765 			ret = iommu_domain_identity_map(si_domain,
2766 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2767 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2768 			if (ret)
2769 				return ret;
2770 		}
2771 	}
2772 
2773 	return 0;
2774 }
2775 
2776 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2777 {
2778 	struct dmar_domain *ndomain;
2779 	struct intel_iommu *iommu;
2780 	u8 bus, devfn;
2781 
2782 	iommu = device_to_iommu(dev, &bus, &devfn);
2783 	if (!iommu)
2784 		return -ENODEV;
2785 
2786 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2787 	if (ndomain != domain)
2788 		return -EBUSY;
2789 
2790 	return 0;
2791 }
2792 
2793 static bool device_has_rmrr(struct device *dev)
2794 {
2795 	struct dmar_rmrr_unit *rmrr;
2796 	struct device *tmp;
2797 	int i;
2798 
2799 	rcu_read_lock();
2800 	for_each_rmrr_units(rmrr) {
2801 		/*
2802 		 * Return TRUE if this RMRR contains the device that
2803 		 * is passed in.
2804 		 */
2805 		for_each_active_dev_scope(rmrr->devices,
2806 					  rmrr->devices_cnt, i, tmp)
2807 			if (tmp == dev ||
2808 			    is_downstream_to_pci_bridge(dev, tmp)) {
2809 				rcu_read_unlock();
2810 				return true;
2811 			}
2812 	}
2813 	rcu_read_unlock();
2814 	return false;
2815 }
2816 
2817 /**
2818  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2819  * is relaxable (ie. is allowed to be not enforced under some conditions)
2820  * @dev: device handle
2821  *
2822  * We assume that PCI USB devices with RMRRs have them largely
2823  * for historical reasons and that the RMRR space is not actively used post
2824  * boot.  This exclusion may change if vendors begin to abuse it.
2825  *
2826  * The same exception is made for graphics devices, with the requirement that
2827  * any use of the RMRR regions will be torn down before assigning the device
2828  * to a guest.
2829  *
2830  * Return: true if the RMRR is relaxable, false otherwise
2831  */
2832 static bool device_rmrr_is_relaxable(struct device *dev)
2833 {
2834 	struct pci_dev *pdev;
2835 
2836 	if (!dev_is_pci(dev))
2837 		return false;
2838 
2839 	pdev = to_pci_dev(dev);
2840 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2841 		return true;
2842 	else
2843 		return false;
2844 }
2845 
2846 /*
2847  * There are a couple cases where we need to restrict the functionality of
2848  * devices associated with RMRRs.  The first is when evaluating a device for
2849  * identity mapping because problems exist when devices are moved in and out
2850  * of domains and their respective RMRR information is lost.  This means that
2851  * a device with associated RMRRs will never be in a "passthrough" domain.
2852  * The second is use of the device through the IOMMU API.  This interface
2853  * expects to have full control of the IOVA space for the device.  We cannot
2854  * satisfy both the requirement that RMRR access is maintained and have an
2855  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2856  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2857  * We therefore prevent devices associated with an RMRR from participating in
2858  * the IOMMU API, which eliminates them from device assignment.
2859  *
2860  * In both cases, devices which have relaxable RMRRs are not concerned by this
2861  * restriction. See device_rmrr_is_relaxable comment.
2862  */
2863 static bool device_is_rmrr_locked(struct device *dev)
2864 {
2865 	if (!device_has_rmrr(dev))
2866 		return false;
2867 
2868 	if (device_rmrr_is_relaxable(dev))
2869 		return false;
2870 
2871 	return true;
2872 }
2873 
2874 /*
2875  * Return the required default domain type for a specific device.
2876  *
2877  * @dev: the device in query
2878  * @startup: true if this is during early boot
2879  *
2880  * Returns:
2881  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2882  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2883  *  - 0: both identity and dynamic domains work for this device
2884  */
2885 static int device_def_domain_type(struct device *dev)
2886 {
2887 	if (dev_is_pci(dev)) {
2888 		struct pci_dev *pdev = to_pci_dev(dev);
2889 
2890 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2891 			return IOMMU_DOMAIN_IDENTITY;
2892 
2893 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2894 			return IOMMU_DOMAIN_IDENTITY;
2895 	}
2896 
2897 	return 0;
2898 }
2899 
2900 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2901 {
2902 	/*
2903 	 * Start from the sane iommu hardware state.
2904 	 * If the queued invalidation is already initialized by us
2905 	 * (for example, while enabling interrupt-remapping) then
2906 	 * we got the things already rolling from a sane state.
2907 	 */
2908 	if (!iommu->qi) {
2909 		/*
2910 		 * Clear any previous faults.
2911 		 */
2912 		dmar_fault(-1, iommu);
2913 		/*
2914 		 * Disable queued invalidation if supported and already enabled
2915 		 * before OS handover.
2916 		 */
2917 		dmar_disable_qi(iommu);
2918 	}
2919 
2920 	if (dmar_enable_qi(iommu)) {
2921 		/*
2922 		 * Queued Invalidate not enabled, use Register Based Invalidate
2923 		 */
2924 		iommu->flush.flush_context = __iommu_flush_context;
2925 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2926 		pr_info("%s: Using Register based invalidation\n",
2927 			iommu->name);
2928 	} else {
2929 		iommu->flush.flush_context = qi_flush_context;
2930 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2931 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2932 	}
2933 }
2934 
2935 static int copy_context_table(struct intel_iommu *iommu,
2936 			      struct root_entry *old_re,
2937 			      struct context_entry **tbl,
2938 			      int bus, bool ext)
2939 {
2940 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2941 	struct context_entry *new_ce = NULL, ce;
2942 	struct context_entry *old_ce = NULL;
2943 	struct root_entry re;
2944 	phys_addr_t old_ce_phys;
2945 
2946 	tbl_idx = ext ? bus * 2 : bus;
2947 	memcpy(&re, old_re, sizeof(re));
2948 
2949 	for (devfn = 0; devfn < 256; devfn++) {
2950 		/* First calculate the correct index */
2951 		idx = (ext ? devfn * 2 : devfn) % 256;
2952 
2953 		if (idx == 0) {
2954 			/* First save what we may have and clean up */
2955 			if (new_ce) {
2956 				tbl[tbl_idx] = new_ce;
2957 				__iommu_flush_cache(iommu, new_ce,
2958 						    VTD_PAGE_SIZE);
2959 				pos = 1;
2960 			}
2961 
2962 			if (old_ce)
2963 				memunmap(old_ce);
2964 
2965 			ret = 0;
2966 			if (devfn < 0x80)
2967 				old_ce_phys = root_entry_lctp(&re);
2968 			else
2969 				old_ce_phys = root_entry_uctp(&re);
2970 
2971 			if (!old_ce_phys) {
2972 				if (ext && devfn == 0) {
2973 					/* No LCTP, try UCTP */
2974 					devfn = 0x7f;
2975 					continue;
2976 				} else {
2977 					goto out;
2978 				}
2979 			}
2980 
2981 			ret = -ENOMEM;
2982 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2983 					MEMREMAP_WB);
2984 			if (!old_ce)
2985 				goto out;
2986 
2987 			new_ce = alloc_pgtable_page(iommu->node);
2988 			if (!new_ce)
2989 				goto out_unmap;
2990 
2991 			ret = 0;
2992 		}
2993 
2994 		/* Now copy the context entry */
2995 		memcpy(&ce, old_ce + idx, sizeof(ce));
2996 
2997 		if (!__context_present(&ce))
2998 			continue;
2999 
3000 		did = context_domain_id(&ce);
3001 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3002 			set_bit(did, iommu->domain_ids);
3003 
3004 		/*
3005 		 * We need a marker for copied context entries. This
3006 		 * marker needs to work for the old format as well as
3007 		 * for extended context entries.
3008 		 *
3009 		 * Bit 67 of the context entry is used. In the old
3010 		 * format this bit is available to software, in the
3011 		 * extended format it is the PGE bit, but PGE is ignored
3012 		 * by HW if PASIDs are disabled (and thus still
3013 		 * available).
3014 		 *
3015 		 * So disable PASIDs first and then mark the entry
3016 		 * copied. This means that we don't copy PASID
3017 		 * translations from the old kernel, but this is fine as
3018 		 * faults there are not fatal.
3019 		 */
3020 		context_clear_pasid_enable(&ce);
3021 		context_set_copied(&ce);
3022 
3023 		new_ce[idx] = ce;
3024 	}
3025 
3026 	tbl[tbl_idx + pos] = new_ce;
3027 
3028 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3029 
3030 out_unmap:
3031 	memunmap(old_ce);
3032 
3033 out:
3034 	return ret;
3035 }
3036 
3037 static int copy_translation_tables(struct intel_iommu *iommu)
3038 {
3039 	struct context_entry **ctxt_tbls;
3040 	struct root_entry *old_rt;
3041 	phys_addr_t old_rt_phys;
3042 	int ctxt_table_entries;
3043 	unsigned long flags;
3044 	u64 rtaddr_reg;
3045 	int bus, ret;
3046 	bool new_ext, ext;
3047 
3048 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3049 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3050 	new_ext    = !!ecap_ecs(iommu->ecap);
3051 
3052 	/*
3053 	 * The RTT bit can only be changed when translation is disabled,
3054 	 * but disabling translation means to open a window for data
3055 	 * corruption. So bail out and don't copy anything if we would
3056 	 * have to change the bit.
3057 	 */
3058 	if (new_ext != ext)
3059 		return -EINVAL;
3060 
3061 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3062 	if (!old_rt_phys)
3063 		return -EINVAL;
3064 
3065 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3066 	if (!old_rt)
3067 		return -ENOMEM;
3068 
3069 	/* This is too big for the stack - allocate it from slab */
3070 	ctxt_table_entries = ext ? 512 : 256;
3071 	ret = -ENOMEM;
3072 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3073 	if (!ctxt_tbls)
3074 		goto out_unmap;
3075 
3076 	for (bus = 0; bus < 256; bus++) {
3077 		ret = copy_context_table(iommu, &old_rt[bus],
3078 					 ctxt_tbls, bus, ext);
3079 		if (ret) {
3080 			pr_err("%s: Failed to copy context table for bus %d\n",
3081 				iommu->name, bus);
3082 			continue;
3083 		}
3084 	}
3085 
3086 	spin_lock_irqsave(&iommu->lock, flags);
3087 
3088 	/* Context tables are copied, now write them to the root_entry table */
3089 	for (bus = 0; bus < 256; bus++) {
3090 		int idx = ext ? bus * 2 : bus;
3091 		u64 val;
3092 
3093 		if (ctxt_tbls[idx]) {
3094 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3095 			iommu->root_entry[bus].lo = val;
3096 		}
3097 
3098 		if (!ext || !ctxt_tbls[idx + 1])
3099 			continue;
3100 
3101 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3102 		iommu->root_entry[bus].hi = val;
3103 	}
3104 
3105 	spin_unlock_irqrestore(&iommu->lock, flags);
3106 
3107 	kfree(ctxt_tbls);
3108 
3109 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3110 
3111 	ret = 0;
3112 
3113 out_unmap:
3114 	memunmap(old_rt);
3115 
3116 	return ret;
3117 }
3118 
3119 #ifdef CONFIG_INTEL_IOMMU_SVM
3120 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3121 {
3122 	struct intel_iommu *iommu = data;
3123 	ioasid_t ioasid;
3124 
3125 	if (!iommu)
3126 		return INVALID_IOASID;
3127 	/*
3128 	 * VT-d virtual command interface always uses the full 20 bit
3129 	 * PASID range. Host can partition guest PASID range based on
3130 	 * policies but it is out of guest's control.
3131 	 */
3132 	if (min < PASID_MIN || max > intel_pasid_max_id)
3133 		return INVALID_IOASID;
3134 
3135 	if (vcmd_alloc_pasid(iommu, &ioasid))
3136 		return INVALID_IOASID;
3137 
3138 	return ioasid;
3139 }
3140 
3141 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3142 {
3143 	struct intel_iommu *iommu = data;
3144 
3145 	if (!iommu)
3146 		return;
3147 	/*
3148 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3149 	 * We can only free the PASID when all the devices are unbound.
3150 	 */
3151 	if (ioasid_find(NULL, ioasid, NULL)) {
3152 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3153 		return;
3154 	}
3155 	vcmd_free_pasid(iommu, ioasid);
3156 }
3157 
3158 static void register_pasid_allocator(struct intel_iommu *iommu)
3159 {
3160 	/*
3161 	 * If we are running in the host, no need for custom allocator
3162 	 * in that PASIDs are allocated from the host system-wide.
3163 	 */
3164 	if (!cap_caching_mode(iommu->cap))
3165 		return;
3166 
3167 	if (!sm_supported(iommu)) {
3168 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3169 		return;
3170 	}
3171 
3172 	/*
3173 	 * Register a custom PASID allocator if we are running in a guest,
3174 	 * guest PASID must be obtained via virtual command interface.
3175 	 * There can be multiple vIOMMUs in each guest but only one allocator
3176 	 * is active. All vIOMMU allocators will eventually be calling the same
3177 	 * host allocator.
3178 	 */
3179 	if (!vccap_pasid(iommu->vccap))
3180 		return;
3181 
3182 	pr_info("Register custom PASID allocator\n");
3183 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3184 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3185 	iommu->pasid_allocator.pdata = (void *)iommu;
3186 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3187 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3188 		/*
3189 		 * Disable scalable mode on this IOMMU if there
3190 		 * is no custom allocator. Mixing SM capable vIOMMU
3191 		 * and non-SM vIOMMU are not supported.
3192 		 */
3193 		intel_iommu_sm = 0;
3194 	}
3195 }
3196 #endif
3197 
3198 static int __init init_dmars(void)
3199 {
3200 	struct dmar_drhd_unit *drhd;
3201 	struct intel_iommu *iommu;
3202 	int ret;
3203 
3204 	/*
3205 	 * for each drhd
3206 	 *    allocate root
3207 	 *    initialize and program root entry to not present
3208 	 * endfor
3209 	 */
3210 	for_each_drhd_unit(drhd) {
3211 		/*
3212 		 * lock not needed as this is only incremented in the single
3213 		 * threaded kernel __init code path all other access are read
3214 		 * only
3215 		 */
3216 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3217 			g_num_of_iommus++;
3218 			continue;
3219 		}
3220 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3221 	}
3222 
3223 	/* Preallocate enough resources for IOMMU hot-addition */
3224 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3225 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3226 
3227 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3228 			GFP_KERNEL);
3229 	if (!g_iommus) {
3230 		pr_err("Allocating global iommu array failed\n");
3231 		ret = -ENOMEM;
3232 		goto error;
3233 	}
3234 
3235 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3236 	if (ret)
3237 		goto free_iommu;
3238 
3239 	for_each_iommu(iommu, drhd) {
3240 		if (drhd->ignored) {
3241 			iommu_disable_translation(iommu);
3242 			continue;
3243 		}
3244 
3245 		/*
3246 		 * Find the max pasid size of all IOMMU's in the system.
3247 		 * We need to ensure the system pasid table is no bigger
3248 		 * than the smallest supported.
3249 		 */
3250 		if (pasid_supported(iommu)) {
3251 			u32 temp = 2 << ecap_pss(iommu->ecap);
3252 
3253 			intel_pasid_max_id = min_t(u32, temp,
3254 						   intel_pasid_max_id);
3255 		}
3256 
3257 		g_iommus[iommu->seq_id] = iommu;
3258 
3259 		intel_iommu_init_qi(iommu);
3260 
3261 		ret = iommu_init_domains(iommu);
3262 		if (ret)
3263 			goto free_iommu;
3264 
3265 		init_translation_status(iommu);
3266 
3267 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3268 			iommu_disable_translation(iommu);
3269 			clear_translation_pre_enabled(iommu);
3270 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3271 				iommu->name);
3272 		}
3273 
3274 		/*
3275 		 * TBD:
3276 		 * we could share the same root & context tables
3277 		 * among all IOMMU's. Need to Split it later.
3278 		 */
3279 		ret = iommu_alloc_root_entry(iommu);
3280 		if (ret)
3281 			goto free_iommu;
3282 
3283 		if (translation_pre_enabled(iommu)) {
3284 			pr_info("Translation already enabled - trying to copy translation structures\n");
3285 
3286 			ret = copy_translation_tables(iommu);
3287 			if (ret) {
3288 				/*
3289 				 * We found the IOMMU with translation
3290 				 * enabled - but failed to copy over the
3291 				 * old root-entry table. Try to proceed
3292 				 * by disabling translation now and
3293 				 * allocating a clean root-entry table.
3294 				 * This might cause DMAR faults, but
3295 				 * probably the dump will still succeed.
3296 				 */
3297 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3298 				       iommu->name);
3299 				iommu_disable_translation(iommu);
3300 				clear_translation_pre_enabled(iommu);
3301 			} else {
3302 				pr_info("Copied translation tables from previous kernel for %s\n",
3303 					iommu->name);
3304 			}
3305 		}
3306 
3307 		if (!ecap_pass_through(iommu->ecap))
3308 			hw_pass_through = 0;
3309 		intel_svm_check(iommu);
3310 	}
3311 
3312 	/*
3313 	 * Now that qi is enabled on all iommus, set the root entry and flush
3314 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3315 	 * flush_context function will loop forever and the boot hangs.
3316 	 */
3317 	for_each_active_iommu(iommu, drhd) {
3318 		iommu_flush_write_buffer(iommu);
3319 #ifdef CONFIG_INTEL_IOMMU_SVM
3320 		register_pasid_allocator(iommu);
3321 #endif
3322 		iommu_set_root_entry(iommu);
3323 	}
3324 
3325 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3326 	dmar_map_gfx = 0;
3327 #endif
3328 
3329 	if (!dmar_map_gfx)
3330 		iommu_identity_mapping |= IDENTMAP_GFX;
3331 
3332 	check_tylersburg_isoch();
3333 
3334 	ret = si_domain_init(hw_pass_through);
3335 	if (ret)
3336 		goto free_iommu;
3337 
3338 	/*
3339 	 * for each drhd
3340 	 *   enable fault log
3341 	 *   global invalidate context cache
3342 	 *   global invalidate iotlb
3343 	 *   enable translation
3344 	 */
3345 	for_each_iommu(iommu, drhd) {
3346 		if (drhd->ignored) {
3347 			/*
3348 			 * we always have to disable PMRs or DMA may fail on
3349 			 * this device
3350 			 */
3351 			if (force_on)
3352 				iommu_disable_protect_mem_regions(iommu);
3353 			continue;
3354 		}
3355 
3356 		iommu_flush_write_buffer(iommu);
3357 
3358 #ifdef CONFIG_INTEL_IOMMU_SVM
3359 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3360 			/*
3361 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3362 			 * could cause possible lock race condition.
3363 			 */
3364 			up_write(&dmar_global_lock);
3365 			ret = intel_svm_enable_prq(iommu);
3366 			down_write(&dmar_global_lock);
3367 			if (ret)
3368 				goto free_iommu;
3369 		}
3370 #endif
3371 		ret = dmar_set_interrupt(iommu);
3372 		if (ret)
3373 			goto free_iommu;
3374 	}
3375 
3376 	return 0;
3377 
3378 free_iommu:
3379 	for_each_active_iommu(iommu, drhd) {
3380 		disable_dmar_iommu(iommu);
3381 		free_dmar_iommu(iommu);
3382 	}
3383 
3384 	kfree(g_iommus);
3385 
3386 error:
3387 	return ret;
3388 }
3389 
3390 static inline int iommu_domain_cache_init(void)
3391 {
3392 	int ret = 0;
3393 
3394 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3395 					 sizeof(struct dmar_domain),
3396 					 0,
3397 					 SLAB_HWCACHE_ALIGN,
3398 
3399 					 NULL);
3400 	if (!iommu_domain_cache) {
3401 		pr_err("Couldn't create iommu_domain cache\n");
3402 		ret = -ENOMEM;
3403 	}
3404 
3405 	return ret;
3406 }
3407 
3408 static inline int iommu_devinfo_cache_init(void)
3409 {
3410 	int ret = 0;
3411 
3412 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3413 					 sizeof(struct device_domain_info),
3414 					 0,
3415 					 SLAB_HWCACHE_ALIGN,
3416 					 NULL);
3417 	if (!iommu_devinfo_cache) {
3418 		pr_err("Couldn't create devinfo cache\n");
3419 		ret = -ENOMEM;
3420 	}
3421 
3422 	return ret;
3423 }
3424 
3425 static int __init iommu_init_mempool(void)
3426 {
3427 	int ret;
3428 	ret = iova_cache_get();
3429 	if (ret)
3430 		return ret;
3431 
3432 	ret = iommu_domain_cache_init();
3433 	if (ret)
3434 		goto domain_error;
3435 
3436 	ret = iommu_devinfo_cache_init();
3437 	if (!ret)
3438 		return ret;
3439 
3440 	kmem_cache_destroy(iommu_domain_cache);
3441 domain_error:
3442 	iova_cache_put();
3443 
3444 	return -ENOMEM;
3445 }
3446 
3447 static void __init iommu_exit_mempool(void)
3448 {
3449 	kmem_cache_destroy(iommu_devinfo_cache);
3450 	kmem_cache_destroy(iommu_domain_cache);
3451 	iova_cache_put();
3452 }
3453 
3454 static void __init init_no_remapping_devices(void)
3455 {
3456 	struct dmar_drhd_unit *drhd;
3457 	struct device *dev;
3458 	int i;
3459 
3460 	for_each_drhd_unit(drhd) {
3461 		if (!drhd->include_all) {
3462 			for_each_active_dev_scope(drhd->devices,
3463 						  drhd->devices_cnt, i, dev)
3464 				break;
3465 			/* ignore DMAR unit if no devices exist */
3466 			if (i == drhd->devices_cnt)
3467 				drhd->ignored = 1;
3468 		}
3469 	}
3470 
3471 	for_each_active_drhd_unit(drhd) {
3472 		if (drhd->include_all)
3473 			continue;
3474 
3475 		for_each_active_dev_scope(drhd->devices,
3476 					  drhd->devices_cnt, i, dev)
3477 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3478 				break;
3479 		if (i < drhd->devices_cnt)
3480 			continue;
3481 
3482 		/* This IOMMU has *only* gfx devices. Either bypass it or
3483 		   set the gfx_mapped flag, as appropriate */
3484 		drhd->gfx_dedicated = 1;
3485 		if (!dmar_map_gfx)
3486 			drhd->ignored = 1;
3487 	}
3488 }
3489 
3490 #ifdef CONFIG_SUSPEND
3491 static int init_iommu_hw(void)
3492 {
3493 	struct dmar_drhd_unit *drhd;
3494 	struct intel_iommu *iommu = NULL;
3495 
3496 	for_each_active_iommu(iommu, drhd)
3497 		if (iommu->qi)
3498 			dmar_reenable_qi(iommu);
3499 
3500 	for_each_iommu(iommu, drhd) {
3501 		if (drhd->ignored) {
3502 			/*
3503 			 * we always have to disable PMRs or DMA may fail on
3504 			 * this device
3505 			 */
3506 			if (force_on)
3507 				iommu_disable_protect_mem_regions(iommu);
3508 			continue;
3509 		}
3510 
3511 		iommu_flush_write_buffer(iommu);
3512 		iommu_set_root_entry(iommu);
3513 		iommu_enable_translation(iommu);
3514 		iommu_disable_protect_mem_regions(iommu);
3515 	}
3516 
3517 	return 0;
3518 }
3519 
3520 static void iommu_flush_all(void)
3521 {
3522 	struct dmar_drhd_unit *drhd;
3523 	struct intel_iommu *iommu;
3524 
3525 	for_each_active_iommu(iommu, drhd) {
3526 		iommu->flush.flush_context(iommu, 0, 0, 0,
3527 					   DMA_CCMD_GLOBAL_INVL);
3528 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3529 					 DMA_TLB_GLOBAL_FLUSH);
3530 	}
3531 }
3532 
3533 static int iommu_suspend(void)
3534 {
3535 	struct dmar_drhd_unit *drhd;
3536 	struct intel_iommu *iommu = NULL;
3537 	unsigned long flag;
3538 
3539 	for_each_active_iommu(iommu, drhd) {
3540 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3541 					     GFP_KERNEL);
3542 		if (!iommu->iommu_state)
3543 			goto nomem;
3544 	}
3545 
3546 	iommu_flush_all();
3547 
3548 	for_each_active_iommu(iommu, drhd) {
3549 		iommu_disable_translation(iommu);
3550 
3551 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3552 
3553 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3554 			readl(iommu->reg + DMAR_FECTL_REG);
3555 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3556 			readl(iommu->reg + DMAR_FEDATA_REG);
3557 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3558 			readl(iommu->reg + DMAR_FEADDR_REG);
3559 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3560 			readl(iommu->reg + DMAR_FEUADDR_REG);
3561 
3562 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3563 	}
3564 	return 0;
3565 
3566 nomem:
3567 	for_each_active_iommu(iommu, drhd)
3568 		kfree(iommu->iommu_state);
3569 
3570 	return -ENOMEM;
3571 }
3572 
3573 static void iommu_resume(void)
3574 {
3575 	struct dmar_drhd_unit *drhd;
3576 	struct intel_iommu *iommu = NULL;
3577 	unsigned long flag;
3578 
3579 	if (init_iommu_hw()) {
3580 		if (force_on)
3581 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3582 		else
3583 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3584 		return;
3585 	}
3586 
3587 	for_each_active_iommu(iommu, drhd) {
3588 
3589 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3590 
3591 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3592 			iommu->reg + DMAR_FECTL_REG);
3593 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3594 			iommu->reg + DMAR_FEDATA_REG);
3595 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3596 			iommu->reg + DMAR_FEADDR_REG);
3597 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3598 			iommu->reg + DMAR_FEUADDR_REG);
3599 
3600 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3601 	}
3602 
3603 	for_each_active_iommu(iommu, drhd)
3604 		kfree(iommu->iommu_state);
3605 }
3606 
3607 static struct syscore_ops iommu_syscore_ops = {
3608 	.resume		= iommu_resume,
3609 	.suspend	= iommu_suspend,
3610 };
3611 
3612 static void __init init_iommu_pm_ops(void)
3613 {
3614 	register_syscore_ops(&iommu_syscore_ops);
3615 }
3616 
3617 #else
3618 static inline void init_iommu_pm_ops(void) {}
3619 #endif	/* CONFIG_PM */
3620 
3621 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3622 {
3623 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3624 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3625 	    rmrr->end_address <= rmrr->base_address ||
3626 	    arch_rmrr_sanity_check(rmrr))
3627 		return -EINVAL;
3628 
3629 	return 0;
3630 }
3631 
3632 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3633 {
3634 	struct acpi_dmar_reserved_memory *rmrr;
3635 	struct dmar_rmrr_unit *rmrru;
3636 
3637 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3638 	if (rmrr_sanity_check(rmrr)) {
3639 		pr_warn(FW_BUG
3640 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3641 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3642 			   rmrr->base_address, rmrr->end_address,
3643 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3644 			   dmi_get_system_info(DMI_BIOS_VERSION),
3645 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3646 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3647 	}
3648 
3649 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3650 	if (!rmrru)
3651 		goto out;
3652 
3653 	rmrru->hdr = header;
3654 
3655 	rmrru->base_address = rmrr->base_address;
3656 	rmrru->end_address = rmrr->end_address;
3657 
3658 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3659 				((void *)rmrr) + rmrr->header.length,
3660 				&rmrru->devices_cnt);
3661 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3662 		goto free_rmrru;
3663 
3664 	list_add(&rmrru->list, &dmar_rmrr_units);
3665 
3666 	return 0;
3667 free_rmrru:
3668 	kfree(rmrru);
3669 out:
3670 	return -ENOMEM;
3671 }
3672 
3673 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3674 {
3675 	struct dmar_atsr_unit *atsru;
3676 	struct acpi_dmar_atsr *tmp;
3677 
3678 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3679 				dmar_rcu_check()) {
3680 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3681 		if (atsr->segment != tmp->segment)
3682 			continue;
3683 		if (atsr->header.length != tmp->header.length)
3684 			continue;
3685 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3686 			return atsru;
3687 	}
3688 
3689 	return NULL;
3690 }
3691 
3692 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3693 {
3694 	struct acpi_dmar_atsr *atsr;
3695 	struct dmar_atsr_unit *atsru;
3696 
3697 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3698 		return 0;
3699 
3700 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3701 	atsru = dmar_find_atsr(atsr);
3702 	if (atsru)
3703 		return 0;
3704 
3705 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3706 	if (!atsru)
3707 		return -ENOMEM;
3708 
3709 	/*
3710 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3711 	 * copy the memory content because the memory buffer will be freed
3712 	 * on return.
3713 	 */
3714 	atsru->hdr = (void *)(atsru + 1);
3715 	memcpy(atsru->hdr, hdr, hdr->length);
3716 	atsru->include_all = atsr->flags & 0x1;
3717 	if (!atsru->include_all) {
3718 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3719 				(void *)atsr + atsr->header.length,
3720 				&atsru->devices_cnt);
3721 		if (atsru->devices_cnt && atsru->devices == NULL) {
3722 			kfree(atsru);
3723 			return -ENOMEM;
3724 		}
3725 	}
3726 
3727 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3728 
3729 	return 0;
3730 }
3731 
3732 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3733 {
3734 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3735 	kfree(atsru);
3736 }
3737 
3738 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3739 {
3740 	struct acpi_dmar_atsr *atsr;
3741 	struct dmar_atsr_unit *atsru;
3742 
3743 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3744 	atsru = dmar_find_atsr(atsr);
3745 	if (atsru) {
3746 		list_del_rcu(&atsru->list);
3747 		synchronize_rcu();
3748 		intel_iommu_free_atsr(atsru);
3749 	}
3750 
3751 	return 0;
3752 }
3753 
3754 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3755 {
3756 	int i;
3757 	struct device *dev;
3758 	struct acpi_dmar_atsr *atsr;
3759 	struct dmar_atsr_unit *atsru;
3760 
3761 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3762 	atsru = dmar_find_atsr(atsr);
3763 	if (!atsru)
3764 		return 0;
3765 
3766 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3767 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3768 					  i, dev)
3769 			return -EBUSY;
3770 	}
3771 
3772 	return 0;
3773 }
3774 
3775 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3776 {
3777 	struct dmar_satc_unit *satcu;
3778 	struct acpi_dmar_satc *tmp;
3779 
3780 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3781 				dmar_rcu_check()) {
3782 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3783 		if (satc->segment != tmp->segment)
3784 			continue;
3785 		if (satc->header.length != tmp->header.length)
3786 			continue;
3787 		if (memcmp(satc, tmp, satc->header.length) == 0)
3788 			return satcu;
3789 	}
3790 
3791 	return NULL;
3792 }
3793 
3794 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3795 {
3796 	struct acpi_dmar_satc *satc;
3797 	struct dmar_satc_unit *satcu;
3798 
3799 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3800 		return 0;
3801 
3802 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3803 	satcu = dmar_find_satc(satc);
3804 	if (satcu)
3805 		return 0;
3806 
3807 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3808 	if (!satcu)
3809 		return -ENOMEM;
3810 
3811 	satcu->hdr = (void *)(satcu + 1);
3812 	memcpy(satcu->hdr, hdr, hdr->length);
3813 	satcu->atc_required = satc->flags & 0x1;
3814 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3815 					      (void *)satc + satc->header.length,
3816 					      &satcu->devices_cnt);
3817 	if (satcu->devices_cnt && !satcu->devices) {
3818 		kfree(satcu);
3819 		return -ENOMEM;
3820 	}
3821 	list_add_rcu(&satcu->list, &dmar_satc_units);
3822 
3823 	return 0;
3824 }
3825 
3826 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3827 {
3828 	int sp, ret;
3829 	struct intel_iommu *iommu = dmaru->iommu;
3830 
3831 	if (g_iommus[iommu->seq_id])
3832 		return 0;
3833 
3834 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3835 	if (ret)
3836 		goto out;
3837 
3838 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3839 		pr_warn("%s: Doesn't support hardware pass through.\n",
3840 			iommu->name);
3841 		return -ENXIO;
3842 	}
3843 	if (!ecap_sc_support(iommu->ecap) &&
3844 	    domain_update_iommu_snooping(iommu)) {
3845 		pr_warn("%s: Doesn't support snooping.\n",
3846 			iommu->name);
3847 		return -ENXIO;
3848 	}
3849 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3850 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3851 		pr_warn("%s: Doesn't support large page.\n",
3852 			iommu->name);
3853 		return -ENXIO;
3854 	}
3855 
3856 	/*
3857 	 * Disable translation if already enabled prior to OS handover.
3858 	 */
3859 	if (iommu->gcmd & DMA_GCMD_TE)
3860 		iommu_disable_translation(iommu);
3861 
3862 	g_iommus[iommu->seq_id] = iommu;
3863 	ret = iommu_init_domains(iommu);
3864 	if (ret == 0)
3865 		ret = iommu_alloc_root_entry(iommu);
3866 	if (ret)
3867 		goto out;
3868 
3869 	intel_svm_check(iommu);
3870 
3871 	if (dmaru->ignored) {
3872 		/*
3873 		 * we always have to disable PMRs or DMA may fail on this device
3874 		 */
3875 		if (force_on)
3876 			iommu_disable_protect_mem_regions(iommu);
3877 		return 0;
3878 	}
3879 
3880 	intel_iommu_init_qi(iommu);
3881 	iommu_flush_write_buffer(iommu);
3882 
3883 #ifdef CONFIG_INTEL_IOMMU_SVM
3884 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3885 		ret = intel_svm_enable_prq(iommu);
3886 		if (ret)
3887 			goto disable_iommu;
3888 	}
3889 #endif
3890 	ret = dmar_set_interrupt(iommu);
3891 	if (ret)
3892 		goto disable_iommu;
3893 
3894 	iommu_set_root_entry(iommu);
3895 	iommu_enable_translation(iommu);
3896 
3897 	iommu_disable_protect_mem_regions(iommu);
3898 	return 0;
3899 
3900 disable_iommu:
3901 	disable_dmar_iommu(iommu);
3902 out:
3903 	free_dmar_iommu(iommu);
3904 	return ret;
3905 }
3906 
3907 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3908 {
3909 	int ret = 0;
3910 	struct intel_iommu *iommu = dmaru->iommu;
3911 
3912 	if (!intel_iommu_enabled)
3913 		return 0;
3914 	if (iommu == NULL)
3915 		return -EINVAL;
3916 
3917 	if (insert) {
3918 		ret = intel_iommu_add(dmaru);
3919 	} else {
3920 		disable_dmar_iommu(iommu);
3921 		free_dmar_iommu(iommu);
3922 	}
3923 
3924 	return ret;
3925 }
3926 
3927 static void intel_iommu_free_dmars(void)
3928 {
3929 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3930 	struct dmar_atsr_unit *atsru, *atsr_n;
3931 	struct dmar_satc_unit *satcu, *satc_n;
3932 
3933 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3934 		list_del(&rmrru->list);
3935 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3936 		kfree(rmrru);
3937 	}
3938 
3939 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3940 		list_del(&atsru->list);
3941 		intel_iommu_free_atsr(atsru);
3942 	}
3943 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3944 		list_del(&satcu->list);
3945 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3946 		kfree(satcu);
3947 	}
3948 }
3949 
3950 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3951 {
3952 	int i, ret = 1;
3953 	struct pci_bus *bus;
3954 	struct pci_dev *bridge = NULL;
3955 	struct device *tmp;
3956 	struct acpi_dmar_atsr *atsr;
3957 	struct dmar_atsr_unit *atsru;
3958 
3959 	dev = pci_physfn(dev);
3960 	for (bus = dev->bus; bus; bus = bus->parent) {
3961 		bridge = bus->self;
3962 		/* If it's an integrated device, allow ATS */
3963 		if (!bridge)
3964 			return 1;
3965 		/* Connected via non-PCIe: no ATS */
3966 		if (!pci_is_pcie(bridge) ||
3967 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3968 			return 0;
3969 		/* If we found the root port, look it up in the ATSR */
3970 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3971 			break;
3972 	}
3973 
3974 	rcu_read_lock();
3975 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3976 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3977 		if (atsr->segment != pci_domain_nr(dev->bus))
3978 			continue;
3979 
3980 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3981 			if (tmp == &bridge->dev)
3982 				goto out;
3983 
3984 		if (atsru->include_all)
3985 			goto out;
3986 	}
3987 	ret = 0;
3988 out:
3989 	rcu_read_unlock();
3990 
3991 	return ret;
3992 }
3993 
3994 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3995 {
3996 	int ret;
3997 	struct dmar_rmrr_unit *rmrru;
3998 	struct dmar_atsr_unit *atsru;
3999 	struct dmar_satc_unit *satcu;
4000 	struct acpi_dmar_atsr *atsr;
4001 	struct acpi_dmar_reserved_memory *rmrr;
4002 	struct acpi_dmar_satc *satc;
4003 
4004 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4005 		return 0;
4006 
4007 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4008 		rmrr = container_of(rmrru->hdr,
4009 				    struct acpi_dmar_reserved_memory, header);
4010 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4011 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4012 				((void *)rmrr) + rmrr->header.length,
4013 				rmrr->segment, rmrru->devices,
4014 				rmrru->devices_cnt);
4015 			if (ret < 0)
4016 				return ret;
4017 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4018 			dmar_remove_dev_scope(info, rmrr->segment,
4019 				rmrru->devices, rmrru->devices_cnt);
4020 		}
4021 	}
4022 
4023 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4024 		if (atsru->include_all)
4025 			continue;
4026 
4027 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4028 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4029 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4030 					(void *)atsr + atsr->header.length,
4031 					atsr->segment, atsru->devices,
4032 					atsru->devices_cnt);
4033 			if (ret > 0)
4034 				break;
4035 			else if (ret < 0)
4036 				return ret;
4037 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4038 			if (dmar_remove_dev_scope(info, atsr->segment,
4039 					atsru->devices, atsru->devices_cnt))
4040 				break;
4041 		}
4042 	}
4043 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4044 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4045 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4046 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4047 					(void *)satc + satc->header.length,
4048 					satc->segment, satcu->devices,
4049 					satcu->devices_cnt);
4050 			if (ret > 0)
4051 				break;
4052 			else if (ret < 0)
4053 				return ret;
4054 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4055 			if (dmar_remove_dev_scope(info, satc->segment,
4056 					satcu->devices, satcu->devices_cnt))
4057 				break;
4058 		}
4059 	}
4060 
4061 	return 0;
4062 }
4063 
4064 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4065 				       unsigned long val, void *v)
4066 {
4067 	struct memory_notify *mhp = v;
4068 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4069 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4070 			mhp->nr_pages - 1);
4071 
4072 	switch (val) {
4073 	case MEM_GOING_ONLINE:
4074 		if (iommu_domain_identity_map(si_domain,
4075 					      start_vpfn, last_vpfn)) {
4076 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4077 				start_vpfn, last_vpfn);
4078 			return NOTIFY_BAD;
4079 		}
4080 		break;
4081 
4082 	case MEM_OFFLINE:
4083 	case MEM_CANCEL_ONLINE:
4084 		{
4085 			struct dmar_drhd_unit *drhd;
4086 			struct intel_iommu *iommu;
4087 			struct page *freelist;
4088 
4089 			freelist = domain_unmap(si_domain,
4090 						start_vpfn, last_vpfn,
4091 						NULL);
4092 
4093 			rcu_read_lock();
4094 			for_each_active_iommu(iommu, drhd)
4095 				iommu_flush_iotlb_psi(iommu, si_domain,
4096 					start_vpfn, mhp->nr_pages,
4097 					!freelist, 0);
4098 			rcu_read_unlock();
4099 			dma_free_pagelist(freelist);
4100 		}
4101 		break;
4102 	}
4103 
4104 	return NOTIFY_OK;
4105 }
4106 
4107 static struct notifier_block intel_iommu_memory_nb = {
4108 	.notifier_call = intel_iommu_memory_notifier,
4109 	.priority = 0
4110 };
4111 
4112 static void intel_disable_iommus(void)
4113 {
4114 	struct intel_iommu *iommu = NULL;
4115 	struct dmar_drhd_unit *drhd;
4116 
4117 	for_each_iommu(iommu, drhd)
4118 		iommu_disable_translation(iommu);
4119 }
4120 
4121 void intel_iommu_shutdown(void)
4122 {
4123 	struct dmar_drhd_unit *drhd;
4124 	struct intel_iommu *iommu = NULL;
4125 
4126 	if (no_iommu || dmar_disabled)
4127 		return;
4128 
4129 	down_write(&dmar_global_lock);
4130 
4131 	/* Disable PMRs explicitly here. */
4132 	for_each_iommu(iommu, drhd)
4133 		iommu_disable_protect_mem_regions(iommu);
4134 
4135 	/* Make sure the IOMMUs are switched off */
4136 	intel_disable_iommus();
4137 
4138 	up_write(&dmar_global_lock);
4139 }
4140 
4141 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4142 {
4143 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4144 
4145 	return container_of(iommu_dev, struct intel_iommu, iommu);
4146 }
4147 
4148 static ssize_t version_show(struct device *dev,
4149 			    struct device_attribute *attr, char *buf)
4150 {
4151 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4152 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4153 	return sprintf(buf, "%d:%d\n",
4154 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4155 }
4156 static DEVICE_ATTR_RO(version);
4157 
4158 static ssize_t address_show(struct device *dev,
4159 			    struct device_attribute *attr, char *buf)
4160 {
4161 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4162 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4163 }
4164 static DEVICE_ATTR_RO(address);
4165 
4166 static ssize_t cap_show(struct device *dev,
4167 			struct device_attribute *attr, char *buf)
4168 {
4169 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4170 	return sprintf(buf, "%llx\n", iommu->cap);
4171 }
4172 static DEVICE_ATTR_RO(cap);
4173 
4174 static ssize_t ecap_show(struct device *dev,
4175 			 struct device_attribute *attr, char *buf)
4176 {
4177 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4178 	return sprintf(buf, "%llx\n", iommu->ecap);
4179 }
4180 static DEVICE_ATTR_RO(ecap);
4181 
4182 static ssize_t domains_supported_show(struct device *dev,
4183 				      struct device_attribute *attr, char *buf)
4184 {
4185 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4186 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4187 }
4188 static DEVICE_ATTR_RO(domains_supported);
4189 
4190 static ssize_t domains_used_show(struct device *dev,
4191 				 struct device_attribute *attr, char *buf)
4192 {
4193 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4194 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4195 						  cap_ndoms(iommu->cap)));
4196 }
4197 static DEVICE_ATTR_RO(domains_used);
4198 
4199 static struct attribute *intel_iommu_attrs[] = {
4200 	&dev_attr_version.attr,
4201 	&dev_attr_address.attr,
4202 	&dev_attr_cap.attr,
4203 	&dev_attr_ecap.attr,
4204 	&dev_attr_domains_supported.attr,
4205 	&dev_attr_domains_used.attr,
4206 	NULL,
4207 };
4208 
4209 static struct attribute_group intel_iommu_group = {
4210 	.name = "intel-iommu",
4211 	.attrs = intel_iommu_attrs,
4212 };
4213 
4214 const struct attribute_group *intel_iommu_groups[] = {
4215 	&intel_iommu_group,
4216 	NULL,
4217 };
4218 
4219 static inline bool has_external_pci(void)
4220 {
4221 	struct pci_dev *pdev = NULL;
4222 
4223 	for_each_pci_dev(pdev)
4224 		if (pdev->external_facing)
4225 			return true;
4226 
4227 	return false;
4228 }
4229 
4230 static int __init platform_optin_force_iommu(void)
4231 {
4232 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4233 		return 0;
4234 
4235 	if (no_iommu || dmar_disabled)
4236 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4237 
4238 	/*
4239 	 * If Intel-IOMMU is disabled by default, we will apply identity
4240 	 * map for all devices except those marked as being untrusted.
4241 	 */
4242 	if (dmar_disabled)
4243 		iommu_set_default_passthrough(false);
4244 
4245 	dmar_disabled = 0;
4246 	no_iommu = 0;
4247 
4248 	return 1;
4249 }
4250 
4251 static int __init probe_acpi_namespace_devices(void)
4252 {
4253 	struct dmar_drhd_unit *drhd;
4254 	/* To avoid a -Wunused-but-set-variable warning. */
4255 	struct intel_iommu *iommu __maybe_unused;
4256 	struct device *dev;
4257 	int i, ret = 0;
4258 
4259 	for_each_active_iommu(iommu, drhd) {
4260 		for_each_active_dev_scope(drhd->devices,
4261 					  drhd->devices_cnt, i, dev) {
4262 			struct acpi_device_physical_node *pn;
4263 			struct iommu_group *group;
4264 			struct acpi_device *adev;
4265 
4266 			if (dev->bus != &acpi_bus_type)
4267 				continue;
4268 
4269 			adev = to_acpi_device(dev);
4270 			mutex_lock(&adev->physical_node_lock);
4271 			list_for_each_entry(pn,
4272 					    &adev->physical_node_list, node) {
4273 				group = iommu_group_get(pn->dev);
4274 				if (group) {
4275 					iommu_group_put(group);
4276 					continue;
4277 				}
4278 
4279 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4280 				ret = iommu_probe_device(pn->dev);
4281 				if (ret)
4282 					break;
4283 			}
4284 			mutex_unlock(&adev->physical_node_lock);
4285 
4286 			if (ret)
4287 				return ret;
4288 		}
4289 	}
4290 
4291 	return 0;
4292 }
4293 
4294 int __init intel_iommu_init(void)
4295 {
4296 	int ret = -ENODEV;
4297 	struct dmar_drhd_unit *drhd;
4298 	struct intel_iommu *iommu;
4299 
4300 	/*
4301 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4302 	 * opt in, so enforce that.
4303 	 */
4304 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4305 		    platform_optin_force_iommu();
4306 
4307 	if (iommu_init_mempool()) {
4308 		if (force_on)
4309 			panic("tboot: Failed to initialize iommu memory\n");
4310 		return -ENOMEM;
4311 	}
4312 
4313 	down_write(&dmar_global_lock);
4314 	if (dmar_table_init()) {
4315 		if (force_on)
4316 			panic("tboot: Failed to initialize DMAR table\n");
4317 		goto out_free_dmar;
4318 	}
4319 
4320 	if (dmar_dev_scope_init() < 0) {
4321 		if (force_on)
4322 			panic("tboot: Failed to initialize DMAR device scope\n");
4323 		goto out_free_dmar;
4324 	}
4325 
4326 	up_write(&dmar_global_lock);
4327 
4328 	/*
4329 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4330 	 * complain later when we register it under the lock.
4331 	 */
4332 	dmar_register_bus_notifier();
4333 
4334 	down_write(&dmar_global_lock);
4335 
4336 	if (!no_iommu)
4337 		intel_iommu_debugfs_init();
4338 
4339 	if (no_iommu || dmar_disabled) {
4340 		/*
4341 		 * We exit the function here to ensure IOMMU's remapping and
4342 		 * mempool aren't setup, which means that the IOMMU's PMRs
4343 		 * won't be disabled via the call to init_dmars(). So disable
4344 		 * it explicitly here. The PMRs were setup by tboot prior to
4345 		 * calling SENTER, but the kernel is expected to reset/tear
4346 		 * down the PMRs.
4347 		 */
4348 		if (intel_iommu_tboot_noforce) {
4349 			for_each_iommu(iommu, drhd)
4350 				iommu_disable_protect_mem_regions(iommu);
4351 		}
4352 
4353 		/*
4354 		 * Make sure the IOMMUs are switched off, even when we
4355 		 * boot into a kexec kernel and the previous kernel left
4356 		 * them enabled
4357 		 */
4358 		intel_disable_iommus();
4359 		goto out_free_dmar;
4360 	}
4361 
4362 	if (list_empty(&dmar_rmrr_units))
4363 		pr_info("No RMRR found\n");
4364 
4365 	if (list_empty(&dmar_atsr_units))
4366 		pr_info("No ATSR found\n");
4367 
4368 	if (list_empty(&dmar_satc_units))
4369 		pr_info("No SATC found\n");
4370 
4371 	if (dmar_map_gfx)
4372 		intel_iommu_gfx_mapped = 1;
4373 
4374 	init_no_remapping_devices();
4375 
4376 	ret = init_dmars();
4377 	if (ret) {
4378 		if (force_on)
4379 			panic("tboot: Failed to initialize DMARs\n");
4380 		pr_err("Initialization failed\n");
4381 		goto out_free_dmar;
4382 	}
4383 	up_write(&dmar_global_lock);
4384 
4385 	init_iommu_pm_ops();
4386 
4387 	down_read(&dmar_global_lock);
4388 	for_each_active_iommu(iommu, drhd) {
4389 		/*
4390 		 * The flush queue implementation does not perform
4391 		 * page-selective invalidations that are required for efficient
4392 		 * TLB flushes in virtual environments.  The benefit of batching
4393 		 * is likely to be much lower than the overhead of synchronizing
4394 		 * the virtual and physical IOMMU page-tables.
4395 		 */
4396 		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
4397 			pr_warn("IOMMU batching is disabled due to virtualization");
4398 			intel_iommu_strict = 1;
4399 		}
4400 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4401 				       intel_iommu_groups,
4402 				       "%s", iommu->name);
4403 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4404 	}
4405 	up_read(&dmar_global_lock);
4406 
4407 	iommu_set_dma_strict(intel_iommu_strict);
4408 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4409 	if (si_domain && !hw_pass_through)
4410 		register_memory_notifier(&intel_iommu_memory_nb);
4411 
4412 	down_read(&dmar_global_lock);
4413 	if (probe_acpi_namespace_devices())
4414 		pr_warn("ACPI name space devices didn't probe correctly\n");
4415 
4416 	/* Finally, we enable the DMA remapping hardware. */
4417 	for_each_iommu(iommu, drhd) {
4418 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4419 			iommu_enable_translation(iommu);
4420 
4421 		iommu_disable_protect_mem_regions(iommu);
4422 	}
4423 	up_read(&dmar_global_lock);
4424 
4425 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4426 
4427 	intel_iommu_enabled = 1;
4428 
4429 	return 0;
4430 
4431 out_free_dmar:
4432 	intel_iommu_free_dmars();
4433 	up_write(&dmar_global_lock);
4434 	iommu_exit_mempool();
4435 	return ret;
4436 }
4437 
4438 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4439 {
4440 	struct device_domain_info *info = opaque;
4441 
4442 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4443 	return 0;
4444 }
4445 
4446 /*
4447  * NB - intel-iommu lacks any sort of reference counting for the users of
4448  * dependent devices.  If multiple endpoints have intersecting dependent
4449  * devices, unbinding the driver from any one of them will possibly leave
4450  * the others unable to operate.
4451  */
4452 static void domain_context_clear(struct device_domain_info *info)
4453 {
4454 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4455 		return;
4456 
4457 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4458 			       &domain_context_clear_one_cb, info);
4459 }
4460 
4461 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4462 {
4463 	struct dmar_domain *domain;
4464 	struct intel_iommu *iommu;
4465 	unsigned long flags;
4466 
4467 	assert_spin_locked(&device_domain_lock);
4468 
4469 	if (WARN_ON(!info))
4470 		return;
4471 
4472 	iommu = info->iommu;
4473 	domain = info->domain;
4474 
4475 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4476 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4477 			intel_pasid_tear_down_entry(iommu, info->dev,
4478 					PASID_RID2PASID, false);
4479 
4480 		iommu_disable_dev_iotlb(info);
4481 		domain_context_clear(info);
4482 		intel_pasid_free_table(info->dev);
4483 	}
4484 
4485 	unlink_domain_info(info);
4486 
4487 	spin_lock_irqsave(&iommu->lock, flags);
4488 	domain_detach_iommu(domain, iommu);
4489 	spin_unlock_irqrestore(&iommu->lock, flags);
4490 
4491 	free_devinfo_mem(info);
4492 }
4493 
4494 static void dmar_remove_one_dev_info(struct device *dev)
4495 {
4496 	struct device_domain_info *info;
4497 	unsigned long flags;
4498 
4499 	spin_lock_irqsave(&device_domain_lock, flags);
4500 	info = get_domain_info(dev);
4501 	if (info)
4502 		__dmar_remove_one_dev_info(info);
4503 	spin_unlock_irqrestore(&device_domain_lock, flags);
4504 }
4505 
4506 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4507 {
4508 	int adjust_width;
4509 
4510 	/* calculate AGAW */
4511 	domain->gaw = guest_width;
4512 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4513 	domain->agaw = width_to_agaw(adjust_width);
4514 
4515 	domain->iommu_coherency = false;
4516 	domain->iommu_snooping = false;
4517 	domain->iommu_superpage = 0;
4518 	domain->max_addr = 0;
4519 
4520 	/* always allocate the top pgd */
4521 	domain->pgd = alloc_pgtable_page(domain->nid);
4522 	if (!domain->pgd)
4523 		return -ENOMEM;
4524 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4525 	return 0;
4526 }
4527 
4528 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4529 {
4530 	struct dmar_domain *dmar_domain;
4531 	struct iommu_domain *domain;
4532 
4533 	switch (type) {
4534 	case IOMMU_DOMAIN_DMA:
4535 	case IOMMU_DOMAIN_UNMANAGED:
4536 		dmar_domain = alloc_domain(0);
4537 		if (!dmar_domain) {
4538 			pr_err("Can't allocate dmar_domain\n");
4539 			return NULL;
4540 		}
4541 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4542 			pr_err("Domain initialization failed\n");
4543 			domain_exit(dmar_domain);
4544 			return NULL;
4545 		}
4546 
4547 		if (type == IOMMU_DOMAIN_DMA &&
4548 		    iommu_get_dma_cookie(&dmar_domain->domain))
4549 			return NULL;
4550 
4551 		domain = &dmar_domain->domain;
4552 		domain->geometry.aperture_start = 0;
4553 		domain->geometry.aperture_end   =
4554 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4555 		domain->geometry.force_aperture = true;
4556 
4557 		return domain;
4558 	case IOMMU_DOMAIN_IDENTITY:
4559 		return &si_domain->domain;
4560 	default:
4561 		return NULL;
4562 	}
4563 
4564 	return NULL;
4565 }
4566 
4567 static void intel_iommu_domain_free(struct iommu_domain *domain)
4568 {
4569 	if (domain != &si_domain->domain)
4570 		domain_exit(to_dmar_domain(domain));
4571 }
4572 
4573 /*
4574  * Check whether a @domain could be attached to the @dev through the
4575  * aux-domain attach/detach APIs.
4576  */
4577 static inline bool
4578 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4579 {
4580 	struct device_domain_info *info = get_domain_info(dev);
4581 
4582 	return info && info->auxd_enabled &&
4583 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4584 }
4585 
4586 static inline struct subdev_domain_info *
4587 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4588 {
4589 	struct subdev_domain_info *sinfo;
4590 
4591 	if (!list_empty(&domain->subdevices)) {
4592 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4593 			if (sinfo->pdev == dev)
4594 				return sinfo;
4595 		}
4596 	}
4597 
4598 	return NULL;
4599 }
4600 
4601 static int auxiliary_link_device(struct dmar_domain *domain,
4602 				 struct device *dev)
4603 {
4604 	struct device_domain_info *info = get_domain_info(dev);
4605 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4606 
4607 	assert_spin_locked(&device_domain_lock);
4608 	if (WARN_ON(!info))
4609 		return -EINVAL;
4610 
4611 	if (!sinfo) {
4612 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4613 		if (!sinfo)
4614 			return -ENOMEM;
4615 		sinfo->domain = domain;
4616 		sinfo->pdev = dev;
4617 		list_add(&sinfo->link_phys, &info->subdevices);
4618 		list_add(&sinfo->link_domain, &domain->subdevices);
4619 	}
4620 
4621 	return ++sinfo->users;
4622 }
4623 
4624 static int auxiliary_unlink_device(struct dmar_domain *domain,
4625 				   struct device *dev)
4626 {
4627 	struct device_domain_info *info = get_domain_info(dev);
4628 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4629 	int ret;
4630 
4631 	assert_spin_locked(&device_domain_lock);
4632 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4633 		return -EINVAL;
4634 
4635 	ret = --sinfo->users;
4636 	if (!ret) {
4637 		list_del(&sinfo->link_phys);
4638 		list_del(&sinfo->link_domain);
4639 		kfree(sinfo);
4640 	}
4641 
4642 	return ret;
4643 }
4644 
4645 static int aux_domain_add_dev(struct dmar_domain *domain,
4646 			      struct device *dev)
4647 {
4648 	int ret;
4649 	unsigned long flags;
4650 	struct intel_iommu *iommu;
4651 
4652 	iommu = device_to_iommu(dev, NULL, NULL);
4653 	if (!iommu)
4654 		return -ENODEV;
4655 
4656 	if (domain->default_pasid <= 0) {
4657 		u32 pasid;
4658 
4659 		/* No private data needed for the default pasid */
4660 		pasid = ioasid_alloc(NULL, PASID_MIN,
4661 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4662 				     NULL);
4663 		if (pasid == INVALID_IOASID) {
4664 			pr_err("Can't allocate default pasid\n");
4665 			return -ENODEV;
4666 		}
4667 		domain->default_pasid = pasid;
4668 	}
4669 
4670 	spin_lock_irqsave(&device_domain_lock, flags);
4671 	ret = auxiliary_link_device(domain, dev);
4672 	if (ret <= 0)
4673 		goto link_failed;
4674 
4675 	/*
4676 	 * Subdevices from the same physical device can be attached to the
4677 	 * same domain. For such cases, only the first subdevice attachment
4678 	 * needs to go through the full steps in this function. So if ret >
4679 	 * 1, just goto out.
4680 	 */
4681 	if (ret > 1)
4682 		goto out;
4683 
4684 	/*
4685 	 * iommu->lock must be held to attach domain to iommu and setup the
4686 	 * pasid entry for second level translation.
4687 	 */
4688 	spin_lock(&iommu->lock);
4689 	ret = domain_attach_iommu(domain, iommu);
4690 	if (ret)
4691 		goto attach_failed;
4692 
4693 	/* Setup the PASID entry for mediated devices: */
4694 	if (domain_use_first_level(domain))
4695 		ret = domain_setup_first_level(iommu, domain, dev,
4696 					       domain->default_pasid);
4697 	else
4698 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4699 						     domain->default_pasid);
4700 	if (ret)
4701 		goto table_failed;
4702 
4703 	spin_unlock(&iommu->lock);
4704 out:
4705 	spin_unlock_irqrestore(&device_domain_lock, flags);
4706 
4707 	return 0;
4708 
4709 table_failed:
4710 	domain_detach_iommu(domain, iommu);
4711 attach_failed:
4712 	spin_unlock(&iommu->lock);
4713 	auxiliary_unlink_device(domain, dev);
4714 link_failed:
4715 	spin_unlock_irqrestore(&device_domain_lock, flags);
4716 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4717 		ioasid_put(domain->default_pasid);
4718 
4719 	return ret;
4720 }
4721 
4722 static void aux_domain_remove_dev(struct dmar_domain *domain,
4723 				  struct device *dev)
4724 {
4725 	struct device_domain_info *info;
4726 	struct intel_iommu *iommu;
4727 	unsigned long flags;
4728 
4729 	if (!is_aux_domain(dev, &domain->domain))
4730 		return;
4731 
4732 	spin_lock_irqsave(&device_domain_lock, flags);
4733 	info = get_domain_info(dev);
4734 	iommu = info->iommu;
4735 
4736 	if (!auxiliary_unlink_device(domain, dev)) {
4737 		spin_lock(&iommu->lock);
4738 		intel_pasid_tear_down_entry(iommu, dev,
4739 					    domain->default_pasid, false);
4740 		domain_detach_iommu(domain, iommu);
4741 		spin_unlock(&iommu->lock);
4742 	}
4743 
4744 	spin_unlock_irqrestore(&device_domain_lock, flags);
4745 
4746 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4747 		ioasid_put(domain->default_pasid);
4748 }
4749 
4750 static int prepare_domain_attach_device(struct iommu_domain *domain,
4751 					struct device *dev)
4752 {
4753 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4754 	struct intel_iommu *iommu;
4755 	int addr_width;
4756 
4757 	iommu = device_to_iommu(dev, NULL, NULL);
4758 	if (!iommu)
4759 		return -ENODEV;
4760 
4761 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4762 	    !ecap_nest(iommu->ecap)) {
4763 		dev_err(dev, "%s: iommu not support nested translation\n",
4764 			iommu->name);
4765 		return -EINVAL;
4766 	}
4767 
4768 	/* check if this iommu agaw is sufficient for max mapped address */
4769 	addr_width = agaw_to_width(iommu->agaw);
4770 	if (addr_width > cap_mgaw(iommu->cap))
4771 		addr_width = cap_mgaw(iommu->cap);
4772 
4773 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4774 		dev_err(dev, "%s: iommu width (%d) is not "
4775 		        "sufficient for the mapped address (%llx)\n",
4776 		        __func__, addr_width, dmar_domain->max_addr);
4777 		return -EFAULT;
4778 	}
4779 	dmar_domain->gaw = addr_width;
4780 
4781 	/*
4782 	 * Knock out extra levels of page tables if necessary
4783 	 */
4784 	while (iommu->agaw < dmar_domain->agaw) {
4785 		struct dma_pte *pte;
4786 
4787 		pte = dmar_domain->pgd;
4788 		if (dma_pte_present(pte)) {
4789 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4790 			free_pgtable_page(pte);
4791 		}
4792 		dmar_domain->agaw--;
4793 	}
4794 
4795 	return 0;
4796 }
4797 
4798 static int intel_iommu_attach_device(struct iommu_domain *domain,
4799 				     struct device *dev)
4800 {
4801 	int ret;
4802 
4803 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4804 	    device_is_rmrr_locked(dev)) {
4805 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4806 		return -EPERM;
4807 	}
4808 
4809 	if (is_aux_domain(dev, domain))
4810 		return -EPERM;
4811 
4812 	/* normally dev is not mapped */
4813 	if (unlikely(domain_context_mapped(dev))) {
4814 		struct dmar_domain *old_domain;
4815 
4816 		old_domain = find_domain(dev);
4817 		if (old_domain)
4818 			dmar_remove_one_dev_info(dev);
4819 	}
4820 
4821 	ret = prepare_domain_attach_device(domain, dev);
4822 	if (ret)
4823 		return ret;
4824 
4825 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4826 }
4827 
4828 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4829 					 struct device *dev)
4830 {
4831 	int ret;
4832 
4833 	if (!is_aux_domain(dev, domain))
4834 		return -EPERM;
4835 
4836 	ret = prepare_domain_attach_device(domain, dev);
4837 	if (ret)
4838 		return ret;
4839 
4840 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4841 }
4842 
4843 static void intel_iommu_detach_device(struct iommu_domain *domain,
4844 				      struct device *dev)
4845 {
4846 	dmar_remove_one_dev_info(dev);
4847 }
4848 
4849 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4850 					  struct device *dev)
4851 {
4852 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4853 }
4854 
4855 #ifdef CONFIG_INTEL_IOMMU_SVM
4856 /*
4857  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4858  * VT-d granularity. Invalidation is typically included in the unmap operation
4859  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4860  * owns the first level page tables. Invalidations of translation caches in the
4861  * guest are trapped and passed down to the host.
4862  *
4863  * vIOMMU in the guest will only expose first level page tables, therefore
4864  * we do not support IOTLB granularity for request without PASID (second level).
4865  *
4866  * For example, to find the VT-d granularity encoding for IOTLB
4867  * type and page selective granularity within PASID:
4868  * X: indexed by iommu cache type
4869  * Y: indexed by enum iommu_inv_granularity
4870  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4871  */
4872 
4873 static const int
4874 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4875 	/*
4876 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4877 	 * page selective (address granularity)
4878 	 */
4879 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4880 	/* PASID based dev TLBs */
4881 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4882 	/* PASID cache */
4883 	{-EINVAL, -EINVAL, -EINVAL}
4884 };
4885 
4886 static inline int to_vtd_granularity(int type, int granu)
4887 {
4888 	return inv_type_granu_table[type][granu];
4889 }
4890 
4891 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4892 {
4893 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4894 
4895 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4896 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
4897 	 * granu size in contiguous memory.
4898 	 */
4899 	return order_base_2(nr_pages);
4900 }
4901 
4902 static int
4903 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4904 			   struct iommu_cache_invalidate_info *inv_info)
4905 {
4906 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4907 	struct device_domain_info *info;
4908 	struct intel_iommu *iommu;
4909 	unsigned long flags;
4910 	int cache_type;
4911 	u8 bus, devfn;
4912 	u16 did, sid;
4913 	int ret = 0;
4914 	u64 size = 0;
4915 
4916 	if (!inv_info || !dmar_domain)
4917 		return -EINVAL;
4918 
4919 	if (!dev || !dev_is_pci(dev))
4920 		return -ENODEV;
4921 
4922 	iommu = device_to_iommu(dev, &bus, &devfn);
4923 	if (!iommu)
4924 		return -ENODEV;
4925 
4926 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4927 		return -EINVAL;
4928 
4929 	spin_lock_irqsave(&device_domain_lock, flags);
4930 	spin_lock(&iommu->lock);
4931 	info = get_domain_info(dev);
4932 	if (!info) {
4933 		ret = -EINVAL;
4934 		goto out_unlock;
4935 	}
4936 	did = dmar_domain->iommu_did[iommu->seq_id];
4937 	sid = PCI_DEVID(bus, devfn);
4938 
4939 	/* Size is only valid in address selective invalidation */
4940 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4941 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4942 				   inv_info->granu.addr_info.nb_granules);
4943 
4944 	for_each_set_bit(cache_type,
4945 			 (unsigned long *)&inv_info->cache,
4946 			 IOMMU_CACHE_INV_TYPE_NR) {
4947 		int granu = 0;
4948 		u64 pasid = 0;
4949 		u64 addr = 0;
4950 
4951 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4952 		if (granu == -EINVAL) {
4953 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4954 					   cache_type, inv_info->granularity);
4955 			break;
4956 		}
4957 
4958 		/*
4959 		 * PASID is stored in different locations based on the
4960 		 * granularity.
4961 		 */
4962 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4963 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4964 			pasid = inv_info->granu.pasid_info.pasid;
4965 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4966 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4967 			pasid = inv_info->granu.addr_info.pasid;
4968 
4969 		switch (BIT(cache_type)) {
4970 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4971 			/* HW will ignore LSB bits based on address mask */
4972 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4973 			    size &&
4974 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4975 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4976 						   inv_info->granu.addr_info.addr, size);
4977 			}
4978 
4979 			/*
4980 			 * If granu is PASID-selective, address is ignored.
4981 			 * We use npages = -1 to indicate that.
4982 			 */
4983 			qi_flush_piotlb(iommu, did, pasid,
4984 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4985 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4986 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4987 
4988 			if (!info->ats_enabled)
4989 				break;
4990 			/*
4991 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4992 			 * in the guest may assume IOTLB flush is inclusive,
4993 			 * which is more efficient.
4994 			 */
4995 			fallthrough;
4996 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4997 			/*
4998 			 * PASID based device TLB invalidation does not support
4999 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5000 			 * IOMMU_INV_GRANU_ADDR.
5001 			 * The equivalent of that is we set the size to be the
5002 			 * entire range of 64 bit. User only provides PASID info
5003 			 * without address info. So we set addr to 0.
5004 			 */
5005 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5006 				size = 64 - VTD_PAGE_SHIFT;
5007 				addr = 0;
5008 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5009 				addr = inv_info->granu.addr_info.addr;
5010 			}
5011 
5012 			if (info->ats_enabled)
5013 				qi_flush_dev_iotlb_pasid(iommu, sid,
5014 						info->pfsid, pasid,
5015 						info->ats_qdep, addr,
5016 						size);
5017 			else
5018 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5019 			break;
5020 		default:
5021 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5022 					    cache_type);
5023 			ret = -EINVAL;
5024 		}
5025 	}
5026 out_unlock:
5027 	spin_unlock(&iommu->lock);
5028 	spin_unlock_irqrestore(&device_domain_lock, flags);
5029 
5030 	return ret;
5031 }
5032 #endif
5033 
5034 static int intel_iommu_map(struct iommu_domain *domain,
5035 			   unsigned long iova, phys_addr_t hpa,
5036 			   size_t size, int iommu_prot, gfp_t gfp)
5037 {
5038 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5039 	u64 max_addr;
5040 	int prot = 0;
5041 
5042 	if (iommu_prot & IOMMU_READ)
5043 		prot |= DMA_PTE_READ;
5044 	if (iommu_prot & IOMMU_WRITE)
5045 		prot |= DMA_PTE_WRITE;
5046 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5047 		prot |= DMA_PTE_SNP;
5048 
5049 	max_addr = iova + size;
5050 	if (dmar_domain->max_addr < max_addr) {
5051 		u64 end;
5052 
5053 		/* check if minimum agaw is sufficient for mapped address */
5054 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5055 		if (end < max_addr) {
5056 			pr_err("%s: iommu width (%d) is not "
5057 			       "sufficient for the mapped address (%llx)\n",
5058 			       __func__, dmar_domain->gaw, max_addr);
5059 			return -EFAULT;
5060 		}
5061 		dmar_domain->max_addr = max_addr;
5062 	}
5063 	/* Round up size to next multiple of PAGE_SIZE, if it and
5064 	   the low bits of hpa would take us onto the next page */
5065 	size = aligned_nrpages(hpa, size);
5066 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5067 				hpa >> VTD_PAGE_SHIFT, size, prot);
5068 }
5069 
5070 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5071 				unsigned long iova, size_t size,
5072 				struct iommu_iotlb_gather *gather)
5073 {
5074 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5075 	unsigned long start_pfn, last_pfn;
5076 	int level = 0;
5077 
5078 	/* Cope with horrid API which requires us to unmap more than the
5079 	   size argument if it happens to be a large-page mapping. */
5080 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5081 
5082 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5083 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5084 
5085 	start_pfn = iova >> VTD_PAGE_SHIFT;
5086 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5087 
5088 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5089 					last_pfn, gather->freelist);
5090 
5091 	if (dmar_domain->max_addr == iova + size)
5092 		dmar_domain->max_addr = iova;
5093 
5094 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5095 
5096 	return size;
5097 }
5098 
5099 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5100 				 struct iommu_iotlb_gather *gather)
5101 {
5102 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5103 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5104 	size_t size = gather->end - gather->start;
5105 	unsigned long start_pfn;
5106 	unsigned long nrpages;
5107 	int iommu_id;
5108 
5109 	nrpages = aligned_nrpages(gather->start, size);
5110 	start_pfn = mm_to_dma_pfn(iova_pfn);
5111 
5112 	for_each_domain_iommu(iommu_id, dmar_domain)
5113 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5114 				      start_pfn, nrpages, !gather->freelist, 0);
5115 
5116 	dma_free_pagelist(gather->freelist);
5117 }
5118 
5119 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5120 					    dma_addr_t iova)
5121 {
5122 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5123 	struct dma_pte *pte;
5124 	int level = 0;
5125 	u64 phys = 0;
5126 
5127 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5128 	if (pte && dma_pte_present(pte))
5129 		phys = dma_pte_addr(pte) +
5130 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5131 						VTD_PAGE_SHIFT) - 1));
5132 
5133 	return phys;
5134 }
5135 
5136 static bool intel_iommu_capable(enum iommu_cap cap)
5137 {
5138 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5139 		return domain_update_iommu_snooping(NULL);
5140 	if (cap == IOMMU_CAP_INTR_REMAP)
5141 		return irq_remapping_enabled == 1;
5142 
5143 	return false;
5144 }
5145 
5146 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5147 {
5148 	struct intel_iommu *iommu;
5149 
5150 	iommu = device_to_iommu(dev, NULL, NULL);
5151 	if (!iommu)
5152 		return ERR_PTR(-ENODEV);
5153 
5154 	if (translation_pre_enabled(iommu))
5155 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5156 
5157 	return &iommu->iommu;
5158 }
5159 
5160 static void intel_iommu_release_device(struct device *dev)
5161 {
5162 	struct intel_iommu *iommu;
5163 
5164 	iommu = device_to_iommu(dev, NULL, NULL);
5165 	if (!iommu)
5166 		return;
5167 
5168 	dmar_remove_one_dev_info(dev);
5169 
5170 	set_dma_ops(dev, NULL);
5171 }
5172 
5173 static void intel_iommu_probe_finalize(struct device *dev)
5174 {
5175 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5176 
5177 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5178 		iommu_setup_dma_ops(dev, 0, U64_MAX);
5179 	else
5180 		set_dma_ops(dev, NULL);
5181 }
5182 
5183 static void intel_iommu_get_resv_regions(struct device *device,
5184 					 struct list_head *head)
5185 {
5186 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5187 	struct iommu_resv_region *reg;
5188 	struct dmar_rmrr_unit *rmrr;
5189 	struct device *i_dev;
5190 	int i;
5191 
5192 	down_read(&dmar_global_lock);
5193 	for_each_rmrr_units(rmrr) {
5194 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5195 					  i, i_dev) {
5196 			struct iommu_resv_region *resv;
5197 			enum iommu_resv_type type;
5198 			size_t length;
5199 
5200 			if (i_dev != device &&
5201 			    !is_downstream_to_pci_bridge(device, i_dev))
5202 				continue;
5203 
5204 			length = rmrr->end_address - rmrr->base_address + 1;
5205 
5206 			type = device_rmrr_is_relaxable(device) ?
5207 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5208 
5209 			resv = iommu_alloc_resv_region(rmrr->base_address,
5210 						       length, prot, type);
5211 			if (!resv)
5212 				break;
5213 
5214 			list_add_tail(&resv->list, head);
5215 		}
5216 	}
5217 	up_read(&dmar_global_lock);
5218 
5219 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5220 	if (dev_is_pci(device)) {
5221 		struct pci_dev *pdev = to_pci_dev(device);
5222 
5223 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5224 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5225 						   IOMMU_RESV_DIRECT_RELAXABLE);
5226 			if (reg)
5227 				list_add_tail(&reg->list, head);
5228 		}
5229 	}
5230 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5231 
5232 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5233 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5234 				      0, IOMMU_RESV_MSI);
5235 	if (!reg)
5236 		return;
5237 	list_add_tail(&reg->list, head);
5238 }
5239 
5240 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5241 {
5242 	struct device_domain_info *info;
5243 	struct context_entry *context;
5244 	struct dmar_domain *domain;
5245 	unsigned long flags;
5246 	u64 ctx_lo;
5247 	int ret;
5248 
5249 	domain = find_domain(dev);
5250 	if (!domain)
5251 		return -EINVAL;
5252 
5253 	spin_lock_irqsave(&device_domain_lock, flags);
5254 	spin_lock(&iommu->lock);
5255 
5256 	ret = -EINVAL;
5257 	info = get_domain_info(dev);
5258 	if (!info || !info->pasid_supported)
5259 		goto out;
5260 
5261 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5262 	if (WARN_ON(!context))
5263 		goto out;
5264 
5265 	ctx_lo = context[0].lo;
5266 
5267 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5268 		ctx_lo |= CONTEXT_PASIDE;
5269 		context[0].lo = ctx_lo;
5270 		wmb();
5271 		iommu->flush.flush_context(iommu,
5272 					   domain->iommu_did[iommu->seq_id],
5273 					   PCI_DEVID(info->bus, info->devfn),
5274 					   DMA_CCMD_MASK_NOBIT,
5275 					   DMA_CCMD_DEVICE_INVL);
5276 	}
5277 
5278 	/* Enable PASID support in the device, if it wasn't already */
5279 	if (!info->pasid_enabled)
5280 		iommu_enable_dev_iotlb(info);
5281 
5282 	ret = 0;
5283 
5284  out:
5285 	spin_unlock(&iommu->lock);
5286 	spin_unlock_irqrestore(&device_domain_lock, flags);
5287 
5288 	return ret;
5289 }
5290 
5291 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5292 {
5293 	if (dev_is_pci(dev))
5294 		return pci_device_group(dev);
5295 	return generic_device_group(dev);
5296 }
5297 
5298 static int intel_iommu_enable_auxd(struct device *dev)
5299 {
5300 	struct device_domain_info *info;
5301 	struct intel_iommu *iommu;
5302 	unsigned long flags;
5303 	int ret;
5304 
5305 	iommu = device_to_iommu(dev, NULL, NULL);
5306 	if (!iommu || dmar_disabled)
5307 		return -EINVAL;
5308 
5309 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5310 		return -EINVAL;
5311 
5312 	ret = intel_iommu_enable_pasid(iommu, dev);
5313 	if (ret)
5314 		return -ENODEV;
5315 
5316 	spin_lock_irqsave(&device_domain_lock, flags);
5317 	info = get_domain_info(dev);
5318 	info->auxd_enabled = 1;
5319 	spin_unlock_irqrestore(&device_domain_lock, flags);
5320 
5321 	return 0;
5322 }
5323 
5324 static int intel_iommu_disable_auxd(struct device *dev)
5325 {
5326 	struct device_domain_info *info;
5327 	unsigned long flags;
5328 
5329 	spin_lock_irqsave(&device_domain_lock, flags);
5330 	info = get_domain_info(dev);
5331 	if (!WARN_ON(!info))
5332 		info->auxd_enabled = 0;
5333 	spin_unlock_irqrestore(&device_domain_lock, flags);
5334 
5335 	return 0;
5336 }
5337 
5338 static int intel_iommu_enable_sva(struct device *dev)
5339 {
5340 	struct device_domain_info *info = get_domain_info(dev);
5341 	struct intel_iommu *iommu;
5342 	int ret;
5343 
5344 	if (!info || dmar_disabled)
5345 		return -EINVAL;
5346 
5347 	iommu = info->iommu;
5348 	if (!iommu)
5349 		return -EINVAL;
5350 
5351 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5352 		return -ENODEV;
5353 
5354 	if (intel_iommu_enable_pasid(iommu, dev))
5355 		return -ENODEV;
5356 
5357 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5358 		return -EINVAL;
5359 
5360 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5361 	if (!ret)
5362 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5363 
5364 	return ret;
5365 }
5366 
5367 static int intel_iommu_disable_sva(struct device *dev)
5368 {
5369 	struct device_domain_info *info = get_domain_info(dev);
5370 	struct intel_iommu *iommu = info->iommu;
5371 	int ret;
5372 
5373 	ret = iommu_unregister_device_fault_handler(dev);
5374 	if (!ret)
5375 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5376 
5377 	return ret;
5378 }
5379 
5380 /*
5381  * A PCI express designated vendor specific extended capability is defined
5382  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5383  * for system software and tools to detect endpoint devices supporting the
5384  * Intel scalable IO virtualization without host driver dependency.
5385  *
5386  * Returns the address of the matching extended capability structure within
5387  * the device's PCI configuration space or 0 if the device does not support
5388  * it.
5389  */
5390 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5391 {
5392 	int pos;
5393 	u16 vendor, id;
5394 
5395 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5396 	while (pos) {
5397 		pci_read_config_word(pdev, pos + 4, &vendor);
5398 		pci_read_config_word(pdev, pos + 8, &id);
5399 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5400 			return pos;
5401 
5402 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5403 	}
5404 
5405 	return 0;
5406 }
5407 
5408 static bool
5409 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5410 {
5411 	struct device_domain_info *info = get_domain_info(dev);
5412 
5413 	if (feat == IOMMU_DEV_FEAT_AUX) {
5414 		int ret;
5415 
5416 		if (!dev_is_pci(dev) || dmar_disabled ||
5417 		    !scalable_mode_support() || !pasid_mode_support())
5418 			return false;
5419 
5420 		ret = pci_pasid_features(to_pci_dev(dev));
5421 		if (ret < 0)
5422 			return false;
5423 
5424 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5425 	}
5426 
5427 	if (feat == IOMMU_DEV_FEAT_IOPF)
5428 		return info && info->pri_supported;
5429 
5430 	if (feat == IOMMU_DEV_FEAT_SVA)
5431 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5432 			info->pasid_supported && info->pri_supported &&
5433 			info->ats_supported;
5434 
5435 	return false;
5436 }
5437 
5438 static int
5439 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5440 {
5441 	switch (feat) {
5442 	case IOMMU_DEV_FEAT_AUX:
5443 		return intel_iommu_enable_auxd(dev);
5444 
5445 	case IOMMU_DEV_FEAT_IOPF:
5446 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5447 
5448 	case IOMMU_DEV_FEAT_SVA:
5449 		return intel_iommu_enable_sva(dev);
5450 
5451 	default:
5452 		return -ENODEV;
5453 	}
5454 }
5455 
5456 static int
5457 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5458 {
5459 	switch (feat) {
5460 	case IOMMU_DEV_FEAT_AUX:
5461 		return intel_iommu_disable_auxd(dev);
5462 
5463 	case IOMMU_DEV_FEAT_IOPF:
5464 		return 0;
5465 
5466 	case IOMMU_DEV_FEAT_SVA:
5467 		return intel_iommu_disable_sva(dev);
5468 
5469 	default:
5470 		return -ENODEV;
5471 	}
5472 }
5473 
5474 static bool
5475 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5476 {
5477 	struct device_domain_info *info = get_domain_info(dev);
5478 
5479 	if (feat == IOMMU_DEV_FEAT_AUX)
5480 		return scalable_mode_support() && info && info->auxd_enabled;
5481 
5482 	return false;
5483 }
5484 
5485 static int
5486 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5487 {
5488 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5489 
5490 	return dmar_domain->default_pasid > 0 ?
5491 			dmar_domain->default_pasid : -EINVAL;
5492 }
5493 
5494 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5495 					   struct device *dev)
5496 {
5497 	return attach_deferred(dev);
5498 }
5499 
5500 static int
5501 intel_iommu_enable_nesting(struct iommu_domain *domain)
5502 {
5503 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5504 	unsigned long flags;
5505 	int ret = -ENODEV;
5506 
5507 	spin_lock_irqsave(&device_domain_lock, flags);
5508 	if (list_empty(&dmar_domain->devices)) {
5509 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5510 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5511 		ret = 0;
5512 	}
5513 	spin_unlock_irqrestore(&device_domain_lock, flags);
5514 
5515 	return ret;
5516 }
5517 
5518 /*
5519  * Check that the device does not live on an external facing PCI port that is
5520  * marked as untrusted. Such devices should not be able to apply quirks and
5521  * thus not be able to bypass the IOMMU restrictions.
5522  */
5523 static bool risky_device(struct pci_dev *pdev)
5524 {
5525 	if (pdev->untrusted) {
5526 		pci_info(pdev,
5527 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5528 			 pdev->vendor, pdev->device);
5529 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5530 		return true;
5531 	}
5532 	return false;
5533 }
5534 
5535 static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
5536 			     unsigned long clf_pages)
5537 {
5538 	struct dma_pte *first_pte = NULL, *pte = NULL;
5539 	unsigned long lvl_pages = 0;
5540 	int level = 0;
5541 
5542 	while (clf_pages > 0) {
5543 		if (!pte) {
5544 			level = 0;
5545 			pte = pfn_to_dma_pte(domain, clf_pfn, &level);
5546 			if (WARN_ON(!pte))
5547 				return;
5548 			first_pte = pte;
5549 			lvl_pages = lvl_to_nr_pages(level);
5550 		}
5551 
5552 		if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
5553 			return;
5554 
5555 		clf_pages -= lvl_pages;
5556 		clf_pfn += lvl_pages;
5557 		pte++;
5558 
5559 		if (!clf_pages || first_pte_in_page(pte) ||
5560 		    (level > 1 && clf_pages < lvl_pages)) {
5561 			domain_flush_cache(domain, first_pte,
5562 					   (void *)pte - (void *)first_pte);
5563 			pte = NULL;
5564 		}
5565 	}
5566 }
5567 
5568 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5569 				       unsigned long iova, size_t size)
5570 {
5571 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5572 	unsigned long pages = aligned_nrpages(iova, size);
5573 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5574 	struct intel_iommu *iommu;
5575 	int iommu_id;
5576 
5577 	if (!dmar_domain->iommu_coherency)
5578 		clflush_sync_map(dmar_domain, pfn, pages);
5579 
5580 	for_each_domain_iommu(iommu_id, dmar_domain) {
5581 		iommu = g_iommus[iommu_id];
5582 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5583 	}
5584 }
5585 
5586 const struct iommu_ops intel_iommu_ops = {
5587 	.capable		= intel_iommu_capable,
5588 	.domain_alloc		= intel_iommu_domain_alloc,
5589 	.domain_free		= intel_iommu_domain_free,
5590 	.enable_nesting		= intel_iommu_enable_nesting,
5591 	.attach_dev		= intel_iommu_attach_device,
5592 	.detach_dev		= intel_iommu_detach_device,
5593 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5594 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5595 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5596 	.map			= intel_iommu_map,
5597 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5598 	.unmap			= intel_iommu_unmap,
5599 	.flush_iotlb_all        = intel_flush_iotlb_all,
5600 	.iotlb_sync		= intel_iommu_tlb_sync,
5601 	.iova_to_phys		= intel_iommu_iova_to_phys,
5602 	.probe_device		= intel_iommu_probe_device,
5603 	.probe_finalize		= intel_iommu_probe_finalize,
5604 	.release_device		= intel_iommu_release_device,
5605 	.get_resv_regions	= intel_iommu_get_resv_regions,
5606 	.put_resv_regions	= generic_iommu_put_resv_regions,
5607 	.device_group		= intel_iommu_device_group,
5608 	.dev_has_feat		= intel_iommu_dev_has_feat,
5609 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5610 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5611 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5612 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5613 	.def_domain_type	= device_def_domain_type,
5614 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5615 #ifdef CONFIG_INTEL_IOMMU_SVM
5616 	.cache_invalidate	= intel_iommu_sva_invalidate,
5617 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5618 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5619 	.sva_bind		= intel_svm_bind,
5620 	.sva_unbind		= intel_svm_unbind,
5621 	.sva_get_pasid		= intel_svm_get_pasid,
5622 	.page_response		= intel_svm_page_response,
5623 #endif
5624 };
5625 
5626 static void quirk_iommu_igfx(struct pci_dev *dev)
5627 {
5628 	if (risky_device(dev))
5629 		return;
5630 
5631 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5632 	dmar_map_gfx = 0;
5633 }
5634 
5635 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5643 
5644 /* Broadwell igfx malfunctions with dmar */
5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5669 
5670 static void quirk_iommu_rwbf(struct pci_dev *dev)
5671 {
5672 	if (risky_device(dev))
5673 		return;
5674 
5675 	/*
5676 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5677 	 * but needs it. Same seems to hold for the desktop versions.
5678 	 */
5679 	pci_info(dev, "Forcing write-buffer flush capability\n");
5680 	rwbf_quirk = 1;
5681 }
5682 
5683 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5684 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5685 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5690 
5691 #define GGC 0x52
5692 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5693 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5694 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5695 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5696 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5697 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5698 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5699 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5700 
5701 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5702 {
5703 	unsigned short ggc;
5704 
5705 	if (risky_device(dev))
5706 		return;
5707 
5708 	if (pci_read_config_word(dev, GGC, &ggc))
5709 		return;
5710 
5711 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5712 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5713 		dmar_map_gfx = 0;
5714 	} else if (dmar_map_gfx) {
5715 		/* we have to ensure the gfx device is idle before we flush */
5716 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5717 		intel_iommu_strict = 1;
5718        }
5719 }
5720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5724 
5725 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5726 {
5727 	unsigned short ver;
5728 
5729 	if (!IS_GFX_DEVICE(dev))
5730 		return;
5731 
5732 	ver = (dev->device >> 8) & 0xff;
5733 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5734 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5735 	    ver != 0x9a)
5736 		return;
5737 
5738 	if (risky_device(dev))
5739 		return;
5740 
5741 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5742 	iommu_skip_te_disable = 1;
5743 }
5744 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5745 
5746 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5747    ISOCH DMAR unit for the Azalia sound device, but not give it any
5748    TLB entries, which causes it to deadlock. Check for that.  We do
5749    this in a function called from init_dmars(), instead of in a PCI
5750    quirk, because we don't want to print the obnoxious "BIOS broken"
5751    message if VT-d is actually disabled.
5752 */
5753 static void __init check_tylersburg_isoch(void)
5754 {
5755 	struct pci_dev *pdev;
5756 	uint32_t vtisochctrl;
5757 
5758 	/* If there's no Azalia in the system anyway, forget it. */
5759 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5760 	if (!pdev)
5761 		return;
5762 
5763 	if (risky_device(pdev)) {
5764 		pci_dev_put(pdev);
5765 		return;
5766 	}
5767 
5768 	pci_dev_put(pdev);
5769 
5770 	/* System Management Registers. Might be hidden, in which case
5771 	   we can't do the sanity check. But that's OK, because the
5772 	   known-broken BIOSes _don't_ actually hide it, so far. */
5773 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5774 	if (!pdev)
5775 		return;
5776 
5777 	if (risky_device(pdev)) {
5778 		pci_dev_put(pdev);
5779 		return;
5780 	}
5781 
5782 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5783 		pci_dev_put(pdev);
5784 		return;
5785 	}
5786 
5787 	pci_dev_put(pdev);
5788 
5789 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5790 	if (vtisochctrl & 1)
5791 		return;
5792 
5793 	/* Drop all bits other than the number of TLB entries */
5794 	vtisochctrl &= 0x1c;
5795 
5796 	/* If we have the recommended number of TLB entries (16), fine. */
5797 	if (vtisochctrl == 0x10)
5798 		return;
5799 
5800 	/* Zero TLB entries? You get to ride the short bus to school. */
5801 	if (!vtisochctrl) {
5802 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5803 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5804 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5805 		     dmi_get_system_info(DMI_BIOS_VERSION),
5806 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5807 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5808 		return;
5809 	}
5810 
5811 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5812 	       vtisochctrl);
5813 }
5814