xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 19b438592238b3b40c3f945bb5f9c4ca971c0c45)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 
48 #include "../irq_remapping.h"
49 #include "pasid.h"
50 #include "cap_audit.h"
51 
52 #define ROOT_SIZE		VTD_PAGE_SIZE
53 #define CONTEXT_SIZE		VTD_PAGE_SIZE
54 
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
57 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
58 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 
60 #define IOAPIC_RANGE_START	(0xfee00000)
61 #define IOAPIC_RANGE_END	(0xfeefffff)
62 #define IOVA_START_ADDR		(0x1000)
63 
64 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 
66 #define MAX_AGAW_WIDTH 64
67 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 
69 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
70 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
71 
72 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
73    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
74 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
75 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
76 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
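
/*
 * Illustrative example, assuming VTD_PAGE_SHIFT == 12 (4KiB VT-d pages):
 * for a 48-bit guest address width, __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48) is ((1ULL << 36) - 1) << 12,
 * i.e. the base address of the highest 4KiB page below 1ULL << 48.
 */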
77 
78 /* IO virtual address start page frame number */
79 #define IOVA_START_PFN		(1)
80 
81 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
82 
83 /* page table handling */
84 #define LEVEL_STRIDE		(9)
85 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
86 
87 /*
88  * This bitmap is used to advertise the page sizes our hardware supports
89  * to the IOMMU core, which will then use this information to split
90  * physically contiguous memory regions it is mapping into page sizes
91  * that we support.
92  *
93  * Traditionally the IOMMU core just handed us the mappings directly,
94  * after making sure the size is a power-of-two multiple of 4KiB and that the
95  * mapping has natural alignment.
96  *
97  * To retain this behavior, we currently advertise that we support
98  * all page sizes that are a power-of-two multiple of 4KiB.
99  *
100  * If at some point we'd like to utilize the IOMMU core's new behavior,
101  * we could change this to advertise the real page sizes we support.
102  */
103 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
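
/*
 * Illustrative note on the bitmap above: bit N set means "a page size of
 * 1 << N bytes is supported". ~0xFFFUL clears bits 0-11 and sets every
 * higher bit, so we advertise every power-of-two size from 4KiB (bit 12)
 * upwards, e.g. 2MiB (bit 21) and 1GiB (bit 30).
 */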
104 
105 static inline int agaw_to_level(int agaw)
106 {
107 	return agaw + 2;
108 }
109 
110 static inline int agaw_to_width(int agaw)
111 {
112 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 }
114 
115 static inline int width_to_agaw(int width)
116 {
117 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
118 }
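
/*
 * Worked example for the AGAW helpers above: with LEVEL_STRIDE == 9,
 * agaw 1/2/3 correspond to 39/48/57-bit address widths and, via
 * agaw_to_level(), to 3/4/5-level page tables. width_to_agaw(48) == 2,
 * and agaw_to_width() is capped at MAX_AGAW_WIDTH (64).
 */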
119 
120 static inline unsigned int level_to_offset_bits(int level)
121 {
122 	return (level - 1) * LEVEL_STRIDE;
123 }
124 
125 static inline int pfn_level_offset(u64 pfn, int level)
126 {
127 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
128 }
129 
130 static inline u64 level_mask(int level)
131 {
132 	return -1ULL << level_to_offset_bits(level);
133 }
134 
135 static inline u64 level_size(int level)
136 {
137 	return 1ULL << level_to_offset_bits(level);
138 }
139 
140 static inline u64 align_to_level(u64 pfn, int level)
141 {
142 	return (pfn + level_size(level) - 1) & level_mask(level);
143 }
144 
145 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 {
147 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
148 }
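
/*
 * Worked example for the level helpers above: at level 2 (the 2MiB level
 * with 4KiB base pages), level_to_offset_bits() == 9, level_size() == 512
 * VT-d pages, level_mask() == ~0x1ffULL, and pfn_level_offset(pfn, 2) ==
 * (pfn >> 9) & 0x1ff, i.e. the index of the level-2 PTE covering pfn.
 */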
149 
150 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
151    are never going to work. */
152 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 {
154 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 
157 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 {
159 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 {
163 	return mm_to_dma_pfn(page_to_pfn(pg));
164 }
165 static inline unsigned long virt_to_dma_pfn(void *p)
166 {
167 	return page_to_dma_pfn(virt_to_page(p));
168 }
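
/*
 * Note (illustrative): with 4KiB MM pages, as on x86 where
 * PAGE_SHIFT == VTD_PAGE_SHIFT == 12, the conversions above are identity
 * operations; with larger MM pages each mm pfn would correspond to
 * several 4KiB VT-d pfns, hence the shifts.
 */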
169 
170 /* global iommu list, set NULL for ignored DMAR units */
171 static struct intel_iommu **g_iommus;
172 
173 static void __init check_tylersburg_isoch(void);
174 static int rwbf_quirk;
175 
176 /*
177  * set to 1 to panic the kernel if VT-d can't successfully be enabled
178  * (used when the kernel is launched w/ TXT)
179  */
180 static int force_on = 0;
181 static int intel_iommu_tboot_noforce;
182 static int no_platform_optin;
183 
184 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
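
/*
 * Illustrative arithmetic: struct root_entry is two u64s (lo/hi, 16
 * bytes), so with a 4KiB VTD_PAGE_SIZE this works out to 4096 / 16 = 256
 * root entries -- one per PCI bus number.
 */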
185 
186 /*
187  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
188  * if marked present.
189  */
190 static phys_addr_t root_entry_lctp(struct root_entry *re)
191 {
192 	if (!(re->lo & 1))
193 		return 0;
194 
195 	return re->lo & VTD_PAGE_MASK;
196 }
197 
198 /*
199  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_uctp(struct root_entry *re)
203 {
204 	if (!(re->hi & 1))
205 		return 0;
206 
207 	return re->hi & VTD_PAGE_MASK;
208 }
209 
210 static inline void context_clear_pasid_enable(struct context_entry *context)
211 {
212 	context->lo &= ~(1ULL << 11);
213 }
214 
215 static inline bool context_pasid_enabled(struct context_entry *context)
216 {
217 	return !!(context->lo & (1ULL << 11));
218 }
219 
220 static inline void context_set_copied(struct context_entry *context)
221 {
222 	context->hi |= (1ull << 3);
223 }
224 
225 static inline bool context_copied(struct context_entry *context)
226 {
227 	return !!(context->hi & (1ULL << 3));
228 }
229 
230 static inline bool __context_present(struct context_entry *context)
231 {
232 	return (context->lo & 1);
233 }
234 
235 bool context_present(struct context_entry *context)
236 {
237 	return context_pasid_enabled(context) ?
238 	     __context_present(context) :
239 	     __context_present(context) && !context_copied(context);
240 }
241 
242 static inline void context_set_present(struct context_entry *context)
243 {
244 	context->lo |= 1;
245 }
246 
247 static inline void context_set_fault_enable(struct context_entry *context)
248 {
249 	context->lo &= (((u64)-1) << 2) | 1;
250 }
251 
252 static inline void context_set_translation_type(struct context_entry *context,
253 						unsigned long value)
254 {
255 	context->lo &= (((u64)-1) << 4) | 3;
256 	context->lo |= (value & 3) << 2;
257 }
258 
259 static inline void context_set_address_root(struct context_entry *context,
260 					    unsigned long value)
261 {
262 	context->lo &= ~VTD_PAGE_MASK;
263 	context->lo |= value & VTD_PAGE_MASK;
264 }
265 
266 static inline void context_set_address_width(struct context_entry *context,
267 					     unsigned long value)
268 {
269 	context->hi |= value & 7;
270 }
271 
272 static inline void context_set_domain_id(struct context_entry *context,
273 					 unsigned long value)
274 {
275 	context->hi |= (value & ((1 << 16) - 1)) << 8;
276 }
277 
278 static inline int context_domain_id(struct context_entry *c)
279 {
280 	return((c->hi >> 8) & 0xffff);
281 }
282 
283 static inline void context_clear_entry(struct context_entry *context)
284 {
285 	context->lo = 0;
286 	context->hi = 0;
287 }
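
/*
 * Summary of the context-entry bits as used by the helpers above (see the
 * VT-d specification for the authoritative layout): lo bit 0 is Present,
 * lo bit 1 is fault-processing disable (cleared by
 * context_set_fault_enable()), lo bits 3:2 select the translation type,
 * lo bit 11 is the PASID enable checked by context_pasid_enabled(), and
 * lo bits 63:12 hold the address root set by context_set_address_root().
 * In the high word, bits 2:0 encode the address width, bit 3 marks an
 * entry copied from a previous kernel, and bits 23:8 hold the domain id.
 */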
288 
289 /*
290  * This domain is a statically identity mapping domain.
291  *	1. This domain creates a static 1:1 mapping to all usable memory.
292  * 	2. It maps to each iommu if successful.
293  *	3. Each iommu maps to this domain if successful.
294  */
295 static struct dmar_domain *si_domain;
296 static int hw_pass_through = 1;
297 
298 #define for_each_domain_iommu(idx, domain)			\
299 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
300 		if (domain->iommu_refcnt[idx])
301 
302 struct dmar_rmrr_unit {
303 	struct list_head list;		/* list of rmrr units	*/
304 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
305 	u64	base_address;		/* reserved base address*/
306 	u64	end_address;		/* reserved end address */
307 	struct dmar_dev_scope *devices;	/* target devices */
308 	int	devices_cnt;		/* target device count */
309 };
310 
311 struct dmar_atsr_unit {
312 	struct list_head list;		/* list of ATSR units */
313 	struct acpi_dmar_header *hdr;	/* ACPI header */
314 	struct dmar_dev_scope *devices;	/* target devices */
315 	int devices_cnt;		/* target device count */
316 	u8 include_all:1;		/* include all ports */
317 };
318 
319 struct dmar_satc_unit {
320 	struct list_head list;		/* list of SATC units */
321 	struct acpi_dmar_header *hdr;	/* ACPI header */
322 	struct dmar_dev_scope *devices;	/* target devices */
323 	struct intel_iommu *iommu;	/* the corresponding iommu */
324 	int devices_cnt;		/* target device count */
325 	u8 atc_required:1;		/* ATS is required */
326 };
327 
328 static LIST_HEAD(dmar_atsr_units);
329 static LIST_HEAD(dmar_rmrr_units);
330 static LIST_HEAD(dmar_satc_units);
331 
332 #define for_each_rmrr_units(rmrr) \
333 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334 
335 /* number of IOMMUs, used to size and index the g_iommus array */
336 static int g_num_of_iommus;
337 
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int intel_iommu_attach_device(struct iommu_domain *domain,
343 				     struct device *dev);
344 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
345 					    dma_addr_t iova);
346 
347 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
348 int dmar_disabled = 0;
349 #else
350 int dmar_disabled = 1;
351 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
352 
353 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
354 int intel_iommu_sm = 1;
355 #else
356 int intel_iommu_sm;
357 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
358 
359 int intel_iommu_enabled = 0;
360 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
361 
362 static int dmar_map_gfx = 1;
363 static int intel_iommu_strict;
364 static int intel_iommu_superpage = 1;
365 static int iommu_identity_mapping;
366 static int iommu_skip_te_disable;
367 
368 #define IDENTMAP_GFX		2
369 #define IDENTMAP_AZALIA		4
370 
371 int intel_iommu_gfx_mapped;
372 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
373 
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 struct device_domain_info *get_domain_info(struct device *dev)
376 {
377 	struct device_domain_info *info;
378 
379 	if (!dev)
380 		return NULL;
381 
382 	info = dev_iommu_priv_get(dev);
383 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
384 		return NULL;
385 
386 	return info;
387 }
388 
389 DEFINE_SPINLOCK(device_domain_lock);
390 static LIST_HEAD(device_domain_list);
391 
392 /*
393  * Iterate over elements in device_domain_list and call the specified
394  * callback @fn against each element.
395  */
396 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
397 				     void *data), void *data)
398 {
399 	int ret = 0;
400 	unsigned long flags;
401 	struct device_domain_info *info;
402 
403 	spin_lock_irqsave(&device_domain_lock, flags);
404 	list_for_each_entry(info, &device_domain_list, global) {
405 		ret = fn(info, data);
406 		if (ret) {
407 			spin_unlock_irqrestore(&device_domain_lock, flags);
408 			return ret;
409 		}
410 	}
411 	spin_unlock_irqrestore(&device_domain_lock, flags);
412 
413 	return 0;
414 }
415 
416 const struct iommu_ops intel_iommu_ops;
417 
418 static bool translation_pre_enabled(struct intel_iommu *iommu)
419 {
420 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
421 }
422 
423 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
424 {
425 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
426 }
427 
428 static void init_translation_status(struct intel_iommu *iommu)
429 {
430 	u32 gsts;
431 
432 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
433 	if (gsts & DMA_GSTS_TES)
434 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
435 }
436 
437 static int __init intel_iommu_setup(char *str)
438 {
439 	if (!str)
440 		return -EINVAL;
441 	while (*str) {
442 		if (!strncmp(str, "on", 2)) {
443 			dmar_disabled = 0;
444 			pr_info("IOMMU enabled\n");
445 		} else if (!strncmp(str, "off", 3)) {
446 			dmar_disabled = 1;
447 			no_platform_optin = 1;
448 			pr_info("IOMMU disabled\n");
449 		} else if (!strncmp(str, "igfx_off", 8)) {
450 			dmar_map_gfx = 0;
451 			pr_info("Disable GFX device mapping\n");
452 		} else if (!strncmp(str, "forcedac", 8)) {
453 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
454 			iommu_dma_forcedac = true;
455 		} else if (!strncmp(str, "strict", 6)) {
456 			pr_info("Disable batched IOTLB flush\n");
457 			intel_iommu_strict = 1;
458 		} else if (!strncmp(str, "sp_off", 6)) {
459 			pr_info("Disable supported super page\n");
460 			intel_iommu_superpage = 0;
461 		} else if (!strncmp(str, "sm_on", 5)) {
462 			pr_info("Intel-IOMMU: scalable mode supported\n");
463 			intel_iommu_sm = 1;
464 		} else if (!strncmp(str, "tboot_noforce", 13)) {
465 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466 			intel_iommu_tboot_noforce = 1;
467 		}
468 
469 		str += strcspn(str, ",");
470 		while (*str == ',')
471 			str++;
472 	}
473 	return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
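
/*
 * Usage example (illustrative): the parser above takes a comma-separated
 * option list, so booting with "intel_iommu=on,sm_on,strict" enables the
 * IOMMU, scalable mode and strict (unbatched) IOTLB flushing.
 */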
476 
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479 
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482 	struct dmar_domain **domains;
483 	int idx = did >> 8;
484 
485 	domains = iommu->domains[idx];
486 	if (!domains)
487 		return NULL;
488 
489 	return domains[did & 0xff];
490 }
491 
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 			     struct dmar_domain *domain)
494 {
495 	struct dmar_domain **domains;
496 	int idx = did >> 8;
497 
498 	if (!iommu->domains[idx]) {
499 		size_t size = 256 * sizeof(struct dmar_domain *);
500 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501 	}
502 
503 	domains = iommu->domains[idx];
504 	if (WARN_ON(!domains))
505 		return;
506 	else
507 		domains[did & 0xff] = domain;
508 }
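
/*
 * Note on the two-level domain array used above (illustrative): a domain
 * id is split into a high byte indexing iommu->domains and a low byte
 * indexing a lazily allocated chunk of 256 pointers, so e.g. did 0x1234
 * lives at iommu->domains[0x12][0x34].
 */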
509 
510 void *alloc_pgtable_page(int node)
511 {
512 	struct page *page;
513 	void *vaddr = NULL;
514 
515 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516 	if (page)
517 		vaddr = page_address(page);
518 	return vaddr;
519 }
520 
521 void free_pgtable_page(void *vaddr)
522 {
523 	free_page((unsigned long)vaddr);
524 }
525 
526 static inline void *alloc_domain_mem(void)
527 {
528 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 }
530 
531 static void free_domain_mem(void *vaddr)
532 {
533 	kmem_cache_free(iommu_domain_cache, vaddr);
534 }
535 
536 static inline void * alloc_devinfo_mem(void)
537 {
538 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 }
540 
541 static inline void free_devinfo_mem(void *vaddr)
542 {
543 	kmem_cache_free(iommu_devinfo_cache, vaddr);
544 }
545 
546 static inline int domain_type_is_si(struct dmar_domain *domain)
547 {
548 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
549 }
550 
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 {
553 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
554 }
555 
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
557 				       unsigned long pfn)
558 {
559 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 
561 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
562 }
563 
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 {
566 	unsigned long sagaw;
567 	int agaw = -1;
568 
569 	sagaw = cap_sagaw(iommu->cap);
570 	for (agaw = width_to_agaw(max_gaw);
571 	     agaw >= 0; agaw--) {
572 		if (test_bit(agaw, &sagaw))
573 			break;
574 	}
575 
576 	return agaw;
577 }
578 
579 /*
580  * Calculate max SAGAW for each iommu.
581  */
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 {
584 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
585 }
586 
587 /*
588  * Calculate the agaw for each iommu.
589  * "SAGAW" may be different across iommus; use a default agaw, and fall
590  * back to a smaller supported agaw for iommus that don't support the default.
591  */
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 {
594 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
595 }
596 
597 /* This function only returns a single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 {
600 	int iommu_id;
601 
602 	/* si_domain and vm domain should not get here. */
603 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
604 		return NULL;
605 
606 	for_each_domain_iommu(iommu_id, domain)
607 		break;
608 
609 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
610 		return NULL;
611 
612 	return g_iommus[iommu_id];
613 }
614 
615 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 {
617 	return sm_supported(iommu) ?
618 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
619 }
620 
621 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 {
623 	struct dmar_drhd_unit *drhd;
624 	struct intel_iommu *iommu;
625 	bool found = false;
626 	int i;
627 
628 	domain->iommu_coherency = 1;
629 
630 	for_each_domain_iommu(i, domain) {
631 		found = true;
632 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
633 			domain->iommu_coherency = 0;
634 			break;
635 		}
636 	}
637 	if (found)
638 		return;
639 
640 	/* No hardware attached; use lowest common denominator */
641 	rcu_read_lock();
642 	for_each_active_iommu(iommu, drhd) {
643 		if (!iommu_paging_structure_coherency(iommu)) {
644 			domain->iommu_coherency = 0;
645 			break;
646 		}
647 	}
648 	rcu_read_unlock();
649 }
650 
651 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 {
653 	struct dmar_drhd_unit *drhd;
654 	struct intel_iommu *iommu;
655 	int ret = 1;
656 
657 	rcu_read_lock();
658 	for_each_active_iommu(iommu, drhd) {
659 		if (iommu != skip) {
660 			/*
661 			 * If the hardware is operating in the scalable mode,
662 			 * the snooping control is always supported since we
663 			 * always set PASID-table-entry.PGSNP bit if the domain
664 			 * is managed outside (UNMANAGED).
665 			 */
666 			if (!sm_supported(iommu) &&
667 			    !ecap_sc_support(iommu->ecap)) {
668 				ret = 0;
669 				break;
670 			}
671 		}
672 	}
673 	rcu_read_unlock();
674 
675 	return ret;
676 }
677 
678 static int domain_update_iommu_superpage(struct dmar_domain *domain,
679 					 struct intel_iommu *skip)
680 {
681 	struct dmar_drhd_unit *drhd;
682 	struct intel_iommu *iommu;
683 	int mask = 0x3;
684 
685 	if (!intel_iommu_superpage) {
686 		return 0;
687 	}
688 
689 	/* set iommu_superpage to the smallest common denominator */
690 	rcu_read_lock();
691 	for_each_active_iommu(iommu, drhd) {
692 		if (iommu != skip) {
693 			if (domain && domain_use_first_level(domain)) {
694 				if (!cap_fl1gp_support(iommu->cap))
695 					mask = 0x1;
696 			} else {
697 				mask &= cap_super_page_val(iommu->cap);
698 			}
699 
700 			if (!mask)
701 				break;
702 		}
703 	}
704 	rcu_read_unlock();
705 
706 	return fls(mask);
707 }
708 
709 static int domain_update_device_node(struct dmar_domain *domain)
710 {
711 	struct device_domain_info *info;
712 	int nid = NUMA_NO_NODE;
713 
714 	assert_spin_locked(&device_domain_lock);
715 
716 	if (list_empty(&domain->devices))
717 		return NUMA_NO_NODE;
718 
719 	list_for_each_entry(info, &domain->devices, link) {
720 		if (!info->dev)
721 			continue;
722 
723 		/*
724 		 * There could be multiple device NUMA nodes, as devices within the
725 		 * same domain may sit behind different IOMMUs. There is no perfect
726 		 * answer in such a situation, so we pick the node of the first
727 		 * device we find (first come, first served).
728 		 */
729 		nid = dev_to_node(info->dev);
730 		if (nid != NUMA_NO_NODE)
731 			break;
732 	}
733 
734 	return nid;
735 }
736 
737 static void domain_update_iotlb(struct dmar_domain *domain);
738 
739 /* Some capabilities may be different across iommus */
740 static void domain_update_iommu_cap(struct dmar_domain *domain)
741 {
742 	domain_update_iommu_coherency(domain);
743 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
744 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
745 
746 	/*
747 	 * If RHSA is missing, we should default to the device NUMA node
748 	 * as a fallback.
749 	 */
750 	if (domain->nid == NUMA_NO_NODE)
751 		domain->nid = domain_update_device_node(domain);
752 
753 	/*
754 	 * First-level translation restricts the input-address to a
755 	 * canonical address (i.e., address bits 63:N have the same
756 	 * value as address bit [N-1], where N is 48-bits with 4-level
757 	 * paging and 57-bits with 5-level paging). Hence, skip bit
758 	 * [N-1].
759 	 */
760 	if (domain_use_first_level(domain))
761 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
762 	else
763 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
764 
765 	domain_update_iotlb(domain);
766 }
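
/*
 * Illustrative example for the aperture adjustment above: with a 48-bit
 * domain->gaw, second-level translation yields an aperture_end of
 * (1ULL << 48) - 1, while first-level translation skips the canonical
 * bit and yields (1ULL << 47) - 1.
 */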
767 
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 					 u8 devfn, int alloc)
770 {
771 	struct root_entry *root = &iommu->root_entry[bus];
772 	struct context_entry *context;
773 	u64 *entry;
774 
775 	entry = &root->lo;
776 	if (sm_supported(iommu)) {
777 		if (devfn >= 0x80) {
778 			devfn -= 0x80;
779 			entry = &root->hi;
780 		}
781 		devfn *= 2;
782 	}
783 	if (*entry & 1)
784 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 	else {
786 		unsigned long phy_addr;
787 		if (!alloc)
788 			return NULL;
789 
790 		context = alloc_pgtable_page(iommu->node);
791 		if (!context)
792 			return NULL;
793 
794 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 		phy_addr = virt_to_phys((void *)context);
796 		*entry = phy_addr | 1;
797 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
798 	}
799 	return &context[devfn];
800 }
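
/*
 * Illustrative example for the indexing above: in legacy mode the single
 * context table behind root->lo is indexed directly by devfn 0..255. With
 * sm_supported(), devfn 0x85 selects root->hi and is remapped to
 * (0x85 - 0x80) * 2 = 0x0a, because a scalable-mode context entry
 * occupies two legacy-sized (128-bit) struct context_entry slots.
 */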
801 
802 static bool attach_deferred(struct device *dev)
803 {
804 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805 }
806 
807 /**
808  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809  *				 sub-hierarchy of a candidate PCI-PCI bridge
810  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811  * @bridge: the candidate PCI-PCI bridge
812  *
813  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814  */
815 static bool
816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817 {
818 	struct pci_dev *pdev, *pbridge;
819 
820 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 		return false;
822 
823 	pdev = to_pci_dev(dev);
824 	pbridge = to_pci_dev(bridge);
825 
826 	if (pbridge->subordinate &&
827 	    pbridge->subordinate->number <= pdev->bus->number &&
828 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 		return true;
830 
831 	return false;
832 }
833 
834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835 {
836 	struct dmar_drhd_unit *drhd;
837 	u32 vtbar;
838 	int rc;
839 
840 	/* We know that this device on this chipset has its own IOMMU.
841 	 * If we find it under a different IOMMU, then the BIOS is lying
842 	 * to us. Hope that the IOMMU for this device is actually
843 	 * disabled, and it needs no translation...
844 	 */
845 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 	if (rc) {
847 		/* "can't" happen */
848 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 		return false;
850 	}
851 	vtbar &= 0xffff0000;
852 
853 	/* we know that this iommu should be at offset 0xa000 from vtbar */
854 	drhd = dmar_find_matched_drhd_unit(pdev);
855 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 		return true;
859 	}
860 
861 	return false;
862 }
863 
864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865 {
866 	if (!iommu || iommu->drhd->ignored)
867 		return true;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pdev = to_pci_dev(dev);
871 
872 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 		    quirk_ioat_snb_local_iommu(pdev))
875 			return true;
876 	}
877 
878 	return false;
879 }
880 
881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882 {
883 	struct dmar_drhd_unit *drhd = NULL;
884 	struct pci_dev *pdev = NULL;
885 	struct intel_iommu *iommu;
886 	struct device *tmp;
887 	u16 segment = 0;
888 	int i;
889 
890 	if (!dev)
891 		return NULL;
892 
893 	if (dev_is_pci(dev)) {
894 		struct pci_dev *pf_pdev;
895 
896 		pdev = pci_real_dma_dev(to_pci_dev(dev));
897 
898 		/* VFs aren't listed in scope tables; we need to look up
899 		 * the PF instead to find the IOMMU. */
900 		pf_pdev = pci_physfn(pdev);
901 		dev = &pf_pdev->dev;
902 		segment = pci_domain_nr(pdev->bus);
903 	} else if (has_acpi_companion(dev))
904 		dev = &ACPI_COMPANION(dev)->dev;
905 
906 	rcu_read_lock();
907 	for_each_iommu(iommu, drhd) {
908 		if (pdev && segment != drhd->segment)
909 			continue;
910 
911 		for_each_active_dev_scope(drhd->devices,
912 					  drhd->devices_cnt, i, tmp) {
913 			if (tmp == dev) {
914 				/* For a VF use its original BDF# not that of the PF
915 				 * which we used for the IOMMU lookup. Strictly speaking
916 				 * we could do this for all PCI devices; we only need to
917 				 * get the BDF# from the scope table for ACPI matches. */
918 				if (pdev && pdev->is_virtfn)
919 					goto got_pdev;
920 
921 				if (bus && devfn) {
922 					*bus = drhd->devices[i].bus;
923 					*devfn = drhd->devices[i].devfn;
924 				}
925 				goto out;
926 			}
927 
928 			if (is_downstream_to_pci_bridge(dev, tmp))
929 				goto got_pdev;
930 		}
931 
932 		if (pdev && drhd->include_all) {
933 		got_pdev:
934 			if (bus && devfn) {
935 				*bus = pdev->bus->number;
936 				*devfn = pdev->devfn;
937 			}
938 			goto out;
939 		}
940 	}
941 	iommu = NULL;
942  out:
943 	if (iommu_is_dummy(iommu, dev))
944 		iommu = NULL;
945 
946 	rcu_read_unlock();
947 
948 	return iommu;
949 }
950 
951 static void domain_flush_cache(struct dmar_domain *domain,
952 			       void *addr, int size)
953 {
954 	if (!domain->iommu_coherency)
955 		clflush_cache_range(addr, size);
956 }
957 
958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959 {
960 	struct context_entry *context;
961 	int ret = 0;
962 	unsigned long flags;
963 
964 	spin_lock_irqsave(&iommu->lock, flags);
965 	context = iommu_context_addr(iommu, bus, devfn, 0);
966 	if (context)
967 		ret = context_present(context);
968 	spin_unlock_irqrestore(&iommu->lock, flags);
969 	return ret;
970 }
971 
972 static void free_context_table(struct intel_iommu *iommu)
973 {
974 	int i;
975 	unsigned long flags;
976 	struct context_entry *context;
977 
978 	spin_lock_irqsave(&iommu->lock, flags);
979 	if (!iommu->root_entry) {
980 		goto out;
981 	}
982 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 		context = iommu_context_addr(iommu, i, 0, 0);
984 		if (context)
985 			free_pgtable_page(context);
986 
987 		if (!sm_supported(iommu))
988 			continue;
989 
990 		context = iommu_context_addr(iommu, i, 0x80, 0);
991 		if (context)
992 			free_pgtable_page(context);
993 
994 	}
995 	free_pgtable_page(iommu->root_entry);
996 	iommu->root_entry = NULL;
997 out:
998 	spin_unlock_irqrestore(&iommu->lock, flags);
999 }
1000 
1001 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002 				      unsigned long pfn, int *target_level)
1003 {
1004 	struct dma_pte *parent, *pte;
1005 	int level = agaw_to_level(domain->agaw);
1006 	int offset;
1007 
1008 	BUG_ON(!domain->pgd);
1009 
1010 	if (!domain_pfn_supported(domain, pfn))
1011 		/* Address beyond IOMMU's addressing capabilities. */
1012 		return NULL;
1013 
1014 	parent = domain->pgd;
1015 
1016 	while (1) {
1017 		void *tmp_page;
1018 
1019 		offset = pfn_level_offset(pfn, level);
1020 		pte = &parent[offset];
1021 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022 			break;
1023 		if (level == *target_level)
1024 			break;
1025 
1026 		if (!dma_pte_present(pte)) {
1027 			uint64_t pteval;
1028 
1029 			tmp_page = alloc_pgtable_page(domain->nid);
1030 
1031 			if (!tmp_page)
1032 				return NULL;
1033 
1034 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036 			if (domain_use_first_level(domain)) {
1037 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1038 				if (domain->domain.type == IOMMU_DOMAIN_DMA)
1039 					pteval |= DMA_FL_PTE_ACCESS;
1040 			}
1041 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1042 				/* Someone else set it while we were thinking; use theirs. */
1043 				free_pgtable_page(tmp_page);
1044 			else
1045 				domain_flush_cache(domain, pte, sizeof(*pte));
1046 		}
1047 		if (level == 1)
1048 			break;
1049 
1050 		parent = phys_to_virt(dma_pte_addr(pte));
1051 		level--;
1052 	}
1053 
1054 	if (!*target_level)
1055 		*target_level = level;
1056 
1057 	return pte;
1058 }
1059 
1060 /* return address's pte at specific level */
1061 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1062 					 unsigned long pfn,
1063 					 int level, int *large_page)
1064 {
1065 	struct dma_pte *parent, *pte;
1066 	int total = agaw_to_level(domain->agaw);
1067 	int offset;
1068 
1069 	parent = domain->pgd;
1070 	while (level <= total) {
1071 		offset = pfn_level_offset(pfn, total);
1072 		pte = &parent[offset];
1073 		if (level == total)
1074 			return pte;
1075 
1076 		if (!dma_pte_present(pte)) {
1077 			*large_page = total;
1078 			break;
1079 		}
1080 
1081 		if (dma_pte_superpage(pte)) {
1082 			*large_page = total;
1083 			return pte;
1084 		}
1085 
1086 		parent = phys_to_virt(dma_pte_addr(pte));
1087 		total--;
1088 	}
1089 	return NULL;
1090 }
1091 
1092 /* clear last level pte; a tlb flush should follow */
1093 static void dma_pte_clear_range(struct dmar_domain *domain,
1094 				unsigned long start_pfn,
1095 				unsigned long last_pfn)
1096 {
1097 	unsigned int large_page;
1098 	struct dma_pte *first_pte, *pte;
1099 
1100 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1101 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1102 	BUG_ON(start_pfn > last_pfn);
1103 
1104 	/* we don't need lock here; nobody else touches the iova range */
1105 	do {
1106 		large_page = 1;
1107 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1108 		if (!pte) {
1109 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1110 			continue;
1111 		}
1112 		do {
1113 			dma_clear_pte(pte);
1114 			start_pfn += lvl_to_nr_pages(large_page);
1115 			pte++;
1116 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1117 
1118 		domain_flush_cache(domain, first_pte,
1119 				   (void *)pte - (void *)first_pte);
1120 
1121 	} while (start_pfn && start_pfn <= last_pfn);
1122 }
1123 
1124 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1125 			       int retain_level, struct dma_pte *pte,
1126 			       unsigned long pfn, unsigned long start_pfn,
1127 			       unsigned long last_pfn)
1128 {
1129 	pfn = max(start_pfn, pfn);
1130 	pte = &pte[pfn_level_offset(pfn, level)];
1131 
1132 	do {
1133 		unsigned long level_pfn;
1134 		struct dma_pte *level_pte;
1135 
1136 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1137 			goto next;
1138 
1139 		level_pfn = pfn & level_mask(level);
1140 		level_pte = phys_to_virt(dma_pte_addr(pte));
1141 
1142 		if (level > 2) {
1143 			dma_pte_free_level(domain, level - 1, retain_level,
1144 					   level_pte, level_pfn, start_pfn,
1145 					   last_pfn);
1146 		}
1147 
1148 		/*
1149 		 * Free the page table if we're below the level we want to
1150 		 * retain and the range covers the entire table.
1151 		 */
1152 		if (level < retain_level && !(start_pfn > level_pfn ||
1153 		      last_pfn < level_pfn + level_size(level) - 1)) {
1154 			dma_clear_pte(pte);
1155 			domain_flush_cache(domain, pte, sizeof(*pte));
1156 			free_pgtable_page(level_pte);
1157 		}
1158 next:
1159 		pfn += level_size(level);
1160 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161 }
1162 
1163 /*
1164  * clear last level (leaf) ptes and free page table pages below the
1165  * level we wish to keep intact.
1166  */
1167 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168 				   unsigned long start_pfn,
1169 				   unsigned long last_pfn,
1170 				   int retain_level)
1171 {
1172 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 	BUG_ON(start_pfn > last_pfn);
1175 
1176 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1177 
1178 	/* We don't need lock here; nobody else touches the iova range */
1179 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1180 			   domain->pgd, 0, start_pfn, last_pfn);
1181 
1182 	/* free pgd */
1183 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184 		free_pgtable_page(domain->pgd);
1185 		domain->pgd = NULL;
1186 	}
1187 }
1188 
1189 /* When a page at a given level is being unlinked from its parent, we don't
1190    need to *modify* it at all. All we need to do is make a list of all the
1191    pages which can be freed just as soon as we've flushed the IOTLB and we
1192    know the hardware page-walk will no longer touch them.
1193    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194    be freed. */
1195 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196 					    int level, struct dma_pte *pte,
1197 					    struct page *freelist)
1198 {
1199 	struct page *pg;
1200 
1201 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202 	pg->freelist = freelist;
1203 	freelist = pg;
1204 
1205 	if (level == 1)
1206 		return freelist;
1207 
1208 	pte = page_address(pg);
1209 	do {
1210 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211 			freelist = dma_pte_list_pagetables(domain, level - 1,
1212 							   pte, freelist);
1213 		pte++;
1214 	} while (!first_pte_in_page(pte));
1215 
1216 	return freelist;
1217 }
1218 
1219 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 					struct dma_pte *pte, unsigned long pfn,
1221 					unsigned long start_pfn,
1222 					unsigned long last_pfn,
1223 					struct page *freelist)
1224 {
1225 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226 
1227 	pfn = max(start_pfn, pfn);
1228 	pte = &pte[pfn_level_offset(pfn, level)];
1229 
1230 	do {
1231 		unsigned long level_pfn;
1232 
1233 		if (!dma_pte_present(pte))
1234 			goto next;
1235 
1236 		level_pfn = pfn & level_mask(level);
1237 
1238 		/* If range covers entire pagetable, free it */
1239 		if (start_pfn <= level_pfn &&
1240 		    last_pfn >= level_pfn + level_size(level) - 1) {
1241 			/* These subordinate page tables are going away entirely. Don't
1242 			   bother to clear them; we're just going to *free* them. */
1243 			if (level > 1 && !dma_pte_superpage(pte))
1244 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245 
1246 			dma_clear_pte(pte);
1247 			if (!first_pte)
1248 				first_pte = pte;
1249 			last_pte = pte;
1250 		} else if (level > 1) {
1251 			/* Recurse down into a level that isn't *entirely* obsolete */
1252 			freelist = dma_pte_clear_level(domain, level - 1,
1253 						       phys_to_virt(dma_pte_addr(pte)),
1254 						       level_pfn, start_pfn, last_pfn,
1255 						       freelist);
1256 		}
1257 next:
1258 		pfn += level_size(level);
1259 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260 
1261 	if (first_pte)
1262 		domain_flush_cache(domain, first_pte,
1263 				   (void *)++last_pte - (void *)first_pte);
1264 
1265 	return freelist;
1266 }
1267 
1268 /* We can't just free the pages because the IOMMU may still be walking
1269    the page tables, and may have cached the intermediate levels. The
1270    pages can only be freed after the IOTLB flush has been done. */
1271 static struct page *domain_unmap(struct dmar_domain *domain,
1272 				 unsigned long start_pfn,
1273 				 unsigned long last_pfn,
1274 				 struct page *freelist)
1275 {
1276 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278 	BUG_ON(start_pfn > last_pfn);
1279 
1280 	/* we don't need lock here; nobody else touches the iova range */
1281 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282 				       domain->pgd, 0, start_pfn, last_pfn,
1283 				       freelist);
1284 
1285 	/* free pgd */
1286 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287 		struct page *pgd_page = virt_to_page(domain->pgd);
1288 		pgd_page->freelist = freelist;
1289 		freelist = pgd_page;
1290 
1291 		domain->pgd = NULL;
1292 	}
1293 
1294 	return freelist;
1295 }
1296 
1297 static void dma_free_pagelist(struct page *freelist)
1298 {
1299 	struct page *pg;
1300 
1301 	while ((pg = freelist)) {
1302 		freelist = pg->freelist;
1303 		free_pgtable_page(page_address(pg));
1304 	}
1305 }
1306 
1307 /* iommu handling */
1308 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309 {
1310 	struct root_entry *root;
1311 	unsigned long flags;
1312 
1313 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314 	if (!root) {
1315 		pr_err("Allocating root entry for %s failed\n",
1316 			iommu->name);
1317 		return -ENOMEM;
1318 	}
1319 
1320 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1321 
1322 	spin_lock_irqsave(&iommu->lock, flags);
1323 	iommu->root_entry = root;
1324 	spin_unlock_irqrestore(&iommu->lock, flags);
1325 
1326 	return 0;
1327 }
1328 
1329 static void iommu_set_root_entry(struct intel_iommu *iommu)
1330 {
1331 	u64 addr;
1332 	u32 sts;
1333 	unsigned long flag;
1334 
1335 	addr = virt_to_phys(iommu->root_entry);
1336 	if (sm_supported(iommu))
1337 		addr |= DMA_RTADDR_SMT;
1338 
1339 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341 
1342 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343 
1344 	/* Make sure hardware completes it */
1345 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 		      readl, (sts & DMA_GSTS_RTPS), sts);
1347 
1348 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349 
1350 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1351 	if (sm_supported(iommu))
1352 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1353 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1354 }
1355 
1356 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1357 {
1358 	u32 val;
1359 	unsigned long flag;
1360 
1361 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1362 		return;
1363 
1364 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1365 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1366 
1367 	/* Make sure hardware completes it */
1368 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1369 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1370 
1371 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1372 }
1373 
1374 /* return value determines if we need a write buffer flush */
1375 static void __iommu_flush_context(struct intel_iommu *iommu,
1376 				  u16 did, u16 source_id, u8 function_mask,
1377 				  u64 type)
1378 {
1379 	u64 val = 0;
1380 	unsigned long flag;
1381 
1382 	switch (type) {
1383 	case DMA_CCMD_GLOBAL_INVL:
1384 		val = DMA_CCMD_GLOBAL_INVL;
1385 		break;
1386 	case DMA_CCMD_DOMAIN_INVL:
1387 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1388 		break;
1389 	case DMA_CCMD_DEVICE_INVL:
1390 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1391 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1392 		break;
1393 	default:
1394 		BUG();
1395 	}
1396 	val |= DMA_CCMD_ICC;
1397 
1398 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1399 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1400 
1401 	/* Make sure hardware completes it */
1402 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1403 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1404 
1405 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406 }
1407 
1408 /* return value determines if we need a write buffer flush */
1409 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1410 				u64 addr, unsigned int size_order, u64 type)
1411 {
1412 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1413 	u64 val = 0, val_iva = 0;
1414 	unsigned long flag;
1415 
1416 	switch (type) {
1417 	case DMA_TLB_GLOBAL_FLUSH:
1418 		/* global flush doesn't need to set IVA_REG */
1419 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1420 		break;
1421 	case DMA_TLB_DSI_FLUSH:
1422 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1423 		break;
1424 	case DMA_TLB_PSI_FLUSH:
1425 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1426 		/* IH bit is passed in as part of address */
1427 		val_iva = size_order | addr;
1428 		break;
1429 	default:
1430 		BUG();
1431 	}
1432 	/* Note: set drain read/write */
1433 #if 0
1434 	/*
1435 	 * This is probably only here to be extra safe. Looks like we can
1436 	 * ignore it without any impact.
1437 	 */
1438 	if (cap_read_drain(iommu->cap))
1439 		val |= DMA_TLB_READ_DRAIN;
1440 #endif
1441 	if (cap_write_drain(iommu->cap))
1442 		val |= DMA_TLB_WRITE_DRAIN;
1443 
1444 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1445 	/* Note: Only uses first TLB reg currently */
1446 	if (val_iva)
1447 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1448 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1449 
1450 	/* Make sure hardware completes it */
1451 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1452 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1453 
1454 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1455 
1456 	/* check IOTLB invalidation granularity */
1457 	if (DMA_TLB_IAIG(val) == 0)
1458 		pr_err("Flush IOTLB failed\n");
1459 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1460 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1461 			(unsigned long long)DMA_TLB_IIRG(type),
1462 			(unsigned long long)DMA_TLB_IAIG(val));
1463 }
1464 
1465 static struct device_domain_info *
1466 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1467 			 u8 bus, u8 devfn)
1468 {
1469 	struct device_domain_info *info;
1470 
1471 	assert_spin_locked(&device_domain_lock);
1472 
1473 	if (!iommu->qi)
1474 		return NULL;
1475 
1476 	list_for_each_entry(info, &domain->devices, link)
1477 		if (info->iommu == iommu && info->bus == bus &&
1478 		    info->devfn == devfn) {
1479 			if (info->ats_supported && info->dev)
1480 				return info;
1481 			break;
1482 		}
1483 
1484 	return NULL;
1485 }
1486 
1487 static void domain_update_iotlb(struct dmar_domain *domain)
1488 {
1489 	struct device_domain_info *info;
1490 	bool has_iotlb_device = false;
1491 
1492 	assert_spin_locked(&device_domain_lock);
1493 
1494 	list_for_each_entry(info, &domain->devices, link)
1495 		if (info->ats_enabled) {
1496 			has_iotlb_device = true;
1497 			break;
1498 		}
1499 
1500 	if (!has_iotlb_device) {
1501 		struct subdev_domain_info *sinfo;
1502 
1503 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1504 			info = get_domain_info(sinfo->pdev);
1505 			if (info && info->ats_enabled) {
1506 				has_iotlb_device = true;
1507 				break;
1508 			}
1509 		}
1510 	}
1511 
1512 	domain->has_iotlb_device = has_iotlb_device;
1513 }
1514 
1515 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1516 {
1517 	struct pci_dev *pdev;
1518 
1519 	assert_spin_locked(&device_domain_lock);
1520 
1521 	if (!info || !dev_is_pci(info->dev))
1522 		return;
1523 
1524 	pdev = to_pci_dev(info->dev);
1525 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1526 	 * a PFSID to the invalidation desc of a VF so that the IOMMU HW can
1527 	 * gauge queue depth at the PF level. If DIT is not set, PFSID is
1528 	 * treated as reserved and should be set to 0.
1529 	 */
1530 	if (!ecap_dit(info->iommu->ecap))
1531 		info->pfsid = 0;
1532 	else {
1533 		struct pci_dev *pf_pdev;
1534 
1535 		/* pdev will be returned if device is not a vf */
1536 		pf_pdev = pci_physfn(pdev);
1537 		info->pfsid = pci_dev_id(pf_pdev);
1538 	}
1539 
1540 #ifdef CONFIG_INTEL_IOMMU_SVM
1541 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1542 	   the device if you enable PASID support after ATS support is
1543 	   undefined. So always enable PASID support on devices which
1544 	   have it, even if we can't yet know if we're ever going to
1545 	   use it. */
1546 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1547 		info->pasid_enabled = 1;
1548 
1549 	if (info->pri_supported &&
1550 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1551 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1552 		info->pri_enabled = 1;
1553 #endif
1554 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1555 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1556 		info->ats_enabled = 1;
1557 		domain_update_iotlb(info->domain);
1558 		info->ats_qdep = pci_ats_queue_depth(pdev);
1559 	}
1560 }
1561 
1562 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1563 {
1564 	struct pci_dev *pdev;
1565 
1566 	assert_spin_locked(&device_domain_lock);
1567 
1568 	if (!dev_is_pci(info->dev))
1569 		return;
1570 
1571 	pdev = to_pci_dev(info->dev);
1572 
1573 	if (info->ats_enabled) {
1574 		pci_disable_ats(pdev);
1575 		info->ats_enabled = 0;
1576 		domain_update_iotlb(info->domain);
1577 	}
1578 #ifdef CONFIG_INTEL_IOMMU_SVM
1579 	if (info->pri_enabled) {
1580 		pci_disable_pri(pdev);
1581 		info->pri_enabled = 0;
1582 	}
1583 	if (info->pasid_enabled) {
1584 		pci_disable_pasid(pdev);
1585 		info->pasid_enabled = 0;
1586 	}
1587 #endif
1588 }
1589 
1590 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1591 				    u64 addr, unsigned int mask)
1592 {
1593 	u16 sid, qdep;
1594 
1595 	if (!info || !info->ats_enabled)
1596 		return;
1597 
1598 	sid = info->bus << 8 | info->devfn;
1599 	qdep = info->ats_qdep;
1600 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1601 			   qdep, addr, mask);
1602 }
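
/*
 * Illustrative example for the source-id above: a device at bus 0x3a,
 * devfn 0x10 yields sid = 0x3a10, the PCI requester ID carried in the
 * device-IOTLB invalidation descriptor.
 */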
1603 
1604 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1605 				  u64 addr, unsigned mask)
1606 {
1607 	unsigned long flags;
1608 	struct device_domain_info *info;
1609 	struct subdev_domain_info *sinfo;
1610 
1611 	if (!domain->has_iotlb_device)
1612 		return;
1613 
1614 	spin_lock_irqsave(&device_domain_lock, flags);
1615 	list_for_each_entry(info, &domain->devices, link)
1616 		__iommu_flush_dev_iotlb(info, addr, mask);
1617 
1618 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1619 		info = get_domain_info(sinfo->pdev);
1620 		__iommu_flush_dev_iotlb(info, addr, mask);
1621 	}
1622 	spin_unlock_irqrestore(&device_domain_lock, flags);
1623 }
1624 
1625 static void domain_flush_piotlb(struct intel_iommu *iommu,
1626 				struct dmar_domain *domain,
1627 				u64 addr, unsigned long npages, bool ih)
1628 {
1629 	u16 did = domain->iommu_did[iommu->seq_id];
1630 
1631 	if (domain->default_pasid)
1632 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1633 				addr, npages, ih);
1634 
1635 	if (!list_empty(&domain->devices))
1636 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1637 }
1638 
1639 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1640 				  struct dmar_domain *domain,
1641 				  unsigned long pfn, unsigned int pages,
1642 				  int ih, int map)
1643 {
1644 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1645 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1646 	u16 did = domain->iommu_did[iommu->seq_id];
1647 
1648 	BUG_ON(pages == 0);
1649 
1650 	if (ih)
1651 		ih = 1 << 6;
1652 
1653 	if (domain_use_first_level(domain)) {
1654 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1655 	} else {
1656 		/*
1657 		 * Fallback to domain selective flush if no PSI support or
1658 		 * the size is too big. PSI requires page size to be 2 ^ x,
1659 		 * and the base address is naturally aligned to the size.
1660 		 */
1661 		if (!cap_pgsel_inv(iommu->cap) ||
1662 		    mask > cap_max_amask_val(iommu->cap))
1663 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 							DMA_TLB_DSI_FLUSH);
1665 		else
1666 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1667 							DMA_TLB_PSI_FLUSH);
1668 	}
1669 
1670 	/*
1671 	 * In caching mode, changes of pages from non-present to present require
1672 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1673 	 */
1674 	if (!cap_caching_mode(iommu->cap) || !map)
1675 		iommu_flush_dev_iotlb(domain, addr, mask);
1676 }
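
/*
 * Worked example for the PSI math above: flushing 9 pages gives
 * mask = ilog2(__roundup_pow_of_two(9)) = 4, i.e. a naturally aligned
 * 16-page (64KiB) invalidation, and a non-zero @ih is folded into bit 6
 * of the address as the invalidation hint.
 */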
1677 
1678 /* Notification for newly created mappings */
1679 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1680 					struct dmar_domain *domain,
1681 					unsigned long pfn, unsigned int pages)
1682 {
1683 	/*
1684 	 * It's a non-present to present mapping. Only flush if caching mode
1685 	 * and second level.
1686 	 */
1687 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1688 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1689 	else
1690 		iommu_flush_write_buffer(iommu);
1691 }
1692 
1693 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1694 {
1695 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1696 	int idx;
1697 
1698 	for_each_domain_iommu(idx, dmar_domain) {
1699 		struct intel_iommu *iommu = g_iommus[idx];
1700 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1701 
1702 		if (domain_use_first_level(dmar_domain))
1703 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1704 		else
1705 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1706 						 DMA_TLB_DSI_FLUSH);
1707 
1708 		if (!cap_caching_mode(iommu->cap))
1709 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1710 					      0, MAX_AGAW_PFN_WIDTH);
1711 	}
1712 }
1713 
1714 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1715 {
1716 	u32 pmen;
1717 	unsigned long flags;
1718 
1719 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1720 		return;
1721 
1722 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1723 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1724 	pmen &= ~DMA_PMEN_EPM;
1725 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1726 
1727 	/* wait for the protected region status bit to clear */
1728 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1729 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1730 
1731 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1732 }
1733 
1734 static void iommu_enable_translation(struct intel_iommu *iommu)
1735 {
1736 	u32 sts;
1737 	unsigned long flags;
1738 
1739 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1740 	iommu->gcmd |= DMA_GCMD_TE;
1741 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1742 
1743 	/* Make sure hardware completes it */
1744 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1745 		      readl, (sts & DMA_GSTS_TES), sts);
1746 
1747 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1748 }
1749 
1750 static void iommu_disable_translation(struct intel_iommu *iommu)
1751 {
1752 	u32 sts;
1753 	unsigned long flag;
1754 
1755 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1756 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1757 		return;
1758 
1759 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1760 	iommu->gcmd &= ~DMA_GCMD_TE;
1761 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1762 
1763 	/* Make sure hardware completes it */
1764 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1765 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1766 
1767 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1768 }
1769 
1770 static int iommu_init_domains(struct intel_iommu *iommu)
1771 {
1772 	u32 ndomains, nlongs;
1773 	size_t size;
1774 
1775 	ndomains = cap_ndoms(iommu->cap);
1776 	pr_debug("%s: Number of Domains supported <%d>\n",
1777 		 iommu->name, ndomains);
1778 	nlongs = BITS_TO_LONGS(ndomains);
1779 
1780 	spin_lock_init(&iommu->lock);
1781 
1782 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1783 	if (!iommu->domain_ids) {
1784 		pr_err("%s: Allocating domain id array failed\n",
1785 		       iommu->name);
1786 		return -ENOMEM;
1787 	}
1788 
1789 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1790 	iommu->domains = kzalloc(size, GFP_KERNEL);
1791 
1792 	if (iommu->domains) {
1793 		size = 256 * sizeof(struct dmar_domain *);
1794 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1795 	}
1796 
1797 	if (!iommu->domains || !iommu->domains[0]) {
1798 		pr_err("%s: Allocating domain array failed\n",
1799 		       iommu->name);
1800 		kfree(iommu->domain_ids);
1801 		kfree(iommu->domains);
1802 		iommu->domain_ids = NULL;
1803 		iommu->domains    = NULL;
1804 		return -ENOMEM;
1805 	}
1806 
1807 	/*
1808 	 * If Caching mode is set, then invalid translations are tagged
1809 	 * with domain-id 0, hence we need to pre-allocate it. We also
1810 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1811 	 * make sure it is not used for a real domain.
1812 	 */
1813 	set_bit(0, iommu->domain_ids);
1814 
1815 	/*
1816 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1817 	 * entry for first-level or pass-through translation modes should
1818 	 * be programmed with a domain id different from those used for
1819 	 * second-level or nested translation. We reserve a domain id for
1820 	 * this purpose.
1821 	 */
1822 	if (sm_supported(iommu))
1823 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1824 
1825 	return 0;
1826 }
1827 
1828 static void disable_dmar_iommu(struct intel_iommu *iommu)
1829 {
1830 	struct device_domain_info *info, *tmp;
1831 	unsigned long flags;
1832 
1833 	if (!iommu->domains || !iommu->domain_ids)
1834 		return;
1835 
1836 	spin_lock_irqsave(&device_domain_lock, flags);
1837 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1838 		if (info->iommu != iommu)
1839 			continue;
1840 
1841 		if (!info->dev || !info->domain)
1842 			continue;
1843 
1844 		__dmar_remove_one_dev_info(info);
1845 	}
1846 	spin_unlock_irqrestore(&device_domain_lock, flags);
1847 
1848 	if (iommu->gcmd & DMA_GCMD_TE)
1849 		iommu_disable_translation(iommu);
1850 }
1851 
1852 static void free_dmar_iommu(struct intel_iommu *iommu)
1853 {
1854 	if ((iommu->domains) && (iommu->domain_ids)) {
1855 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1856 		int i;
1857 
1858 		for (i = 0; i < elems; i++)
1859 			kfree(iommu->domains[i]);
1860 		kfree(iommu->domains);
1861 		kfree(iommu->domain_ids);
1862 		iommu->domains = NULL;
1863 		iommu->domain_ids = NULL;
1864 	}
1865 
1866 	g_iommus[iommu->seq_id] = NULL;
1867 
1868 	/* free context mapping */
1869 	free_context_table(iommu);
1870 
1871 #ifdef CONFIG_INTEL_IOMMU_SVM
1872 	if (pasid_supported(iommu)) {
1873 		if (ecap_prs(iommu->ecap))
1874 			intel_svm_finish_prq(iommu);
1875 	}
1876 	if (vccap_pasid(iommu->vccap))
1877 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1878 
1879 #endif
1880 }
1881 
1882 /*
1883  * Check and return whether first level is used by default for
1884  * DMA translation.
1885  */
1886 static bool first_level_by_default(void)
1887 {
1888 	return scalable_mode_support() && intel_cap_flts_sanity();
1889 }
1890 
1891 static struct dmar_domain *alloc_domain(int flags)
1892 {
1893 	struct dmar_domain *domain;
1894 
1895 	domain = alloc_domain_mem();
1896 	if (!domain)
1897 		return NULL;
1898 
1899 	memset(domain, 0, sizeof(*domain));
1900 	domain->nid = NUMA_NO_NODE;
1901 	domain->flags = flags;
1902 	if (first_level_by_default())
1903 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1904 	domain->has_iotlb_device = false;
1905 	INIT_LIST_HEAD(&domain->devices);
1906 	INIT_LIST_HEAD(&domain->subdevices);
1907 
1908 	return domain;
1909 }
1910 
1911 /* Must be called with iommu->lock */
1912 static int domain_attach_iommu(struct dmar_domain *domain,
1913 			       struct intel_iommu *iommu)
1914 {
1915 	unsigned long ndomains;
1916 	int num;
1917 
1918 	assert_spin_locked(&device_domain_lock);
1919 	assert_spin_locked(&iommu->lock);
1920 
1921 	domain->iommu_refcnt[iommu->seq_id] += 1;
1922 	domain->iommu_count += 1;
1923 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1924 		ndomains = cap_ndoms(iommu->cap);
1925 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1926 
1927 		if (num >= ndomains) {
1928 			pr_err("%s: No free domain ids\n", iommu->name);
1929 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1930 			domain->iommu_count -= 1;
1931 			return -ENOSPC;
1932 		}
1933 
1934 		set_bit(num, iommu->domain_ids);
1935 		set_iommu_domain(iommu, num, domain);
1936 
1937 		domain->iommu_did[iommu->seq_id] = num;
1938 		domain->nid			 = iommu->node;
1939 
1940 		domain_update_iommu_cap(domain);
1941 	}
1942 
1943 	return 0;
1944 }
1945 
1946 static int domain_detach_iommu(struct dmar_domain *domain,
1947 			       struct intel_iommu *iommu)
1948 {
1949 	int num, count;
1950 
1951 	assert_spin_locked(&device_domain_lock);
1952 	assert_spin_locked(&iommu->lock);
1953 
1954 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1955 	count = --domain->iommu_count;
1956 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1957 		num = domain->iommu_did[iommu->seq_id];
1958 		clear_bit(num, iommu->domain_ids);
1959 		set_iommu_domain(iommu, num, NULL);
1960 
1961 		domain_update_iommu_cap(domain);
1962 		domain->iommu_did[iommu->seq_id] = 0;
1963 	}
1964 
1965 	return count;
1966 }
1967 
1968 static inline int guestwidth_to_adjustwidth(int gaw)
1969 {
1970 	int agaw;
1971 	int r = (gaw - 12) % 9;
1972 
1973 	if (r == 0)
1974 		agaw = gaw;
1975 	else
1976 		agaw = gaw + 9 - r;
1977 	if (agaw > 64)
1978 		agaw = 64;
1979 	return agaw;
1980 }
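
/*
 * Worked example for guestwidth_to_adjustwidth() above (illustrative
 * only): each page-table level resolves 9 bits on top of the 12-bit page
 * offset, so the guest width is rounded up to the next 12 + 9*n value.
 * gaw = 48 gives r = (48 - 12) % 9 = 0 and agaw = 48; gaw = 36 gives
 * r = (36 - 12) % 9 = 6 and agaw = 36 + 9 - 6 = 39.  Results above 64
 * are clamped to 64.
 */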
1981 
1982 static void domain_exit(struct dmar_domain *domain)
1983 {
1984 
1985 	/* Remove associated devices and clear attached or cached domains */
1986 	domain_remove_dev_info(domain);
1987 
1988 	/* destroy iovas */
1989 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1990 		iommu_put_dma_cookie(&domain->domain);
1991 
1992 	if (domain->pgd) {
1993 		struct page *freelist;
1994 
1995 		freelist = domain_unmap(domain, 0,
1996 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1997 		dma_free_pagelist(freelist);
1998 	}
1999 
2000 	free_domain_mem(domain);
2001 }
2002 
2003 /*
2004  * Get the PASID directory size for a scalable mode context entry.
2005  * A value of X in the PDTS field of a scalable mode context entry
2006  * indicates a PASID directory with 2^(X + 7) entries.
2007  */
2008 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2009 {
2010 	int pds, max_pde;
2011 
2012 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2013 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2014 	if (pds < 7)
2015 		return 0;
2016 
2017 	return pds - 7;
2018 }
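
/*
 * Illustrative example for context_get_sm_pds() above (the constants are
 * assumptions, not derived from this file): with 20-bit PASIDs,
 * table->max_pasid = 1 << 20 and, assuming PASID_PDE_SHIFT is 6,
 * max_pde = 1 << 14.  find_first_bit() then returns 14, so pds = 7 and
 * the PDTS encoding describes a directory with 2^(7 + 7) = 16384
 * entries, matching max_pde.
 */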
2019 
2020 /*
2021  * Set the RID_PASID field of a scalable mode context entry. The
2022  * IOMMU hardware will use the PASID value set in this field to
2023  * translate DMA requests that carry no PASID.
2024  */
2025 static inline void
2026 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2027 {
2028 	context->hi |= pasid & ((1 << 20) - 1);
2029 }
2030 
2031 /*
2032  * Set the DTE (Device-TLB Enable) field of a scalable mode context
2033  * entry.
2034  */
2035 static inline void context_set_sm_dte(struct context_entry *context)
2036 {
2037 	context->lo |= (1 << 2);
2038 }
2039 
2040 /*
2041  * Set the PRE (Page Request Enable) field of a scalable mode context
2042  * entry.
2043  */
2044 static inline void context_set_sm_pre(struct context_entry *context)
2045 {
2046 	context->lo |= (1 << 4);
2047 }
2048 
2049 /* Convert value to context PASID directory size field coding. */
2050 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2051 
2052 static int domain_context_mapping_one(struct dmar_domain *domain,
2053 				      struct intel_iommu *iommu,
2054 				      struct pasid_table *table,
2055 				      u8 bus, u8 devfn)
2056 {
2057 	u16 did = domain->iommu_did[iommu->seq_id];
2058 	int translation = CONTEXT_TT_MULTI_LEVEL;
2059 	struct device_domain_info *info = NULL;
2060 	struct context_entry *context;
2061 	unsigned long flags;
2062 	int ret;
2063 
2064 	WARN_ON(did == 0);
2065 
2066 	if (hw_pass_through && domain_type_is_si(domain))
2067 		translation = CONTEXT_TT_PASS_THROUGH;
2068 
2069 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2070 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2071 
2072 	BUG_ON(!domain->pgd);
2073 
2074 	spin_lock_irqsave(&device_domain_lock, flags);
2075 	spin_lock(&iommu->lock);
2076 
2077 	ret = -ENOMEM;
2078 	context = iommu_context_addr(iommu, bus, devfn, 1);
2079 	if (!context)
2080 		goto out_unlock;
2081 
2082 	ret = 0;
2083 	if (context_present(context))
2084 		goto out_unlock;
2085 
2086 	/*
2087 	 * In the kdump case, old valid entries may still be cached because
2088 	 * of in-flight DMA and the copied page tables, but there is no
2089 	 * unmapping path for them, so we need an explicit cache flush for
2090 	 * the newly-mapped device. At this point the device is expected to
2091 	 * have finished its reset at the driver probe stage, so no
2092 	 * in-flight DMA will exist and nothing further needs to be done
2093 	 * here.
2094 	 */
2095 	if (context_copied(context)) {
2096 		u16 did_old = context_domain_id(context);
2097 
2098 		if (did_old < cap_ndoms(iommu->cap)) {
2099 			iommu->flush.flush_context(iommu, did_old,
2100 						   (((u16)bus) << 8) | devfn,
2101 						   DMA_CCMD_MASK_NOBIT,
2102 						   DMA_CCMD_DEVICE_INVL);
2103 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2104 						 DMA_TLB_DSI_FLUSH);
2105 		}
2106 	}
2107 
2108 	context_clear_entry(context);
2109 
2110 	if (sm_supported(iommu)) {
2111 		unsigned long pds;
2112 
2113 		WARN_ON(!table);
2114 
2115 		/* Setup the PASID DIR pointer: */
2116 		pds = context_get_sm_pds(table);
2117 		context->lo = (u64)virt_to_phys(table->table) |
2118 				context_pdts(pds);
2119 
2120 		/* Setup the RID_PASID field: */
2121 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2122 
2123 		/*
2124 		 * Setup the Device-TLB enable bit and Page request
2125 		 * Enable bit:
2126 		 */
2127 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2128 		if (info && info->ats_supported)
2129 			context_set_sm_dte(context);
2130 		if (info && info->pri_supported)
2131 			context_set_sm_pre(context);
2132 	} else {
2133 		struct dma_pte *pgd = domain->pgd;
2134 		int agaw;
2135 
2136 		context_set_domain_id(context, did);
2137 
2138 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2139 			/*
2140 			 * Skip top levels of page tables for an IOMMU with a
2141 			 * smaller agaw than the default. Unnecessary for PT mode.
2142 			 */
2143 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2144 				ret = -ENOMEM;
2145 				pgd = phys_to_virt(dma_pte_addr(pgd));
2146 				if (!dma_pte_present(pgd))
2147 					goto out_unlock;
2148 			}
2149 
2150 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2151 			if (info && info->ats_supported)
2152 				translation = CONTEXT_TT_DEV_IOTLB;
2153 			else
2154 				translation = CONTEXT_TT_MULTI_LEVEL;
2155 
2156 			context_set_address_root(context, virt_to_phys(pgd));
2157 			context_set_address_width(context, agaw);
2158 		} else {
2159 			/*
2160 			 * In pass-through mode, AW must be programmed to
2161 			 * indicate the largest AGAW value supported by the
2162 			 * hardware; ASR is ignored by the hardware.
2163 			 */
2164 			context_set_address_width(context, iommu->msagaw);
2165 		}
2166 
2167 		context_set_translation_type(context, translation);
2168 	}
2169 
2170 	context_set_fault_enable(context);
2171 	context_set_present(context);
2172 	if (!ecap_coherent(iommu->ecap))
2173 		clflush_cache_range(context, sizeof(*context));
2174 
2175 	/*
2176 	 * It's a non-present to present mapping. If the hardware doesn't
2177 	 * cache non-present entries, we only need to flush the write-buffer.
2178 	 * If it _does_ cache non-present entries, then it does so in the
2179 	 * special domain #0, which we have to flush:
2180 	 */
2181 	if (cap_caching_mode(iommu->cap)) {
2182 		iommu->flush.flush_context(iommu, 0,
2183 					   (((u16)bus) << 8) | devfn,
2184 					   DMA_CCMD_MASK_NOBIT,
2185 					   DMA_CCMD_DEVICE_INVL);
2186 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2187 	} else {
2188 		iommu_flush_write_buffer(iommu);
2189 	}
2190 	iommu_enable_dev_iotlb(info);
2191 
2192 	ret = 0;
2193 
2194 out_unlock:
2195 	spin_unlock(&iommu->lock);
2196 	spin_unlock_irqrestore(&device_domain_lock, flags);
2197 
2198 	return ret;
2199 }
2200 
2201 struct domain_context_mapping_data {
2202 	struct dmar_domain *domain;
2203 	struct intel_iommu *iommu;
2204 	struct pasid_table *table;
2205 };
2206 
2207 static int domain_context_mapping_cb(struct pci_dev *pdev,
2208 				     u16 alias, void *opaque)
2209 {
2210 	struct domain_context_mapping_data *data = opaque;
2211 
2212 	return domain_context_mapping_one(data->domain, data->iommu,
2213 					  data->table, PCI_BUS_NUM(alias),
2214 					  alias & 0xff);
2215 }
2216 
2217 static int
2218 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2219 {
2220 	struct domain_context_mapping_data data;
2221 	struct pasid_table *table;
2222 	struct intel_iommu *iommu;
2223 	u8 bus, devfn;
2224 
2225 	iommu = device_to_iommu(dev, &bus, &devfn);
2226 	if (!iommu)
2227 		return -ENODEV;
2228 
2229 	table = intel_pasid_get_table(dev);
2230 
2231 	if (!dev_is_pci(dev))
2232 		return domain_context_mapping_one(domain, iommu, table,
2233 						  bus, devfn);
2234 
2235 	data.domain = domain;
2236 	data.iommu = iommu;
2237 	data.table = table;
2238 
2239 	return pci_for_each_dma_alias(to_pci_dev(dev),
2240 				      &domain_context_mapping_cb, &data);
2241 }
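
/*
 * Note on DMA aliases (illustrative): pci_for_each_dma_alias() invokes
 * the callback for every requester ID the device may present.  For
 * example, a conventional PCI device behind a PCIe-to-PCI bridge issues
 * DMA with the bridge's bus/devfn, so domain_context_mapping() ends up
 * programming a context entry for that alias as well as for the device
 * itself.
 */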
2242 
2243 static int domain_context_mapped_cb(struct pci_dev *pdev,
2244 				    u16 alias, void *opaque)
2245 {
2246 	struct intel_iommu *iommu = opaque;
2247 
2248 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2249 }
2250 
2251 static int domain_context_mapped(struct device *dev)
2252 {
2253 	struct intel_iommu *iommu;
2254 	u8 bus, devfn;
2255 
2256 	iommu = device_to_iommu(dev, &bus, &devfn);
2257 	if (!iommu)
2258 		return -ENODEV;
2259 
2260 	if (!dev_is_pci(dev))
2261 		return device_context_mapped(iommu, bus, devfn);
2262 
2263 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2264 				       domain_context_mapped_cb, iommu);
2265 }
2266 
2267 /* Returns the number of VT-d pages, but aligned to the MM page size */
2268 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2269 					    size_t size)
2270 {
2271 	host_addr &= ~PAGE_MASK;
2272 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2273 }
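
/*
 * Worked example for aligned_nrpages() above (illustrative only): with
 * 4KiB pages, host_addr = 0x1234 and size = 0x3000 give
 * host_addr & ~PAGE_MASK = 0x234 and PAGE_ALIGN(0x234 + 0x3000) = 0x4000,
 * i.e. 4 VT-d pages, one more than the size alone would suggest because
 * the buffer straddles a page boundary.
 */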
2274 
2275 /* Return largest possible superpage level for a given mapping */
2276 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2277 					  unsigned long iov_pfn,
2278 					  unsigned long phy_pfn,
2279 					  unsigned long pages)
2280 {
2281 	int support, level = 1;
2282 	unsigned long pfnmerge;
2283 
2284 	support = domain->iommu_superpage;
2285 
2286 	/* To use a large page, the virtual *and* physical addresses
2287 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2288 	   of them will mean we have to use smaller pages. So just
2289 	   merge them and check both at once. */
2290 	pfnmerge = iov_pfn | phy_pfn;
2291 
2292 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2293 		pages >>= VTD_STRIDE_SHIFT;
2294 		if (!pages)
2295 			break;
2296 		pfnmerge >>= VTD_STRIDE_SHIFT;
2297 		level++;
2298 		support--;
2299 	}
2300 	return level;
2301 }
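
/*
 * Worked example for hardware_largepage_caps() above (illustrative
 * only): with domain->iommu_superpage = 1 (2MiB pages available),
 * iov_pfn and phy_pfn both multiples of 512 and pages >= 512, the low
 * 9 bits of pfnmerge are clear, one loop iteration runs and level 2
 * (2MiB) is returned.  If either PFN has any of its low 9 bits set,
 * the loop body never runs and level 1 (4KiB) is used.
 */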
2302 
2303 /*
2304  * Ensure that old small page tables are removed to make room for superpage(s).
2305  * We're going to add new large pages, so make sure we don't remove their parent
2306  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2307  */
2308 static void switch_to_super_page(struct dmar_domain *domain,
2309 				 unsigned long start_pfn,
2310 				 unsigned long end_pfn, int level)
2311 {
2312 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2313 	struct dma_pte *pte = NULL;
2314 	int i;
2315 
2316 	while (start_pfn <= end_pfn) {
2317 		if (!pte)
2318 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2319 
2320 		if (dma_pte_present(pte)) {
2321 			dma_pte_free_pagetable(domain, start_pfn,
2322 					       start_pfn + lvl_pages - 1,
2323 					       level + 1);
2324 
2325 			for_each_domain_iommu(i, domain)
2326 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2327 						      start_pfn, lvl_pages,
2328 						      0, 0);
2329 		}
2330 
2331 		pte++;
2332 		start_pfn += lvl_pages;
2333 		if (first_pte_in_page(pte))
2334 			pte = NULL;
2335 	}
2336 }
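
/*
 * Illustrative walk-through of switch_to_super_page() above: for
 * level 2, lvl_to_nr_pages() yields 512, so each loop iteration covers
 * one 2MiB-aligned chunk.  If an entry for that chunk is already
 * present, the page-table pages beneath it are freed and the IOTLB is
 * flushed for those PFNs, so the large PTE written afterwards cannot
 * coexist with stale small-page mappings.
 */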
2337 
2338 static int
2339 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2340 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2341 {
2342 	unsigned int largepage_lvl = 0;
2343 	unsigned long lvl_pages = 0;
2344 	struct dma_pte *pte = NULL;
2345 	phys_addr_t pteval;
2346 	u64 attr;
2347 
2348 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2349 
2350 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2351 		return -EINVAL;
2352 
2353 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2354 	attr |= DMA_FL_PTE_PRESENT;
2355 	if (domain_use_first_level(domain)) {
2356 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2357 
2358 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2359 			attr |= DMA_FL_PTE_ACCESS;
2360 			if (prot & DMA_PTE_WRITE)
2361 				attr |= DMA_FL_PTE_DIRTY;
2362 		}
2363 	}
2364 
2365 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2366 
2367 	while (nr_pages > 0) {
2368 		uint64_t tmp;
2369 
2370 		if (!pte) {
2371 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2372 					phys_pfn, nr_pages);
2373 
2374 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2375 			if (!pte)
2376 				return -ENOMEM;
2377 			/* It is a large page */
2378 			if (largepage_lvl > 1) {
2379 				unsigned long end_pfn;
2380 
2381 				pteval |= DMA_PTE_LARGE_PAGE;
2382 				end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2383 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2384 			} else {
2385 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2386 			}
2387 
2388 		}
2389 		/* We don't need a lock here; nobody else
2390 		 * touches this IOVA range.
2391 		 */
2392 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2393 		if (tmp) {
2394 			static int dumps = 5;
2395 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2396 				iov_pfn, tmp, (unsigned long long)pteval);
2397 			if (dumps) {
2398 				dumps--;
2399 				debug_dma_dump_mappings(NULL);
2400 			}
2401 			WARN_ON(1);
2402 		}
2403 
2404 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2405 
2406 		BUG_ON(nr_pages < lvl_pages);
2407 
2408 		nr_pages -= lvl_pages;
2409 		iov_pfn += lvl_pages;
2410 		phys_pfn += lvl_pages;
2411 		pteval += lvl_pages * VTD_PAGE_SIZE;
2412 
2413 		/* If the next PTE would be the first in a new page, then we
2414 		 * need to flush the cache on the entries we've just written.
2415 		 * And then we'll need to recalculate 'pte', so clear it and
2416 		 * let it get set again in the if (!pte) block above.
2417 		 *
2418 		 * If we're done (!nr_pages) we need to flush the cache too.
2419 		 *
2420 		 * Also if we've been setting superpages, we may need to
2421 		 * recalculate 'pte' and switch back to smaller pages for the
2422 		 * end of the mapping, if the trailing size is not enough to
2423 		 * use another superpage (i.e. nr_pages < lvl_pages).
2424 		 *
2425 		 * We leave clflush for the leaf pte changes to iotlb_sync_map()
2426 		 * callback.
2427 		 */
2428 		pte++;
2429 		if (!nr_pages || first_pte_in_page(pte) ||
2430 		    (largepage_lvl > 1 && nr_pages < lvl_pages))
2431 			pte = NULL;
2432 	}
2433 
2434 	return 0;
2435 }
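
/*
 * Worked example for __domain_mapping() above (illustrative only):
 * mapping nr_pages = 1024 at 2MiB-aligned iov_pfn/phys_pfn with
 * iommu_superpage >= 1 selects largepage_lvl = 2, so two iterations
 * each install one 2MiB PTE and advance iov_pfn, phys_pfn and pteval
 * by lvl_pages = 512 pages (512 * VTD_PAGE_SIZE bytes).  An unaligned
 * or short tail falls back to 4KiB PTEs on a later iteration.
 */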
2436 
2437 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2438 {
2439 	unsigned long flags;
2440 	struct context_entry *context;
2441 	u16 did_old;
2442 
2443 	if (!iommu)
2444 		return;
2445 
2446 	spin_lock_irqsave(&iommu->lock, flags);
2447 	context = iommu_context_addr(iommu, bus, devfn, 0);
2448 	if (!context) {
2449 		spin_unlock_irqrestore(&iommu->lock, flags);
2450 		return;
2451 	}
2452 	did_old = context_domain_id(context);
2453 	context_clear_entry(context);
2454 	__iommu_flush_cache(iommu, context, sizeof(*context));
2455 	spin_unlock_irqrestore(&iommu->lock, flags);
2456 	iommu->flush.flush_context(iommu,
2457 				   did_old,
2458 				   (((u16)bus) << 8) | devfn,
2459 				   DMA_CCMD_MASK_NOBIT,
2460 				   DMA_CCMD_DEVICE_INVL);
2461 
2462 	if (sm_supported(iommu))
2463 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2464 
2465 	iommu->flush.flush_iotlb(iommu,
2466 				 did_old,
2467 				 0,
2468 				 0,
2469 				 DMA_TLB_DSI_FLUSH);
2470 }
2471 
2472 static inline void unlink_domain_info(struct device_domain_info *info)
2473 {
2474 	assert_spin_locked(&device_domain_lock);
2475 	list_del(&info->link);
2476 	list_del(&info->global);
2477 	if (info->dev)
2478 		dev_iommu_priv_set(info->dev, NULL);
2479 }
2480 
2481 static void domain_remove_dev_info(struct dmar_domain *domain)
2482 {
2483 	struct device_domain_info *info, *tmp;
2484 	unsigned long flags;
2485 
2486 	spin_lock_irqsave(&device_domain_lock, flags);
2487 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2488 		__dmar_remove_one_dev_info(info);
2489 	spin_unlock_irqrestore(&device_domain_lock, flags);
2490 }
2491 
2492 struct dmar_domain *find_domain(struct device *dev)
2493 {
2494 	struct device_domain_info *info;
2495 
2496 	if (unlikely(!dev || !dev->iommu))
2497 		return NULL;
2498 
2499 	if (unlikely(attach_deferred(dev)))
2500 		return NULL;
2501 
2502 	/* No lock here; we assume no domain exits in the normal case */
2503 	info = get_domain_info(dev);
2504 	if (likely(info))
2505 		return info->domain;
2506 
2507 	return NULL;
2508 }
2509 
2510 static inline struct device_domain_info *
2511 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2512 {
2513 	struct device_domain_info *info;
2514 
2515 	list_for_each_entry(info, &device_domain_list, global)
2516 		if (info->segment == segment && info->bus == bus &&
2517 		    info->devfn == devfn)
2518 			return info;
2519 
2520 	return NULL;
2521 }
2522 
2523 static int domain_setup_first_level(struct intel_iommu *iommu,
2524 				    struct dmar_domain *domain,
2525 				    struct device *dev,
2526 				    u32 pasid)
2527 {
2528 	struct dma_pte *pgd = domain->pgd;
2529 	int agaw, level;
2530 	int flags = 0;
2531 
2532 	/*
2533 	 * Skip top levels of page tables for iommu which has
2534 	 * less agaw than default. Unnecessary for PT mode.
2535 	 */
2536 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2537 		pgd = phys_to_virt(dma_pte_addr(pgd));
2538 		if (!dma_pte_present(pgd))
2539 			return -ENOMEM;
2540 	}
2541 
2542 	level = agaw_to_level(agaw);
2543 	if (level != 4 && level != 5)
2544 		return -EINVAL;
2545 
2546 	if (pasid != PASID_RID2PASID)
2547 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2548 	if (level == 5)
2549 		flags |= PASID_FLAG_FL5LP;
2550 
2551 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2552 		flags |= PASID_FLAG_PAGE_SNOOP;
2553 
2554 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2555 					     domain->iommu_did[iommu->seq_id],
2556 					     flags);
2557 }
2558 
2559 static bool dev_is_real_dma_subdevice(struct device *dev)
2560 {
2561 	return dev && dev_is_pci(dev) &&
2562 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2563 }
2564 
2565 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2566 						    int bus, int devfn,
2567 						    struct device *dev,
2568 						    struct dmar_domain *domain)
2569 {
2570 	struct dmar_domain *found = NULL;
2571 	struct device_domain_info *info;
2572 	unsigned long flags;
2573 	int ret;
2574 
2575 	info = alloc_devinfo_mem();
2576 	if (!info)
2577 		return NULL;
2578 
2579 	if (!dev_is_real_dma_subdevice(dev)) {
2580 		info->bus = bus;
2581 		info->devfn = devfn;
2582 		info->segment = iommu->segment;
2583 	} else {
2584 		struct pci_dev *pdev = to_pci_dev(dev);
2585 
2586 		info->bus = pdev->bus->number;
2587 		info->devfn = pdev->devfn;
2588 		info->segment = pci_domain_nr(pdev->bus);
2589 	}
2590 
2591 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2592 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2593 	info->ats_qdep = 0;
2594 	info->dev = dev;
2595 	info->domain = domain;
2596 	info->iommu = iommu;
2597 	info->pasid_table = NULL;
2598 	info->auxd_enabled = 0;
2599 	INIT_LIST_HEAD(&info->subdevices);
2600 
2601 	if (dev && dev_is_pci(dev)) {
2602 		struct pci_dev *pdev = to_pci_dev(info->dev);
2603 
2604 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2605 		    pci_ats_supported(pdev) &&
2606 		    dmar_find_matched_atsr_unit(pdev))
2607 			info->ats_supported = 1;
2608 
2609 		if (sm_supported(iommu)) {
2610 			if (pasid_supported(iommu)) {
2611 				int features = pci_pasid_features(pdev);
2612 				if (features >= 0)
2613 					info->pasid_supported = features | 1;
2614 			}
2615 
2616 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2617 			    pci_pri_supported(pdev))
2618 				info->pri_supported = 1;
2619 		}
2620 	}
2621 
2622 	spin_lock_irqsave(&device_domain_lock, flags);
2623 	if (dev)
2624 		found = find_domain(dev);
2625 
2626 	if (!found) {
2627 		struct device_domain_info *info2;
2628 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2629 						       info->devfn);
2630 		if (info2) {
2631 			found      = info2->domain;
2632 			info2->dev = dev;
2633 		}
2634 	}
2635 
2636 	if (found) {
2637 		spin_unlock_irqrestore(&device_domain_lock, flags);
2638 		free_devinfo_mem(info);
2639 		/* Caller must free the original domain */
2640 		return found;
2641 	}
2642 
2643 	spin_lock(&iommu->lock);
2644 	ret = domain_attach_iommu(domain, iommu);
2645 	spin_unlock(&iommu->lock);
2646 
2647 	if (ret) {
2648 		spin_unlock_irqrestore(&device_domain_lock, flags);
2649 		free_devinfo_mem(info);
2650 		return NULL;
2651 	}
2652 
2653 	list_add(&info->link, &domain->devices);
2654 	list_add(&info->global, &device_domain_list);
2655 	if (dev)
2656 		dev_iommu_priv_set(dev, info);
2657 	spin_unlock_irqrestore(&device_domain_lock, flags);
2658 
2659 	/* PASID table is mandatory for a PCI device in scalable mode. */
2660 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2661 		ret = intel_pasid_alloc_table(dev);
2662 		if (ret) {
2663 			dev_err(dev, "PASID table allocation failed\n");
2664 			dmar_remove_one_dev_info(dev);
2665 			return NULL;
2666 		}
2667 
2668 		/* Setup the PASID entry for requests without PASID: */
2669 		spin_lock_irqsave(&iommu->lock, flags);
2670 		if (hw_pass_through && domain_type_is_si(domain))
2671 			ret = intel_pasid_setup_pass_through(iommu, domain,
2672 					dev, PASID_RID2PASID);
2673 		else if (domain_use_first_level(domain))
2674 			ret = domain_setup_first_level(iommu, domain, dev,
2675 					PASID_RID2PASID);
2676 		else
2677 			ret = intel_pasid_setup_second_level(iommu, domain,
2678 					dev, PASID_RID2PASID);
2679 		spin_unlock_irqrestore(&iommu->lock, flags);
2680 		if (ret) {
2681 			dev_err(dev, "Setup RID2PASID failed\n");
2682 			dmar_remove_one_dev_info(dev);
2683 			return NULL;
2684 		}
2685 	}
2686 
2687 	if (dev && domain_context_mapping(domain, dev)) {
2688 		dev_err(dev, "Domain context map failed\n");
2689 		dmar_remove_one_dev_info(dev);
2690 		return NULL;
2691 	}
2692 
2693 	return domain;
2694 }
2695 
2696 static int iommu_domain_identity_map(struct dmar_domain *domain,
2697 				     unsigned long first_vpfn,
2698 				     unsigned long last_vpfn)
2699 {
2700 	/*
2701 	 * The RMRR range might overlap with a physical memory range,
2702 	 * so clear it first.
2703 	 */
2704 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2705 
2706 	return __domain_mapping(domain, first_vpfn,
2707 				first_vpfn, last_vpfn - first_vpfn + 1,
2708 				DMA_PTE_READ|DMA_PTE_WRITE);
2709 }
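
/*
 * Illustrative note: iommu_domain_identity_map() installs a 1:1
 * mapping, i.e. the IOVA page frame equals the physical page frame.
 * Identity-mapping 0x1000-0x4fff, for instance, passes first_vpfn = 1
 * and last_vpfn = 4, after which IOVA 0x2000 translates to physical
 * address 0x2000.
 */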
2710 
2711 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2712 
2713 static int __init si_domain_init(int hw)
2714 {
2715 	struct dmar_rmrr_unit *rmrr;
2716 	struct device *dev;
2717 	int i, nid, ret;
2718 
2719 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2720 	if (!si_domain)
2721 		return -EFAULT;
2722 
2723 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2724 		domain_exit(si_domain);
2725 		return -EFAULT;
2726 	}
2727 
2728 	if (hw)
2729 		return 0;
2730 
2731 	for_each_online_node(nid) {
2732 		unsigned long start_pfn, end_pfn;
2733 		int i;
2734 
2735 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2736 			ret = iommu_domain_identity_map(si_domain,
2737 					mm_to_dma_pfn(start_pfn),
2738 					mm_to_dma_pfn(end_pfn));
2739 			if (ret)
2740 				return ret;
2741 		}
2742 	}
2743 
2744 	/*
2745 	 * Identity map the RMRRs so that devices with RMRRs can also use
2746 	 * the si_domain.
2747 	 */
2748 	for_each_rmrr_units(rmrr) {
2749 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2750 					  i, dev) {
2751 			unsigned long long start = rmrr->base_address;
2752 			unsigned long long end = rmrr->end_address;
2753 
2754 			if (WARN_ON(end < start ||
2755 				    end >> agaw_to_width(si_domain->agaw)))
2756 				continue;
2757 
2758 			ret = iommu_domain_identity_map(si_domain,
2759 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2760 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2761 			if (ret)
2762 				return ret;
2763 		}
2764 	}
2765 
2766 	return 0;
2767 }
2768 
2769 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2770 {
2771 	struct dmar_domain *ndomain;
2772 	struct intel_iommu *iommu;
2773 	u8 bus, devfn;
2774 
2775 	iommu = device_to_iommu(dev, &bus, &devfn);
2776 	if (!iommu)
2777 		return -ENODEV;
2778 
2779 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2780 	if (ndomain != domain)
2781 		return -EBUSY;
2782 
2783 	return 0;
2784 }
2785 
2786 static bool device_has_rmrr(struct device *dev)
2787 {
2788 	struct dmar_rmrr_unit *rmrr;
2789 	struct device *tmp;
2790 	int i;
2791 
2792 	rcu_read_lock();
2793 	for_each_rmrr_units(rmrr) {
2794 		/*
2795 		 * Return TRUE if this RMRR contains the device that
2796 		 * is passed in.
2797 		 */
2798 		for_each_active_dev_scope(rmrr->devices,
2799 					  rmrr->devices_cnt, i, tmp)
2800 			if (tmp == dev ||
2801 			    is_downstream_to_pci_bridge(dev, tmp)) {
2802 				rcu_read_unlock();
2803 				return true;
2804 			}
2805 	}
2806 	rcu_read_unlock();
2807 	return false;
2808 }
2809 
2810 /**
2811  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2812  * is relaxable (i.e. may be left unenforced under some conditions)
2813  * @dev: device handle
2814  *
2815  * We assume that PCI USB devices with RMRRs have them largely
2816  * for historical reasons and that the RMRR space is not actively used post
2817  * boot.  This exclusion may change if vendors begin to abuse it.
2818  *
2819  * The same exception is made for graphics devices, with the requirement that
2820  * any use of the RMRR regions will be torn down before assigning the device
2821  * to a guest.
2822  *
2823  * Return: true if the RMRR is relaxable, false otherwise
2824  */
2825 static bool device_rmrr_is_relaxable(struct device *dev)
2826 {
2827 	struct pci_dev *pdev;
2828 
2829 	if (!dev_is_pci(dev))
2830 		return false;
2831 
2832 	pdev = to_pci_dev(dev);
2833 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2834 		return true;
2835 	else
2836 		return false;
2837 }
2838 
2839 /*
2840  * There are a couple of cases where we need to restrict the functionality of
2841  * devices associated with RMRRs.  The first is when evaluating a device for
2842  * identity mapping because problems exist when devices are moved in and out
2843  * of domains and their respective RMRR information is lost.  This means that
2844  * a device with associated RMRRs will never be in a "passthrough" domain.
2845  * The second is use of the device through the IOMMU API.  This interface
2846  * expects to have full control of the IOVA space for the device.  We cannot
2847  * satisfy both the requirement that RMRR access is maintained and have an
2848  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2849  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2850  * We therefore prevent devices associated with an RMRR from participating in
2851  * the IOMMU API, which eliminates them from device assignment.
2852  *
2853  * In both cases, devices which have relaxable RMRRs are not concerned by this
2854  * restriction. See device_rmrr_is_relaxable comment.
2855  */
2856 static bool device_is_rmrr_locked(struct device *dev)
2857 {
2858 	if (!device_has_rmrr(dev))
2859 		return false;
2860 
2861 	if (device_rmrr_is_relaxable(dev))
2862 		return false;
2863 
2864 	return true;
2865 }
2866 
2867 /*
2868  * Return the required default domain type for a specific device.
2869  *
2870  * @dev: the device in question
2871  *
2872  * Returns:
2873  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2874  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2876  *  - 0: both identity and dynamic domains work for this device
2877  */
2878 static int device_def_domain_type(struct device *dev)
2879 {
2880 	if (dev_is_pci(dev)) {
2881 		struct pci_dev *pdev = to_pci_dev(dev);
2882 
2883 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2884 			return IOMMU_DOMAIN_IDENTITY;
2885 
2886 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2887 			return IOMMU_DOMAIN_IDENTITY;
2888 	}
2889 
2890 	return 0;
2891 }
2892 
2893 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2894 {
2895 	/*
2896 	 * Start from a sane IOMMU hardware state.
2897 	 * If queued invalidation was already initialized by us
2898 	 * (for example, while enabling interrupt remapping) then
2899 	 * things are already rolling from a sane state.
2900 	 */
2901 	if (!iommu->qi) {
2902 		/*
2903 		 * Clear any previous faults.
2904 		 */
2905 		dmar_fault(-1, iommu);
2906 		/*
2907 		 * Disable queued invalidation if supported and already enabled
2908 		 * before OS handover.
2909 		 */
2910 		dmar_disable_qi(iommu);
2911 	}
2912 
2913 	if (dmar_enable_qi(iommu)) {
2914 		/*
2915 		 * Queued invalidation is not enabled, use register-based invalidation
2916 		 */
2917 		iommu->flush.flush_context = __iommu_flush_context;
2918 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2919 		pr_info("%s: Using Register based invalidation\n",
2920 			iommu->name);
2921 	} else {
2922 		iommu->flush.flush_context = qi_flush_context;
2923 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2924 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2925 	}
2926 }
2927 
2928 static int copy_context_table(struct intel_iommu *iommu,
2929 			      struct root_entry *old_re,
2930 			      struct context_entry **tbl,
2931 			      int bus, bool ext)
2932 {
2933 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2934 	struct context_entry *new_ce = NULL, ce;
2935 	struct context_entry *old_ce = NULL;
2936 	struct root_entry re;
2937 	phys_addr_t old_ce_phys;
2938 
2939 	tbl_idx = ext ? bus * 2 : bus;
2940 	memcpy(&re, old_re, sizeof(re));
2941 
2942 	for (devfn = 0; devfn < 256; devfn++) {
2943 		/* First calculate the correct index */
2944 		idx = (ext ? devfn * 2 : devfn) % 256;
2945 
2946 		if (idx == 0) {
2947 			/* First save what we may have and clean up */
2948 			if (new_ce) {
2949 				tbl[tbl_idx] = new_ce;
2950 				__iommu_flush_cache(iommu, new_ce,
2951 						    VTD_PAGE_SIZE);
2952 				pos = 1;
2953 			}
2954 
2955 			if (old_ce)
2956 				memunmap(old_ce);
2957 
2958 			ret = 0;
2959 			if (devfn < 0x80)
2960 				old_ce_phys = root_entry_lctp(&re);
2961 			else
2962 				old_ce_phys = root_entry_uctp(&re);
2963 
2964 			if (!old_ce_phys) {
2965 				if (ext && devfn == 0) {
2966 					/* No LCTP, try UCTP */
2967 					devfn = 0x7f;
2968 					continue;
2969 				} else {
2970 					goto out;
2971 				}
2972 			}
2973 
2974 			ret = -ENOMEM;
2975 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2976 					MEMREMAP_WB);
2977 			if (!old_ce)
2978 				goto out;
2979 
2980 			new_ce = alloc_pgtable_page(iommu->node);
2981 			if (!new_ce)
2982 				goto out_unmap;
2983 
2984 			ret = 0;
2985 		}
2986 
2987 		/* Now copy the context entry */
2988 		memcpy(&ce, old_ce + idx, sizeof(ce));
2989 
2990 		if (!__context_present(&ce))
2991 			continue;
2992 
2993 		did = context_domain_id(&ce);
2994 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2995 			set_bit(did, iommu->domain_ids);
2996 
2997 		/*
2998 		 * We need a marker for copied context entries. This
2999 		 * marker needs to work for the old format as well as
3000 		 * for extended context entries.
3001 		 *
3002 		 * Bit 67 of the context entry is used. In the old
3003 		 * format this bit is available to software, in the
3004 		 * extended format it is the PGE bit, but PGE is ignored
3005 		 * by HW if PASIDs are disabled (and thus still
3006 		 * available).
3007 		 *
3008 		 * So disable PASIDs first and then mark the entry
3009 		 * copied. This means that we don't copy PASID
3010 		 * translations from the old kernel, but this is fine as
3011 		 * faults there are not fatal.
3012 		 */
3013 		context_clear_pasid_enable(&ce);
3014 		context_set_copied(&ce);
3015 
3016 		new_ce[idx] = ce;
3017 	}
3018 
3019 	tbl[tbl_idx + pos] = new_ce;
3020 
3021 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3022 
3023 out_unmap:
3024 	memunmap(old_ce);
3025 
3026 out:
3027 	return ret;
3028 }
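
/*
 * Layout note for copy_context_table() above (illustrative): extended
 * (ECS) context entries are twice the size of legacy ones, so a 4KiB
 * context table covers only 128 of a bus's 256 devfns.  Hence tbl_idx
 * is bus * 2, the entry index is devfn * 2 modulo 256, and devfns at
 * or above 0x80 are reached through the upper context-table pointer
 * (UCTP) rather than the lower one (LCTP).
 */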
3029 
3030 static int copy_translation_tables(struct intel_iommu *iommu)
3031 {
3032 	struct context_entry **ctxt_tbls;
3033 	struct root_entry *old_rt;
3034 	phys_addr_t old_rt_phys;
3035 	int ctxt_table_entries;
3036 	unsigned long flags;
3037 	u64 rtaddr_reg;
3038 	int bus, ret;
3039 	bool new_ext, ext;
3040 
3041 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3042 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3043 	new_ext    = !!ecap_ecs(iommu->ecap);
3044 
3045 	/*
3046 	 * The RTT bit can only be changed when translation is disabled,
3047 	 * but disabling translation would open a window for data
3048 	 * corruption. So bail out and don't copy anything if we would
3049 	 * have to change the bit.
3050 	 */
3051 	if (new_ext != ext)
3052 		return -EINVAL;
3053 
3054 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3055 	if (!old_rt_phys)
3056 		return -EINVAL;
3057 
3058 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3059 	if (!old_rt)
3060 		return -ENOMEM;
3061 
3062 	/* This is too big for the stack - allocate it from slab */
3063 	ctxt_table_entries = ext ? 512 : 256;
3064 	ret = -ENOMEM;
3065 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3066 	if (!ctxt_tbls)
3067 		goto out_unmap;
3068 
3069 	for (bus = 0; bus < 256; bus++) {
3070 		ret = copy_context_table(iommu, &old_rt[bus],
3071 					 ctxt_tbls, bus, ext);
3072 		if (ret) {
3073 			pr_err("%s: Failed to copy context table for bus %d\n",
3074 				iommu->name, bus);
3075 			continue;
3076 		}
3077 	}
3078 
3079 	spin_lock_irqsave(&iommu->lock, flags);
3080 
3081 	/* Context tables are copied, now write them to the root_entry table */
3082 	for (bus = 0; bus < 256; bus++) {
3083 		int idx = ext ? bus * 2 : bus;
3084 		u64 val;
3085 
3086 		if (ctxt_tbls[idx]) {
3087 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3088 			iommu->root_entry[bus].lo = val;
3089 		}
3090 
3091 		if (!ext || !ctxt_tbls[idx + 1])
3092 			continue;
3093 
3094 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3095 		iommu->root_entry[bus].hi = val;
3096 	}
3097 
3098 	spin_unlock_irqrestore(&iommu->lock, flags);
3099 
3100 	kfree(ctxt_tbls);
3101 
3102 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3103 
3104 	ret = 0;
3105 
3106 out_unmap:
3107 	memunmap(old_rt);
3108 
3109 	return ret;
3110 }
3111 
3112 #ifdef CONFIG_INTEL_IOMMU_SVM
3113 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3114 {
3115 	struct intel_iommu *iommu = data;
3116 	ioasid_t ioasid;
3117 
3118 	if (!iommu)
3119 		return INVALID_IOASID;
3120 	/*
3121 	 * VT-d virtual command interface always uses the full 20 bit
3122 	 * PASID range. The host can partition the guest PASID range based
3123 	 * on policy, but this is outside the guest's control.
3124 	 */
3125 	if (min < PASID_MIN || max > intel_pasid_max_id)
3126 		return INVALID_IOASID;
3127 
3128 	if (vcmd_alloc_pasid(iommu, &ioasid))
3129 		return INVALID_IOASID;
3130 
3131 	return ioasid;
3132 }
3133 
3134 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3135 {
3136 	struct intel_iommu *iommu = data;
3137 
3138 	if (!iommu)
3139 		return;
3140 	/*
3141 	 * The sanity check of the ioasid owner is done at the upper layer,
3142 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3143 	 */
3144 	if (ioasid_find(NULL, ioasid, NULL)) {
3145 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3146 		return;
3147 	}
3148 	vcmd_free_pasid(iommu, ioasid);
3149 }
3150 
3151 static void register_pasid_allocator(struct intel_iommu *iommu)
3152 {
3153 	/*
3154 	 * If we are running in the host, there is no need for a custom
3155 	 * allocator, since PASIDs are allocated system-wide by the host.
3156 	 */
3157 	if (!cap_caching_mode(iommu->cap))
3158 		return;
3159 
3160 	if (!sm_supported(iommu)) {
3161 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3162 		return;
3163 	}
3164 
3165 	/*
3166 	 * Register a custom PASID allocator if we are running in a guest;
3167 	 * guest PASIDs must be obtained via the virtual command interface.
3168 	 * There can be multiple vIOMMUs in each guest but only one allocator
3169 	 * is active. All vIOMMU allocators will eventually call the same
3170 	 * host allocator.
3171 	 */
3172 	if (!vccap_pasid(iommu->vccap))
3173 		return;
3174 
3175 	pr_info("Register custom PASID allocator\n");
3176 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3177 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3178 	iommu->pasid_allocator.pdata = (void *)iommu;
3179 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3180 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3181 		/*
3182 		 * Disable scalable mode on this IOMMU if there
3183 		 * is no custom allocator. Mixing SM-capable vIOMMUs
3184 		 * and non-SM vIOMMUs is not supported.
3185 		 */
3186 		intel_iommu_sm = 0;
3187 	}
3188 }
3189 #endif
3190 
3191 static int __init init_dmars(void)
3192 {
3193 	struct dmar_drhd_unit *drhd;
3194 	struct intel_iommu *iommu;
3195 	int ret;
3196 
3197 	/*
3198 	 * for each drhd
3199 	 *    allocate root
3200 	 *    initialize and program root entry to not present
3201 	 * endfor
3202 	 */
3203 	for_each_drhd_unit(drhd) {
3204 		/*
3205 		 * No lock is needed as this is only incremented in the
3206 		 * single-threaded kernel __init code path; all other
3207 		 * accesses are read only.
3208 		 */
3209 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3210 			g_num_of_iommus++;
3211 			continue;
3212 		}
3213 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3214 	}
3215 
3216 	/* Preallocate enough resources for IOMMU hot-addition */
3217 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3218 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3219 
3220 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3221 			GFP_KERNEL);
3222 	if (!g_iommus) {
3223 		pr_err("Allocating global iommu array failed\n");
3224 		ret = -ENOMEM;
3225 		goto error;
3226 	}
3227 
3228 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3229 	if (ret)
3230 		goto free_iommu;
3231 
3232 	for_each_iommu(iommu, drhd) {
3233 		if (drhd->ignored) {
3234 			iommu_disable_translation(iommu);
3235 			continue;
3236 		}
3237 
3238 		/*
3239 		 * Find the max PASID size of all IOMMUs in the system.
3240 		 * We need to ensure the system PASID table is no bigger
3241 		 * than the smallest supported size.
3242 		 */
3243 		if (pasid_supported(iommu)) {
3244 			u32 temp = 2 << ecap_pss(iommu->ecap);
3245 
3246 			intel_pasid_max_id = min_t(u32, temp,
3247 						   intel_pasid_max_id);
3248 		}
3249 
3250 		g_iommus[iommu->seq_id] = iommu;
3251 
3252 		intel_iommu_init_qi(iommu);
3253 
3254 		ret = iommu_init_domains(iommu);
3255 		if (ret)
3256 			goto free_iommu;
3257 
3258 		init_translation_status(iommu);
3259 
3260 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3261 			iommu_disable_translation(iommu);
3262 			clear_translation_pre_enabled(iommu);
3263 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3264 				iommu->name);
3265 		}
3266 
3267 		/*
3268 		 * TBD:
3269 		 * we could share the same root & context tables
3270 		 * among all IOMMUs. This needs to be split out later.
3271 		 */
3272 		ret = iommu_alloc_root_entry(iommu);
3273 		if (ret)
3274 			goto free_iommu;
3275 
3276 		if (translation_pre_enabled(iommu)) {
3277 			pr_info("Translation already enabled - trying to copy translation structures\n");
3278 
3279 			ret = copy_translation_tables(iommu);
3280 			if (ret) {
3281 				/*
3282 				 * We found the IOMMU with translation
3283 				 * enabled - but failed to copy over the
3284 				 * old root-entry table. Try to proceed
3285 				 * by disabling translation now and
3286 				 * allocating a clean root-entry table.
3287 				 * This might cause DMAR faults, but
3288 				 * probably the dump will still succeed.
3289 				 */
3290 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3291 				       iommu->name);
3292 				iommu_disable_translation(iommu);
3293 				clear_translation_pre_enabled(iommu);
3294 			} else {
3295 				pr_info("Copied translation tables from previous kernel for %s\n",
3296 					iommu->name);
3297 			}
3298 		}
3299 
3300 		if (!ecap_pass_through(iommu->ecap))
3301 			hw_pass_through = 0;
3302 		intel_svm_check(iommu);
3303 	}
3304 
3305 	/*
3306 	 * Now that qi is enabled on all iommus, set the root entry and flush
3307 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3308 	 * flush_context function will loop forever and the boot hangs.
3309 	 */
3310 	for_each_active_iommu(iommu, drhd) {
3311 		iommu_flush_write_buffer(iommu);
3312 #ifdef CONFIG_INTEL_IOMMU_SVM
3313 		register_pasid_allocator(iommu);
3314 #endif
3315 		iommu_set_root_entry(iommu);
3316 	}
3317 
3318 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3319 	dmar_map_gfx = 0;
3320 #endif
3321 
3322 	if (!dmar_map_gfx)
3323 		iommu_identity_mapping |= IDENTMAP_GFX;
3324 
3325 	check_tylersburg_isoch();
3326 
3327 	ret = si_domain_init(hw_pass_through);
3328 	if (ret)
3329 		goto free_iommu;
3330 
3331 	/*
3332 	 * for each drhd
3333 	 *   enable fault log
3334 	 *   global invalidate context cache
3335 	 *   global invalidate iotlb
3336 	 *   enable translation
3337 	 */
3338 	for_each_iommu(iommu, drhd) {
3339 		if (drhd->ignored) {
3340 			/*
3341 			 * we always have to disable PMRs or DMA may fail on
3342 			 * this device
3343 			 */
3344 			if (force_on)
3345 				iommu_disable_protect_mem_regions(iommu);
3346 			continue;
3347 		}
3348 
3349 		iommu_flush_write_buffer(iommu);
3350 
3351 #ifdef CONFIG_INTEL_IOMMU_SVM
3352 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3353 			/*
3354 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3355 			 * could cause a lock race condition.
3356 			 */
3357 			up_write(&dmar_global_lock);
3358 			ret = intel_svm_enable_prq(iommu);
3359 			down_write(&dmar_global_lock);
3360 			if (ret)
3361 				goto free_iommu;
3362 		}
3363 #endif
3364 		ret = dmar_set_interrupt(iommu);
3365 		if (ret)
3366 			goto free_iommu;
3367 	}
3368 
3369 	return 0;
3370 
3371 free_iommu:
3372 	for_each_active_iommu(iommu, drhd) {
3373 		disable_dmar_iommu(iommu);
3374 		free_dmar_iommu(iommu);
3375 	}
3376 
3377 	kfree(g_iommus);
3378 
3379 error:
3380 	return ret;
3381 }
3382 
3383 static inline int iommu_domain_cache_init(void)
3384 {
3385 	int ret = 0;
3386 
3387 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3388 					 sizeof(struct dmar_domain),
3389 					 0,
3390 					 SLAB_HWCACHE_ALIGN,
3391 
3392 					 NULL);
3393 	if (!iommu_domain_cache) {
3394 		pr_err("Couldn't create iommu_domain cache\n");
3395 		ret = -ENOMEM;
3396 	}
3397 
3398 	return ret;
3399 }
3400 
3401 static inline int iommu_devinfo_cache_init(void)
3402 {
3403 	int ret = 0;
3404 
3405 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3406 					 sizeof(struct device_domain_info),
3407 					 0,
3408 					 SLAB_HWCACHE_ALIGN,
3409 					 NULL);
3410 	if (!iommu_devinfo_cache) {
3411 		pr_err("Couldn't create devinfo cache\n");
3412 		ret = -ENOMEM;
3413 	}
3414 
3415 	return ret;
3416 }
3417 
3418 static int __init iommu_init_mempool(void)
3419 {
3420 	int ret;
3421 	ret = iova_cache_get();
3422 	if (ret)
3423 		return ret;
3424 
3425 	ret = iommu_domain_cache_init();
3426 	if (ret)
3427 		goto domain_error;
3428 
3429 	ret = iommu_devinfo_cache_init();
3430 	if (!ret)
3431 		return ret;
3432 
3433 	kmem_cache_destroy(iommu_domain_cache);
3434 domain_error:
3435 	iova_cache_put();
3436 
3437 	return -ENOMEM;
3438 }
3439 
3440 static void __init iommu_exit_mempool(void)
3441 {
3442 	kmem_cache_destroy(iommu_devinfo_cache);
3443 	kmem_cache_destroy(iommu_domain_cache);
3444 	iova_cache_put();
3445 }
3446 
3447 static void __init init_no_remapping_devices(void)
3448 {
3449 	struct dmar_drhd_unit *drhd;
3450 	struct device *dev;
3451 	int i;
3452 
3453 	for_each_drhd_unit(drhd) {
3454 		if (!drhd->include_all) {
3455 			for_each_active_dev_scope(drhd->devices,
3456 						  drhd->devices_cnt, i, dev)
3457 				break;
3458 			/* ignore DMAR unit if no devices exist */
3459 			if (i == drhd->devices_cnt)
3460 				drhd->ignored = 1;
3461 		}
3462 	}
3463 
3464 	for_each_active_drhd_unit(drhd) {
3465 		if (drhd->include_all)
3466 			continue;
3467 
3468 		for_each_active_dev_scope(drhd->devices,
3469 					  drhd->devices_cnt, i, dev)
3470 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3471 				break;
3472 		if (i < drhd->devices_cnt)
3473 			continue;
3474 
3475 		/* This IOMMU has *only* gfx devices. Either bypass it or
3476 		   set the gfx_dedicated flag, as appropriate. */
3477 		drhd->gfx_dedicated = 1;
3478 		if (!dmar_map_gfx)
3479 			drhd->ignored = 1;
3480 	}
3481 }
3482 
3483 #ifdef CONFIG_SUSPEND
3484 static int init_iommu_hw(void)
3485 {
3486 	struct dmar_drhd_unit *drhd;
3487 	struct intel_iommu *iommu = NULL;
3488 
3489 	for_each_active_iommu(iommu, drhd)
3490 		if (iommu->qi)
3491 			dmar_reenable_qi(iommu);
3492 
3493 	for_each_iommu(iommu, drhd) {
3494 		if (drhd->ignored) {
3495 			/*
3496 			 * we always have to disable PMRs or DMA may fail on
3497 			 * this device
3498 			 */
3499 			if (force_on)
3500 				iommu_disable_protect_mem_regions(iommu);
3501 			continue;
3502 		}
3503 
3504 		iommu_flush_write_buffer(iommu);
3505 		iommu_set_root_entry(iommu);
3506 		iommu_enable_translation(iommu);
3507 		iommu_disable_protect_mem_regions(iommu);
3508 	}
3509 
3510 	return 0;
3511 }
3512 
3513 static void iommu_flush_all(void)
3514 {
3515 	struct dmar_drhd_unit *drhd;
3516 	struct intel_iommu *iommu;
3517 
3518 	for_each_active_iommu(iommu, drhd) {
3519 		iommu->flush.flush_context(iommu, 0, 0, 0,
3520 					   DMA_CCMD_GLOBAL_INVL);
3521 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3522 					 DMA_TLB_GLOBAL_FLUSH);
3523 	}
3524 }
3525 
3526 static int iommu_suspend(void)
3527 {
3528 	struct dmar_drhd_unit *drhd;
3529 	struct intel_iommu *iommu = NULL;
3530 	unsigned long flag;
3531 
3532 	for_each_active_iommu(iommu, drhd) {
3533 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3534 					     GFP_KERNEL);
3535 		if (!iommu->iommu_state)
3536 			goto nomem;
3537 	}
3538 
3539 	iommu_flush_all();
3540 
3541 	for_each_active_iommu(iommu, drhd) {
3542 		iommu_disable_translation(iommu);
3543 
3544 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3545 
3546 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3547 			readl(iommu->reg + DMAR_FECTL_REG);
3548 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3549 			readl(iommu->reg + DMAR_FEDATA_REG);
3550 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3551 			readl(iommu->reg + DMAR_FEADDR_REG);
3552 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3553 			readl(iommu->reg + DMAR_FEUADDR_REG);
3554 
3555 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3556 	}
3557 	return 0;
3558 
3559 nomem:
3560 	for_each_active_iommu(iommu, drhd)
3561 		kfree(iommu->iommu_state);
3562 
3563 	return -ENOMEM;
3564 }
3565 
3566 static void iommu_resume(void)
3567 {
3568 	struct dmar_drhd_unit *drhd;
3569 	struct intel_iommu *iommu = NULL;
3570 	unsigned long flag;
3571 
3572 	if (init_iommu_hw()) {
3573 		if (force_on)
3574 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3575 		else
3576 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3577 		return;
3578 	}
3579 
3580 	for_each_active_iommu(iommu, drhd) {
3581 
3582 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3583 
3584 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3585 			iommu->reg + DMAR_FECTL_REG);
3586 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3587 			iommu->reg + DMAR_FEDATA_REG);
3588 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3589 			iommu->reg + DMAR_FEADDR_REG);
3590 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3591 			iommu->reg + DMAR_FEUADDR_REG);
3592 
3593 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3594 	}
3595 
3596 	for_each_active_iommu(iommu, drhd)
3597 		kfree(iommu->iommu_state);
3598 }
3599 
3600 static struct syscore_ops iommu_syscore_ops = {
3601 	.resume		= iommu_resume,
3602 	.suspend	= iommu_suspend,
3603 };
3604 
3605 static void __init init_iommu_pm_ops(void)
3606 {
3607 	register_syscore_ops(&iommu_syscore_ops);
3608 }
3609 
3610 #else
3611 static inline void init_iommu_pm_ops(void) {}
3612 #endif	/* CONFIG_SUSPEND */
3613 
3614 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3615 {
3616 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3617 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3618 	    rmrr->end_address <= rmrr->base_address ||
3619 	    arch_rmrr_sanity_check(rmrr))
3620 		return -EINVAL;
3621 
3622 	return 0;
3623 }
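
/*
 * Worked example for rmrr_sanity_check() above (illustrative only): an
 * RMRR covering one 4KiB page at 0xdd000000 reports
 * base_address = 0xdd000000 and end_address = 0xdd000fff; base and
 * end + 1 are both page aligned and end > base, so the check passes.
 * Firmware reporting an exclusive end such as 0xdd001000 would fail
 * the alignment test and be flagged as a firmware bug by the caller.
 */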
3624 
3625 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3626 {
3627 	struct acpi_dmar_reserved_memory *rmrr;
3628 	struct dmar_rmrr_unit *rmrru;
3629 
3630 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3631 	if (rmrr_sanity_check(rmrr)) {
3632 		pr_warn(FW_BUG
3633 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3634 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3635 			   rmrr->base_address, rmrr->end_address,
3636 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3637 			   dmi_get_system_info(DMI_BIOS_VERSION),
3638 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3639 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3640 	}
3641 
3642 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3643 	if (!rmrru)
3644 		goto out;
3645 
3646 	rmrru->hdr = header;
3647 
3648 	rmrru->base_address = rmrr->base_address;
3649 	rmrru->end_address = rmrr->end_address;
3650 
3651 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3652 				((void *)rmrr) + rmrr->header.length,
3653 				&rmrru->devices_cnt);
3654 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3655 		goto free_rmrru;
3656 
3657 	list_add(&rmrru->list, &dmar_rmrr_units);
3658 
3659 	return 0;
3660 free_rmrru:
3661 	kfree(rmrru);
3662 out:
3663 	return -ENOMEM;
3664 }
3665 
3666 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3667 {
3668 	struct dmar_atsr_unit *atsru;
3669 	struct acpi_dmar_atsr *tmp;
3670 
3671 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3672 				dmar_rcu_check()) {
3673 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3674 		if (atsr->segment != tmp->segment)
3675 			continue;
3676 		if (atsr->header.length != tmp->header.length)
3677 			continue;
3678 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3679 			return atsru;
3680 	}
3681 
3682 	return NULL;
3683 }
3684 
3685 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3686 {
3687 	struct acpi_dmar_atsr *atsr;
3688 	struct dmar_atsr_unit *atsru;
3689 
3690 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3691 		return 0;
3692 
3693 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3694 	atsru = dmar_find_atsr(atsr);
3695 	if (atsru)
3696 		return 0;
3697 
3698 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3699 	if (!atsru)
3700 		return -ENOMEM;
3701 
3702 	/*
3703 	 * If memory is allocated from slab by the ACPI _DSM method, we need to
3704 	 * copy the memory content because the memory buffer will be freed
3705 	 * on return.
3706 	 */
3707 	atsru->hdr = (void *)(atsru + 1);
3708 	memcpy(atsru->hdr, hdr, hdr->length);
3709 	atsru->include_all = atsr->flags & 0x1;
3710 	if (!atsru->include_all) {
3711 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3712 				(void *)atsr + atsr->header.length,
3713 				&atsru->devices_cnt);
3714 		if (atsru->devices_cnt && atsru->devices == NULL) {
3715 			kfree(atsru);
3716 			return -ENOMEM;
3717 		}
3718 	}
3719 
3720 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3721 
3722 	return 0;
3723 }
3724 
3725 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3726 {
3727 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3728 	kfree(atsru);
3729 }
3730 
3731 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3732 {
3733 	struct acpi_dmar_atsr *atsr;
3734 	struct dmar_atsr_unit *atsru;
3735 
3736 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3737 	atsru = dmar_find_atsr(atsr);
3738 	if (atsru) {
3739 		list_del_rcu(&atsru->list);
3740 		synchronize_rcu();
3741 		intel_iommu_free_atsr(atsru);
3742 	}
3743 
3744 	return 0;
3745 }
3746 
3747 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3748 {
3749 	int i;
3750 	struct device *dev;
3751 	struct acpi_dmar_atsr *atsr;
3752 	struct dmar_atsr_unit *atsru;
3753 
3754 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3755 	atsru = dmar_find_atsr(atsr);
3756 	if (!atsru)
3757 		return 0;
3758 
3759 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3760 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3761 					  i, dev)
3762 			return -EBUSY;
3763 	}
3764 
3765 	return 0;
3766 }
3767 
3768 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3769 {
3770 	struct dmar_satc_unit *satcu;
3771 	struct acpi_dmar_satc *tmp;
3772 
3773 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3774 				dmar_rcu_check()) {
3775 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3776 		if (satc->segment != tmp->segment)
3777 			continue;
3778 		if (satc->header.length != tmp->header.length)
3779 			continue;
3780 		if (memcmp(satc, tmp, satc->header.length) == 0)
3781 			return satcu;
3782 	}
3783 
3784 	return NULL;
3785 }
3786 
3787 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3788 {
3789 	struct acpi_dmar_satc *satc;
3790 	struct dmar_satc_unit *satcu;
3791 
3792 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3793 		return 0;
3794 
3795 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3796 	satcu = dmar_find_satc(satc);
3797 	if (satcu)
3798 		return 0;
3799 
3800 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3801 	if (!satcu)
3802 		return -ENOMEM;
3803 
3804 	satcu->hdr = (void *)(satcu + 1);
3805 	memcpy(satcu->hdr, hdr, hdr->length);
3806 	satcu->atc_required = satc->flags & 0x1;
3807 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3808 					      (void *)satc + satc->header.length,
3809 					      &satcu->devices_cnt);
3810 	if (satcu->devices_cnt && !satcu->devices) {
3811 		kfree(satcu);
3812 		return -ENOMEM;
3813 	}
3814 	list_add_rcu(&satcu->list, &dmar_satc_units);
3815 
3816 	return 0;
3817 }
3818 
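/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * running configuration, allocate domain IDs and a root entry, then
 * enable queued invalidation, interrupts and translation unless the
 * unit is marked as ignored.
 */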
3819 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3820 {
3821 	int sp, ret;
3822 	struct intel_iommu *iommu = dmaru->iommu;
3823 
3824 	if (g_iommus[iommu->seq_id])
3825 		return 0;
3826 
3827 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3828 	if (ret)
3829 		goto out;
3830 
3831 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3832 		pr_warn("%s: Doesn't support hardware pass through.\n",
3833 			iommu->name);
3834 		return -ENXIO;
3835 	}
3836 	if (!ecap_sc_support(iommu->ecap) &&
3837 	    domain_update_iommu_snooping(iommu)) {
3838 		pr_warn("%s: Doesn't support snooping.\n",
3839 			iommu->name);
3840 		return -ENXIO;
3841 	}
3842 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3843 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3844 		pr_warn("%s: Doesn't support large page.\n",
3845 			iommu->name);
3846 		return -ENXIO;
3847 	}
3848 
3849 	/*
3850 	 * Disable translation if already enabled prior to OS handover.
3851 	 */
3852 	if (iommu->gcmd & DMA_GCMD_TE)
3853 		iommu_disable_translation(iommu);
3854 
3855 	g_iommus[iommu->seq_id] = iommu;
3856 	ret = iommu_init_domains(iommu);
3857 	if (ret == 0)
3858 		ret = iommu_alloc_root_entry(iommu);
3859 	if (ret)
3860 		goto out;
3861 
3862 	intel_svm_check(iommu);
3863 
3864 	if (dmaru->ignored) {
3865 		/*
3866 		 * we always have to disable PMRs or DMA may fail on this device
3867 		 */
3868 		if (force_on)
3869 			iommu_disable_protect_mem_regions(iommu);
3870 		return 0;
3871 	}
3872 
3873 	intel_iommu_init_qi(iommu);
3874 	iommu_flush_write_buffer(iommu);
3875 
3876 #ifdef CONFIG_INTEL_IOMMU_SVM
3877 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3878 		ret = intel_svm_enable_prq(iommu);
3879 		if (ret)
3880 			goto disable_iommu;
3881 	}
3882 #endif
3883 	ret = dmar_set_interrupt(iommu);
3884 	if (ret)
3885 		goto disable_iommu;
3886 
3887 	iommu_set_root_entry(iommu);
3888 	iommu_enable_translation(iommu);
3889 
3890 	iommu_disable_protect_mem_regions(iommu);
3891 	return 0;
3892 
3893 disable_iommu:
3894 	disable_dmar_iommu(iommu);
3895 out:
3896 	free_dmar_iommu(iommu);
3897 	return ret;
3898 }
3899 
3900 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3901 {
3902 	int ret = 0;
3903 	struct intel_iommu *iommu = dmaru->iommu;
3904 
3905 	if (!intel_iommu_enabled)
3906 		return 0;
3907 	if (iommu == NULL)
3908 		return -EINVAL;
3909 
3910 	if (insert) {
3911 		ret = intel_iommu_add(dmaru);
3912 	} else {
3913 		disable_dmar_iommu(iommu);
3914 		free_dmar_iommu(iommu);
3915 	}
3916 
3917 	return ret;
3918 }
3919 
3920 static void intel_iommu_free_dmars(void)
3921 {
3922 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3923 	struct dmar_atsr_unit *atsru, *atsr_n;
3924 	struct dmar_satc_unit *satcu, *satc_n;
3925 
3926 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3927 		list_del(&rmrru->list);
3928 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3929 		kfree(rmrru);
3930 	}
3931 
3932 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3933 		list_del(&atsru->list);
3934 		intel_iommu_free_atsr(atsru);
3935 	}
3936 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3937 		list_del(&satcu->list);
3938 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3939 		kfree(satcu);
3940 	}
3941 }
3942 
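/*
 * Decide whether ATS may be used for @dev: walk up to the PCIe root
 * port and allow ATS only if that root port is listed in (or covered by
 * an INCLUDE_ALL) ATSR unit for the device's segment.  Integrated
 * devices with no root port above them are always allowed.
 */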
3943 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3944 {
3945 	int i, ret = 1;
3946 	struct pci_bus *bus;
3947 	struct pci_dev *bridge = NULL;
3948 	struct device *tmp;
3949 	struct acpi_dmar_atsr *atsr;
3950 	struct dmar_atsr_unit *atsru;
3951 
3952 	dev = pci_physfn(dev);
3953 	for (bus = dev->bus; bus; bus = bus->parent) {
3954 		bridge = bus->self;
3955 		/* If it's an integrated device, allow ATS */
3956 		if (!bridge)
3957 			return 1;
3958 		/* Connected via non-PCIe: no ATS */
3959 		if (!pci_is_pcie(bridge) ||
3960 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3961 			return 0;
3962 		/* If we found the root port, look it up in the ATSR */
3963 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3964 			break;
3965 	}
3966 
3967 	rcu_read_lock();
3968 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3969 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3970 		if (atsr->segment != pci_domain_nr(dev->bus))
3971 			continue;
3972 
3973 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3974 			if (tmp == &bridge->dev)
3975 				goto out;
3976 
3977 		if (atsru->include_all)
3978 			goto out;
3979 	}
3980 	ret = 0;
3981 out:
3982 	rcu_read_unlock();
3983 
3984 	return ret;
3985 }
3986 
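/*
 * PCI bus notifier hook: on device addition or removal, update the
 * cached device scopes of every RMRR, ATSR and SATC unit that may
 * reference the device.
 */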
3987 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3988 {
3989 	int ret;
3990 	struct dmar_rmrr_unit *rmrru;
3991 	struct dmar_atsr_unit *atsru;
3992 	struct dmar_satc_unit *satcu;
3993 	struct acpi_dmar_atsr *atsr;
3994 	struct acpi_dmar_reserved_memory *rmrr;
3995 	struct acpi_dmar_satc *satc;
3996 
3997 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3998 		return 0;
3999 
4000 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4001 		rmrr = container_of(rmrru->hdr,
4002 				    struct acpi_dmar_reserved_memory, header);
4003 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4004 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4005 				((void *)rmrr) + rmrr->header.length,
4006 				rmrr->segment, rmrru->devices,
4007 				rmrru->devices_cnt);
4008 			if (ret < 0)
4009 				return ret;
4010 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4011 			dmar_remove_dev_scope(info, rmrr->segment,
4012 				rmrru->devices, rmrru->devices_cnt);
4013 		}
4014 	}
4015 
4016 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4017 		if (atsru->include_all)
4018 			continue;
4019 
4020 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4021 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4022 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4023 					(void *)atsr + atsr->header.length,
4024 					atsr->segment, atsru->devices,
4025 					atsru->devices_cnt);
4026 			if (ret > 0)
4027 				break;
4028 			else if (ret < 0)
4029 				return ret;
4030 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4031 			if (dmar_remove_dev_scope(info, atsr->segment,
4032 					atsru->devices, atsru->devices_cnt))
4033 				break;
4034 		}
4035 	}
4036 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4037 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4038 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4039 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4040 					(void *)satc + satc->header.length,
4041 					satc->segment, satcu->devices,
4042 					satcu->devices_cnt);
4043 			if (ret > 0)
4044 				break;
4045 			else if (ret < 0)
4046 				return ret;
4047 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4048 			if (dmar_remove_dev_scope(info, satc->segment,
4049 					satcu->devices, satcu->devices_cnt))
4050 				break;
4051 		}
4052 	}
4053 
4054 	return 0;
4055 }
4056 
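/*
 * Memory hotplug notifier: keep the si_domain identity map in sync by
 * mapping ranges as they come online and by unmapping them (and flushing
 * the IOTLBs) when they go offline again.
 */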
4057 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4058 				       unsigned long val, void *v)
4059 {
4060 	struct memory_notify *mhp = v;
4061 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4062 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4063 			mhp->nr_pages - 1);
4064 
4065 	switch (val) {
4066 	case MEM_GOING_ONLINE:
4067 		if (iommu_domain_identity_map(si_domain,
4068 					      start_vpfn, last_vpfn)) {
4069 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4070 				start_vpfn, last_vpfn);
4071 			return NOTIFY_BAD;
4072 		}
4073 		break;
4074 
4075 	case MEM_OFFLINE:
4076 	case MEM_CANCEL_ONLINE:
4077 		{
4078 			struct dmar_drhd_unit *drhd;
4079 			struct intel_iommu *iommu;
4080 			struct page *freelist;
4081 
4082 			freelist = domain_unmap(si_domain,
4083 						start_vpfn, last_vpfn,
4084 						NULL);
4085 
4086 			rcu_read_lock();
4087 			for_each_active_iommu(iommu, drhd)
4088 				iommu_flush_iotlb_psi(iommu, si_domain,
4089 					start_vpfn, mhp->nr_pages,
4090 					!freelist, 0);
4091 			rcu_read_unlock();
4092 			dma_free_pagelist(freelist);
4093 		}
4094 		break;
4095 	}
4096 
4097 	return NOTIFY_OK;
4098 }
4099 
4100 static struct notifier_block intel_iommu_memory_nb = {
4101 	.notifier_call = intel_iommu_memory_notifier,
4102 	.priority = 0
4103 };
4104 
4105 static void intel_disable_iommus(void)
4106 {
4107 	struct intel_iommu *iommu = NULL;
4108 	struct dmar_drhd_unit *drhd;
4109 
4110 	for_each_iommu(iommu, drhd)
4111 		iommu_disable_translation(iommu);
4112 }
4113 
4114 void intel_iommu_shutdown(void)
4115 {
4116 	struct dmar_drhd_unit *drhd;
4117 	struct intel_iommu *iommu = NULL;
4118 
4119 	if (no_iommu || dmar_disabled)
4120 		return;
4121 
4122 	down_write(&dmar_global_lock);
4123 
4124 	/* Disable PMRs explicitly here. */
4125 	for_each_iommu(iommu, drhd)
4126 		iommu_disable_protect_mem_regions(iommu);
4127 
4128 	/* Make sure the IOMMUs are switched off */
4129 	intel_disable_iommus();
4130 
4131 	up_write(&dmar_global_lock);
4132 }
4133 
4134 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4135 {
4136 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4137 
4138 	return container_of(iommu_dev, struct intel_iommu, iommu);
4139 }
4140 
4141 static ssize_t intel_iommu_show_version(struct device *dev,
4142 					struct device_attribute *attr,
4143 					char *buf)
4144 {
4145 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4146 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4147 	return sprintf(buf, "%d:%d\n",
4148 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4149 }
4150 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4151 
4152 static ssize_t intel_iommu_show_address(struct device *dev,
4153 					struct device_attribute *attr,
4154 					char *buf)
4155 {
4156 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4157 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4158 }
4159 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4160 
4161 static ssize_t intel_iommu_show_cap(struct device *dev,
4162 				    struct device_attribute *attr,
4163 				    char *buf)
4164 {
4165 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4166 	return sprintf(buf, "%llx\n", iommu->cap);
4167 }
4168 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4169 
4170 static ssize_t intel_iommu_show_ecap(struct device *dev,
4171 				    struct device_attribute *attr,
4172 				    char *buf)
4173 {
4174 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4175 	return sprintf(buf, "%llx\n", iommu->ecap);
4176 }
4177 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4178 
4179 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4180 				      struct device_attribute *attr,
4181 				      char *buf)
4182 {
4183 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4184 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4185 }
4186 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4187 
4188 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4189 					   struct device_attribute *attr,
4190 					   char *buf)
4191 {
4192 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4193 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4194 						  cap_ndoms(iommu->cap)));
4195 }
4196 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4197 
4198 static struct attribute *intel_iommu_attrs[] = {
4199 	&dev_attr_version.attr,
4200 	&dev_attr_address.attr,
4201 	&dev_attr_cap.attr,
4202 	&dev_attr_ecap.attr,
4203 	&dev_attr_domains_supported.attr,
4204 	&dev_attr_domains_used.attr,
4205 	NULL,
4206 };
4207 
4208 static struct attribute_group intel_iommu_group = {
4209 	.name = "intel-iommu",
4210 	.attrs = intel_iommu_attrs,
4211 };
4212 
4213 const struct attribute_group *intel_iommu_groups[] = {
4214 	&intel_iommu_group,
4215 	NULL,
4216 };
4217 
4218 static inline bool has_external_pci(void)
4219 {
4220 	struct pci_dev *pdev = NULL;
4221 
4222 	for_each_pci_dev(pdev)
4223 		if (pdev->external_facing)
4224 			return true;
4225 
4226 	return false;
4227 }
4228 
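/*
 * Honour the DMAR platform opt-in flag: if the firmware requests DMA
 * protection and an external-facing PCI port is present, force the
 * IOMMU on even when it was disabled on the kernel command line.
 */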
4229 static int __init platform_optin_force_iommu(void)
4230 {
4231 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4232 		return 0;
4233 
4234 	if (no_iommu || dmar_disabled)
4235 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4236 
4237 	/*
4238 	 * If Intel-IOMMU is disabled by default, we will apply identity
4239 	 * map for all devices except those marked as being untrusted.
4240 	 */
4241 	if (dmar_disabled)
4242 		iommu_set_default_passthrough(false);
4243 
4244 	dmar_disabled = 0;
4245 	no_iommu = 0;
4246 
4247 	return 1;
4248 }
4249 
4250 static int __init probe_acpi_namespace_devices(void)
4251 {
4252 	struct dmar_drhd_unit *drhd;
4253 	/* To avoid a -Wunused-but-set-variable warning. */
4254 	struct intel_iommu *iommu __maybe_unused;
4255 	struct device *dev;
4256 	int i, ret = 0;
4257 
4258 	for_each_active_iommu(iommu, drhd) {
4259 		for_each_active_dev_scope(drhd->devices,
4260 					  drhd->devices_cnt, i, dev) {
4261 			struct acpi_device_physical_node *pn;
4262 			struct iommu_group *group;
4263 			struct acpi_device *adev;
4264 
4265 			if (dev->bus != &acpi_bus_type)
4266 				continue;
4267 
4268 			adev = to_acpi_device(dev);
4269 			mutex_lock(&adev->physical_node_lock);
4270 			list_for_each_entry(pn,
4271 					    &adev->physical_node_list, node) {
4272 				group = iommu_group_get(pn->dev);
4273 				if (group) {
4274 					iommu_group_put(group);
4275 					continue;
4276 				}
4277 
4278 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4279 				ret = iommu_probe_device(pn->dev);
4280 				if (ret)
4281 					break;
4282 			}
4283 			mutex_unlock(&adev->physical_node_lock);
4284 
4285 			if (ret)
4286 				return ret;
4287 		}
4288 	}
4289 
4290 	return 0;
4291 }
4292 
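/*
 * Main initialization entry point: parse the DMAR tables, initialize the
 * DMA-remapping hardware and register the Intel IOMMU driver with the
 * IOMMU core and the PCI bus.
 */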
4293 int __init intel_iommu_init(void)
4294 {
4295 	int ret = -ENODEV;
4296 	struct dmar_drhd_unit *drhd;
4297 	struct intel_iommu *iommu;
4298 
4299 	/*
4300 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4301 	 * opt in, so enforce that.
4302 	 */
4303 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4304 		    platform_optin_force_iommu();
4305 
4306 	if (iommu_init_mempool()) {
4307 		if (force_on)
4308 			panic("tboot: Failed to initialize iommu memory\n");
4309 		return -ENOMEM;
4310 	}
4311 
4312 	down_write(&dmar_global_lock);
4313 	if (dmar_table_init()) {
4314 		if (force_on)
4315 			panic("tboot: Failed to initialize DMAR table\n");
4316 		goto out_free_dmar;
4317 	}
4318 
4319 	if (dmar_dev_scope_init() < 0) {
4320 		if (force_on)
4321 			panic("tboot: Failed to initialize DMAR device scope\n");
4322 		goto out_free_dmar;
4323 	}
4324 
4325 	up_write(&dmar_global_lock);
4326 
4327 	/*
4328 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4329 	 * complain later when we register it under the lock.
4330 	 */
4331 	dmar_register_bus_notifier();
4332 
4333 	down_write(&dmar_global_lock);
4334 
4335 	if (!no_iommu)
4336 		intel_iommu_debugfs_init();
4337 
4338 	if (no_iommu || dmar_disabled) {
4339 		/*
4340 		 * We exit the function here to ensure the IOMMU's remapping and
4341 		 * mempool aren't set up, which means that the IOMMU's PMRs
4342 		 * won't be disabled via the call to init_dmars(). So disable
4343 		 * them explicitly here. The PMRs were set up by tboot prior to
4344 		 * calling SENTER, but the kernel is expected to reset/tear
4345 		 * down the PMRs.
4346 		 */
4347 		if (intel_iommu_tboot_noforce) {
4348 			for_each_iommu(iommu, drhd)
4349 				iommu_disable_protect_mem_regions(iommu);
4350 		}
4351 
4352 		/*
4353 		 * Make sure the IOMMUs are switched off, even when we
4354 		 * boot into a kexec kernel and the previous kernel left
4355 		 * them enabled
4356 		 */
4357 		intel_disable_iommus();
4358 		goto out_free_dmar;
4359 	}
4360 
4361 	if (list_empty(&dmar_rmrr_units))
4362 		pr_info("No RMRR found\n");
4363 
4364 	if (list_empty(&dmar_atsr_units))
4365 		pr_info("No ATSR found\n");
4366 
4367 	if (list_empty(&dmar_satc_units))
4368 		pr_info("No SATC found\n");
4369 
4370 	if (dmar_map_gfx)
4371 		intel_iommu_gfx_mapped = 1;
4372 
4373 	init_no_remapping_devices();
4374 
4375 	ret = init_dmars();
4376 	if (ret) {
4377 		if (force_on)
4378 			panic("tboot: Failed to initialize DMARs\n");
4379 		pr_err("Initialization failed\n");
4380 		goto out_free_dmar;
4381 	}
4382 	up_write(&dmar_global_lock);
4383 
4384 	init_iommu_pm_ops();
4385 
4386 	down_read(&dmar_global_lock);
4387 	for_each_active_iommu(iommu, drhd) {
4388 		/*
4389 		 * The flush queue implementation does not perform
4390 		 * page-selective invalidations that are required for efficient
4391 		 * TLB flushes in virtual environments.  The benefit of batching
4392 		 * is likely to be much lower than the overhead of synchronizing
4393 		 * the virtual and physical IOMMU page-tables.
4394 		 */
4395 		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
4396 			pr_warn("IOMMU batching is disabled due to virtualization\n");
4397 			intel_iommu_strict = 1;
4398 		}
4399 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4400 				       intel_iommu_groups,
4401 				       "%s", iommu->name);
4402 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4403 	}
4404 	up_read(&dmar_global_lock);
4405 
4406 	iommu_set_dma_strict(intel_iommu_strict);
4407 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4408 	if (si_domain && !hw_pass_through)
4409 		register_memory_notifier(&intel_iommu_memory_nb);
4410 
4411 	down_read(&dmar_global_lock);
4412 	if (probe_acpi_namespace_devices())
4413 		pr_warn("ACPI name space devices didn't probe correctly\n");
4414 
4415 	/* Finally, we enable the DMA remapping hardware. */
4416 	for_each_iommu(iommu, drhd) {
4417 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4418 			iommu_enable_translation(iommu);
4419 
4420 		iommu_disable_protect_mem_regions(iommu);
4421 	}
4422 	up_read(&dmar_global_lock);
4423 
4424 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4425 
4426 	intel_iommu_enabled = 1;
4427 
4428 	return 0;
4429 
4430 out_free_dmar:
4431 	intel_iommu_free_dmars();
4432 	up_write(&dmar_global_lock);
4433 	iommu_exit_mempool();
4434 	return ret;
4435 }
4436 
4437 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4438 {
4439 	struct intel_iommu *iommu = opaque;
4440 
4441 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4442 	return 0;
4443 }
4444 
4445 /*
4446  * NB - intel-iommu lacks any sort of reference counting for the users of
4447  * dependent devices.  If multiple endpoints have intersecting dependent
4448  * devices, unbinding the driver from any one of them will possibly leave
4449  * the others unable to operate.
4450  */
4451 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4452 {
4453 	if (!iommu || !dev || !dev_is_pci(dev))
4454 		return;
4455 
4456 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4457 }
4458 
4459 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4460 {
4461 	struct dmar_domain *domain;
4462 	struct intel_iommu *iommu;
4463 	unsigned long flags;
4464 
4465 	assert_spin_locked(&device_domain_lock);
4466 
4467 	if (WARN_ON(!info))
4468 		return;
4469 
4470 	iommu = info->iommu;
4471 	domain = info->domain;
4472 
4473 	if (info->dev) {
4474 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4475 			intel_pasid_tear_down_entry(iommu, info->dev,
4476 					PASID_RID2PASID, false);
4477 
4478 		iommu_disable_dev_iotlb(info);
4479 		if (!dev_is_real_dma_subdevice(info->dev))
4480 			domain_context_clear(iommu, info->dev);
4481 		intel_pasid_free_table(info->dev);
4482 	}
4483 
4484 	unlink_domain_info(info);
4485 
4486 	spin_lock_irqsave(&iommu->lock, flags);
4487 	domain_detach_iommu(domain, iommu);
4488 	spin_unlock_irqrestore(&iommu->lock, flags);
4489 
4490 	free_devinfo_mem(info);
4491 }
4492 
4493 static void dmar_remove_one_dev_info(struct device *dev)
4494 {
4495 	struct device_domain_info *info;
4496 	unsigned long flags;
4497 
4498 	spin_lock_irqsave(&device_domain_lock, flags);
4499 	info = get_domain_info(dev);
4500 	if (info)
4501 		__dmar_remove_one_dev_info(info);
4502 	spin_unlock_irqrestore(&device_domain_lock, flags);
4503 }
4504 
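/*
 * Initialize a domain allocated through the IOMMU core API: derive the
 * adjusted guest address width (AGAW) for the requested width and
 * allocate the top-level page directory.
 */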
4505 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4506 {
4507 	int adjust_width;
4508 
4509 	/* calculate AGAW */
4510 	domain->gaw = guest_width;
4511 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4512 	domain->agaw = width_to_agaw(adjust_width);
4513 
4514 	domain->iommu_coherency = 0;
4515 	domain->iommu_snooping = 0;
4516 	domain->iommu_superpage = 0;
4517 	domain->max_addr = 0;
4518 
4519 	/* always allocate the top pgd */
4520 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4521 	if (!domain->pgd)
4522 		return -ENOMEM;
4523 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4524 	return 0;
4525 }
4526 
4527 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4528 {
4529 	struct dmar_domain *dmar_domain;
4530 	struct iommu_domain *domain;
4531 
4532 	switch (type) {
4533 	case IOMMU_DOMAIN_DMA:
4534 	case IOMMU_DOMAIN_UNMANAGED:
4535 		dmar_domain = alloc_domain(0);
4536 		if (!dmar_domain) {
4537 			pr_err("Can't allocate dmar_domain\n");
4538 			return NULL;
4539 		}
4540 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4541 			pr_err("Domain initialization failed\n");
4542 			domain_exit(dmar_domain);
4543 			return NULL;
4544 		}
4545 
4546 		if (type == IOMMU_DOMAIN_DMA &&
4547 		    iommu_get_dma_cookie(&dmar_domain->domain))
4548 			return NULL;
4549 
4550 		domain = &dmar_domain->domain;
4551 		domain->geometry.aperture_start = 0;
4552 		domain->geometry.aperture_end   =
4553 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4554 		domain->geometry.force_aperture = true;
4555 
4556 		return domain;
4557 	case IOMMU_DOMAIN_IDENTITY:
4558 		return &si_domain->domain;
4559 	default:
4560 		return NULL;
4561 	}
4562 
4563 	return NULL;
4564 }
4565 
4566 static void intel_iommu_domain_free(struct iommu_domain *domain)
4567 {
4568 	if (domain != &si_domain->domain)
4569 		domain_exit(to_dmar_domain(domain));
4570 }
4571 
4572 /*
4573  * Check whether a @domain could be attached to the @dev through the
4574  * aux-domain attach/detach APIs.
4575  */
4576 static inline bool
4577 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4578 {
4579 	struct device_domain_info *info = get_domain_info(dev);
4580 
4581 	return info && info->auxd_enabled &&
4582 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4583 }
4584 
4585 static inline struct subdev_domain_info *
4586 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4587 {
4588 	struct subdev_domain_info *sinfo;
4589 
4590 	if (!list_empty(&domain->subdevices)) {
4591 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4592 			if (sinfo->pdev == dev)
4593 				return sinfo;
4594 		}
4595 	}
4596 
4597 	return NULL;
4598 }
4599 
4600 static int auxiliary_link_device(struct dmar_domain *domain,
4601 				 struct device *dev)
4602 {
4603 	struct device_domain_info *info = get_domain_info(dev);
4604 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4605 
4606 	assert_spin_locked(&device_domain_lock);
4607 	if (WARN_ON(!info))
4608 		return -EINVAL;
4609 
4610 	if (!sinfo) {
4611 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4612 		if (!sinfo)
4613 			return -ENOMEM;
4614 		sinfo->domain = domain;
4615 		sinfo->pdev = dev;
4616 		list_add(&sinfo->link_phys, &info->subdevices);
4617 		list_add(&sinfo->link_domain, &domain->subdevices);
4618 	}
4619 
4620 	return ++sinfo->users;
4621 }
4622 
4623 static int auxiliary_unlink_device(struct dmar_domain *domain,
4624 				   struct device *dev)
4625 {
4626 	struct device_domain_info *info = get_domain_info(dev);
4627 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4628 	int ret;
4629 
4630 	assert_spin_locked(&device_domain_lock);
4631 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4632 		return -EINVAL;
4633 
4634 	ret = --sinfo->users;
4635 	if (!ret) {
4636 		list_del(&sinfo->link_phys);
4637 		list_del(&sinfo->link_domain);
4638 		kfree(sinfo);
4639 	}
4640 
4641 	return ret;
4642 }
4643 
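/*
 * Attach an aux-domain to a device: allocate the domain's default PASID
 * on first use, link the subdevice, and program a first- or second-level
 * PASID entry for the default PASID under iommu->lock.
 */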
4644 static int aux_domain_add_dev(struct dmar_domain *domain,
4645 			      struct device *dev)
4646 {
4647 	int ret;
4648 	unsigned long flags;
4649 	struct intel_iommu *iommu;
4650 
4651 	iommu = device_to_iommu(dev, NULL, NULL);
4652 	if (!iommu)
4653 		return -ENODEV;
4654 
4655 	if (domain->default_pasid <= 0) {
4656 		u32 pasid;
4657 
4658 		/* No private data needed for the default pasid */
4659 		pasid = ioasid_alloc(NULL, PASID_MIN,
4660 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4661 				     NULL);
4662 		if (pasid == INVALID_IOASID) {
4663 			pr_err("Can't allocate default pasid\n");
4664 			return -ENODEV;
4665 		}
4666 		domain->default_pasid = pasid;
4667 	}
4668 
4669 	spin_lock_irqsave(&device_domain_lock, flags);
4670 	ret = auxiliary_link_device(domain, dev);
4671 	if (ret <= 0)
4672 		goto link_failed;
4673 
4674 	/*
4675 	 * Subdevices from the same physical device can be attached to the
4676 	 * same domain. For such cases, only the first subdevice attachment
4677 	 * needs to go through the full steps in this function. So if ret >
4678 	 * 1, just goto out.
4679 	 */
4680 	if (ret > 1)
4681 		goto out;
4682 
4683 	/*
4684 	 * iommu->lock must be held to attach the domain to the iommu and set up
4685 	 * the pasid entry for second level translation.
4686 	 */
4687 	spin_lock(&iommu->lock);
4688 	ret = domain_attach_iommu(domain, iommu);
4689 	if (ret)
4690 		goto attach_failed;
4691 
4692 	/* Setup the PASID entry for mediated devices: */
4693 	if (domain_use_first_level(domain))
4694 		ret = domain_setup_first_level(iommu, domain, dev,
4695 					       domain->default_pasid);
4696 	else
4697 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4698 						     domain->default_pasid);
4699 	if (ret)
4700 		goto table_failed;
4701 
4702 	spin_unlock(&iommu->lock);
4703 out:
4704 	spin_unlock_irqrestore(&device_domain_lock, flags);
4705 
4706 	return 0;
4707 
4708 table_failed:
4709 	domain_detach_iommu(domain, iommu);
4710 attach_failed:
4711 	spin_unlock(&iommu->lock);
4712 	auxiliary_unlink_device(domain, dev);
4713 link_failed:
4714 	spin_unlock_irqrestore(&device_domain_lock, flags);
4715 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4716 		ioasid_put(domain->default_pasid);
4717 
4718 	return ret;
4719 }
4720 
4721 static void aux_domain_remove_dev(struct dmar_domain *domain,
4722 				  struct device *dev)
4723 {
4724 	struct device_domain_info *info;
4725 	struct intel_iommu *iommu;
4726 	unsigned long flags;
4727 
4728 	if (!is_aux_domain(dev, &domain->domain))
4729 		return;
4730 
4731 	spin_lock_irqsave(&device_domain_lock, flags);
4732 	info = get_domain_info(dev);
4733 	iommu = info->iommu;
4734 
4735 	if (!auxiliary_unlink_device(domain, dev)) {
4736 		spin_lock(&iommu->lock);
4737 		intel_pasid_tear_down_entry(iommu, dev,
4738 					    domain->default_pasid, false);
4739 		domain_detach_iommu(domain, iommu);
4740 		spin_unlock(&iommu->lock);
4741 	}
4742 
4743 	spin_unlock_irqrestore(&device_domain_lock, flags);
4744 
4745 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4746 		ioasid_put(domain->default_pasid);
4747 }
4748 
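/*
 * Before attaching, make sure the IOMMU behind @dev can address
 * everything already mapped in the domain, then trim the domain's
 * page-table depth to the AGAW this IOMMU supports.
 */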
4749 static int prepare_domain_attach_device(struct iommu_domain *domain,
4750 					struct device *dev)
4751 {
4752 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4753 	struct intel_iommu *iommu;
4754 	int addr_width;
4755 
4756 	iommu = device_to_iommu(dev, NULL, NULL);
4757 	if (!iommu)
4758 		return -ENODEV;
4759 
4760 	/* check if this iommu agaw is sufficient for max mapped address */
4761 	addr_width = agaw_to_width(iommu->agaw);
4762 	if (addr_width > cap_mgaw(iommu->cap))
4763 		addr_width = cap_mgaw(iommu->cap);
4764 
4765 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4766 		dev_err(dev, "%s: iommu width (%d) is not "
4767 		        "sufficient for the mapped address (%llx)\n",
4768 		        __func__, addr_width, dmar_domain->max_addr);
4769 		return -EFAULT;
4770 	}
4771 	dmar_domain->gaw = addr_width;
4772 
4773 	/*
4774 	 * Knock out extra levels of page tables if necessary
4775 	 */
4776 	while (iommu->agaw < dmar_domain->agaw) {
4777 		struct dma_pte *pte;
4778 
4779 		pte = dmar_domain->pgd;
4780 		if (dma_pte_present(pte)) {
4781 			dmar_domain->pgd = (struct dma_pte *)
4782 				phys_to_virt(dma_pte_addr(pte));
4783 			free_pgtable_page(pte);
4784 		}
4785 		dmar_domain->agaw--;
4786 	}
4787 
4788 	return 0;
4789 }
4790 
4791 static int intel_iommu_attach_device(struct iommu_domain *domain,
4792 				     struct device *dev)
4793 {
4794 	int ret;
4795 
4796 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4797 	    device_is_rmrr_locked(dev)) {
4798 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4799 		return -EPERM;
4800 	}
4801 
4802 	if (is_aux_domain(dev, domain))
4803 		return -EPERM;
4804 
4805 	/* normally dev is not mapped */
4806 	if (unlikely(domain_context_mapped(dev))) {
4807 		struct dmar_domain *old_domain;
4808 
4809 		old_domain = find_domain(dev);
4810 		if (old_domain)
4811 			dmar_remove_one_dev_info(dev);
4812 	}
4813 
4814 	ret = prepare_domain_attach_device(domain, dev);
4815 	if (ret)
4816 		return ret;
4817 
4818 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4819 }
4820 
4821 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4822 					 struct device *dev)
4823 {
4824 	int ret;
4825 
4826 	if (!is_aux_domain(dev, domain))
4827 		return -EPERM;
4828 
4829 	ret = prepare_domain_attach_device(domain, dev);
4830 	if (ret)
4831 		return ret;
4832 
4833 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4834 }
4835 
4836 static void intel_iommu_detach_device(struct iommu_domain *domain,
4837 				      struct device *dev)
4838 {
4839 	dmar_remove_one_dev_info(dev);
4840 }
4841 
4842 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4843 					  struct device *dev)
4844 {
4845 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4846 }
4847 
4848 #ifdef CONFIG_INTEL_IOMMU_SVM
4849 /*
4850  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4851  * VT-d granularity. Invalidation is typically included in the unmap operation
4852  * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4853  * owns the first-level page tables. Invalidations of translation caches in the
4854  * guest are trapped and passed down to the host.
4855  *
4856  * The vIOMMU in the guest will only expose first-level page tables, therefore
4857  * we do not support IOTLB granularity for requests without PASID (second level).
4858  *
4859  * For example, to find the VT-d granularity encoding for IOTLB
4860  * type and page selective granularity within PASID:
4861  * X: indexed by iommu cache type
4862  * Y: indexed by enum iommu_inv_granularity
4863  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4864  */
4865 
4866 static const int
4867 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4868 	/*
4869 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4870 	 * page selective (address granularity)
4871 	 */
4872 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4873 	/* PASID based dev TLBs */
4874 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4875 	/* PASID cache */
4876 	{-EINVAL, -EINVAL, -EINVAL}
4877 };
4878 
4879 static inline int to_vtd_granularity(int type, int granu)
4880 {
4881 	return inv_type_granu_table[type][granu];
4882 }
4883 
4884 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4885 {
4886 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4887 
4888 	/* VT-d size is encoded as 2^size in 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
4889 	 * The IOMMU cache invalidate API passes granu_size in bytes and the
4890 	 * number of granules of that size in contiguous memory.
4891 	 */
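	/* Illustrative values: granu_size = 4KiB and nr_granules = 512 give
	 * nr_pages = 512, so the returned size order is 9, i.e. a 2MiB range.
	 */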
4892 	return order_base_2(nr_pages);
4893 }
4894 
4895 static int
4896 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4897 			   struct iommu_cache_invalidate_info *inv_info)
4898 {
4899 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4900 	struct device_domain_info *info;
4901 	struct intel_iommu *iommu;
4902 	unsigned long flags;
4903 	int cache_type;
4904 	u8 bus, devfn;
4905 	u16 did, sid;
4906 	int ret = 0;
4907 	u64 size = 0;
4908 
4909 	if (!inv_info || !dmar_domain)
4910 		return -EINVAL;
4911 
4912 	if (!dev || !dev_is_pci(dev))
4913 		return -ENODEV;
4914 
4915 	iommu = device_to_iommu(dev, &bus, &devfn);
4916 	if (!iommu)
4917 		return -ENODEV;
4918 
4919 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4920 		return -EINVAL;
4921 
4922 	spin_lock_irqsave(&device_domain_lock, flags);
4923 	spin_lock(&iommu->lock);
4924 	info = get_domain_info(dev);
4925 	if (!info) {
4926 		ret = -EINVAL;
4927 		goto out_unlock;
4928 	}
4929 	did = dmar_domain->iommu_did[iommu->seq_id];
4930 	sid = PCI_DEVID(bus, devfn);
4931 
4932 	/* Size is only valid in address selective invalidation */
4933 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4934 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4935 				   inv_info->granu.addr_info.nb_granules);
4936 
4937 	for_each_set_bit(cache_type,
4938 			 (unsigned long *)&inv_info->cache,
4939 			 IOMMU_CACHE_INV_TYPE_NR) {
4940 		int granu = 0;
4941 		u64 pasid = 0;
4942 		u64 addr = 0;
4943 
4944 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4945 		if (granu == -EINVAL) {
4946 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4947 					   cache_type, inv_info->granularity);
4948 			break;
4949 		}
4950 
4951 		/*
4952 		 * PASID is stored in different locations based on the
4953 		 * granularity.
4954 		 */
4955 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4956 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4957 			pasid = inv_info->granu.pasid_info.pasid;
4958 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4959 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4960 			pasid = inv_info->granu.addr_info.pasid;
4961 
4962 		switch (BIT(cache_type)) {
4963 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4964 			/* HW will ignore LSB bits based on address mask */
4965 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4966 			    size &&
4967 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4968 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4969 						   inv_info->granu.addr_info.addr, size);
4970 			}
4971 
4972 			/*
4973 			 * If granu is PASID-selective, address is ignored.
4974 			 * We use npages = -1 to indicate that.
4975 			 */
4976 			qi_flush_piotlb(iommu, did, pasid,
4977 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4978 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4979 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4980 
4981 			if (!info->ats_enabled)
4982 				break;
4983 			/*
4984 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4985 			 * in the guest may assume IOTLB flush is inclusive,
4986 			 * which is more efficient.
4987 			 */
4988 			fallthrough;
4989 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4990 			/*
4991 			 * PASID based device TLB invalidation does not support
4992 			 * IOMMU_INV_GRANU_PASID granularity but only supports
4993 			 * IOMMU_INV_GRANU_ADDR.
4994 			 * The equivalent is to set the size to cover the entire
4995 			 * 64-bit address range. The user only provides PASID info
4996 			 * without address info, so we set addr to 0.
4997 			 */
4998 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4999 				size = 64 - VTD_PAGE_SHIFT;
5000 				addr = 0;
5001 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5002 				addr = inv_info->granu.addr_info.addr;
5003 			}
5004 
5005 			if (info->ats_enabled)
5006 				qi_flush_dev_iotlb_pasid(iommu, sid,
5007 						info->pfsid, pasid,
5008 						info->ats_qdep, addr,
5009 						size);
5010 			else
5011 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5012 			break;
5013 		default:
5014 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5015 					    cache_type);
5016 			ret = -EINVAL;
5017 		}
5018 	}
5019 out_unlock:
5020 	spin_unlock(&iommu->lock);
5021 	spin_unlock_irqrestore(&device_domain_lock, flags);
5022 
5023 	return ret;
5024 }
5025 #endif
5026 
5027 static int intel_iommu_map(struct iommu_domain *domain,
5028 			   unsigned long iova, phys_addr_t hpa,
5029 			   size_t size, int iommu_prot, gfp_t gfp)
5030 {
5031 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5032 	u64 max_addr;
5033 	int prot = 0;
5034 
5035 	if (iommu_prot & IOMMU_READ)
5036 		prot |= DMA_PTE_READ;
5037 	if (iommu_prot & IOMMU_WRITE)
5038 		prot |= DMA_PTE_WRITE;
5039 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5040 		prot |= DMA_PTE_SNP;
5041 
5042 	max_addr = iova + size;
5043 	if (dmar_domain->max_addr < max_addr) {
5044 		u64 end;
5045 
5046 		/* check if minimum agaw is sufficient for mapped address */
5047 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5048 		if (end < max_addr) {
5049 			pr_err("%s: iommu width (%d) is not "
5050 			       "sufficient for the mapped address (%llx)\n",
5051 			       __func__, dmar_domain->gaw, max_addr);
5052 			return -EFAULT;
5053 		}
5054 		dmar_domain->max_addr = max_addr;
5055 	}
5056 	/* Round up size to next multiple of PAGE_SIZE, if it and
5057 	   the low bits of hpa would take us onto the next page */
5058 	size = aligned_nrpages(hpa, size);
5059 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5060 				hpa >> VTD_PAGE_SHIFT, size, prot);
5061 }
5062 
5063 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5064 				unsigned long iova, size_t size,
5065 				struct iommu_iotlb_gather *gather)
5066 {
5067 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5068 	unsigned long start_pfn, last_pfn;
5069 	int level = 0;
5070 
5071 	/* Cope with horrid API which requires us to unmap more than the
5072 	   size argument if it happens to be a large-page mapping. */
5073 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5074 
5075 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5076 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5077 
5078 	start_pfn = iova >> VTD_PAGE_SHIFT;
5079 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5080 
5081 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5082 					last_pfn, gather->freelist);
5083 
5084 	if (dmar_domain->max_addr == iova + size)
5085 		dmar_domain->max_addr = iova;
5086 
5087 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5088 
5089 	return size;
5090 }
5091 
5092 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5093 				 struct iommu_iotlb_gather *gather)
5094 {
5095 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5096 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5097 	size_t size = gather->end - gather->start;
5098 	unsigned long start_pfn;
5099 	unsigned long nrpages;
5100 	int iommu_id;
5101 
5102 	nrpages = aligned_nrpages(gather->start, size);
5103 	start_pfn = mm_to_dma_pfn(iova_pfn);
5104 
5105 	for_each_domain_iommu(iommu_id, dmar_domain)
5106 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5107 				      start_pfn, nrpages, !gather->freelist, 0);
5108 
5109 	dma_free_pagelist(gather->freelist);
5110 }
5111 
5112 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5113 					    dma_addr_t iova)
5114 {
5115 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5116 	struct dma_pte *pte;
5117 	int level = 0;
5118 	u64 phys = 0;
5119 
5120 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5121 	if (pte && dma_pte_present(pte))
5122 		phys = dma_pte_addr(pte) +
5123 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5124 						VTD_PAGE_SHIFT) - 1));
5125 
5126 	return phys;
5127 }
5128 
5129 static bool intel_iommu_capable(enum iommu_cap cap)
5130 {
5131 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5132 		return domain_update_iommu_snooping(NULL) == 1;
5133 	if (cap == IOMMU_CAP_INTR_REMAP)
5134 		return irq_remapping_enabled == 1;
5135 
5136 	return false;
5137 }
5138 
5139 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5140 {
5141 	struct intel_iommu *iommu;
5142 
5143 	iommu = device_to_iommu(dev, NULL, NULL);
5144 	if (!iommu)
5145 		return ERR_PTR(-ENODEV);
5146 
5147 	if (translation_pre_enabled(iommu))
5148 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5149 
5150 	return &iommu->iommu;
5151 }
5152 
5153 static void intel_iommu_release_device(struct device *dev)
5154 {
5155 	struct intel_iommu *iommu;
5156 
5157 	iommu = device_to_iommu(dev, NULL, NULL);
5158 	if (!iommu)
5159 		return;
5160 
5161 	dmar_remove_one_dev_info(dev);
5162 
5163 	set_dma_ops(dev, NULL);
5164 }
5165 
5166 static void intel_iommu_probe_finalize(struct device *dev)
5167 {
5168 	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5169 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5170 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5171 
5172 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5173 		iommu_setup_dma_ops(dev, base,
5174 				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5175 	else
5176 		set_dma_ops(dev, NULL);
5177 }
5178 
5179 static void intel_iommu_get_resv_regions(struct device *device,
5180 					 struct list_head *head)
5181 {
5182 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5183 	struct iommu_resv_region *reg;
5184 	struct dmar_rmrr_unit *rmrr;
5185 	struct device *i_dev;
5186 	int i;
5187 
5188 	down_read(&dmar_global_lock);
5189 	for_each_rmrr_units(rmrr) {
5190 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5191 					  i, i_dev) {
5192 			struct iommu_resv_region *resv;
5193 			enum iommu_resv_type type;
5194 			size_t length;
5195 
5196 			if (i_dev != device &&
5197 			    !is_downstream_to_pci_bridge(device, i_dev))
5198 				continue;
5199 
5200 			length = rmrr->end_address - rmrr->base_address + 1;
5201 
5202 			type = device_rmrr_is_relaxable(device) ?
5203 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5204 
5205 			resv = iommu_alloc_resv_region(rmrr->base_address,
5206 						       length, prot, type);
5207 			if (!resv)
5208 				break;
5209 
5210 			list_add_tail(&resv->list, head);
5211 		}
5212 	}
5213 	up_read(&dmar_global_lock);
5214 
5215 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5216 	if (dev_is_pci(device)) {
5217 		struct pci_dev *pdev = to_pci_dev(device);
5218 
5219 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5220 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5221 						   IOMMU_RESV_DIRECT_RELAXABLE);
5222 			if (reg)
5223 				list_add_tail(&reg->list, head);
5224 		}
5225 	}
5226 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5227 
5228 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5229 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5230 				      0, IOMMU_RESV_MSI);
5231 	if (!reg)
5232 		return;
5233 	list_add_tail(&reg->list, head);
5234 }
5235 
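/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache if the entry changes) and turn on
 * the device's PASID/ATS capabilities via iommu_enable_dev_iotlb().
 */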
5236 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5237 {
5238 	struct device_domain_info *info;
5239 	struct context_entry *context;
5240 	struct dmar_domain *domain;
5241 	unsigned long flags;
5242 	u64 ctx_lo;
5243 	int ret;
5244 
5245 	domain = find_domain(dev);
5246 	if (!domain)
5247 		return -EINVAL;
5248 
5249 	spin_lock_irqsave(&device_domain_lock, flags);
5250 	spin_lock(&iommu->lock);
5251 
5252 	ret = -EINVAL;
5253 	info = get_domain_info(dev);
5254 	if (!info || !info->pasid_supported)
5255 		goto out;
5256 
5257 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5258 	if (WARN_ON(!context))
5259 		goto out;
5260 
5261 	ctx_lo = context[0].lo;
5262 
5263 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5264 		ctx_lo |= CONTEXT_PASIDE;
5265 		context[0].lo = ctx_lo;
5266 		wmb();
5267 		iommu->flush.flush_context(iommu,
5268 					   domain->iommu_did[iommu->seq_id],
5269 					   PCI_DEVID(info->bus, info->devfn),
5270 					   DMA_CCMD_MASK_NOBIT,
5271 					   DMA_CCMD_DEVICE_INVL);
5272 	}
5273 
5274 	/* Enable PASID support in the device, if it wasn't already */
5275 	if (!info->pasid_enabled)
5276 		iommu_enable_dev_iotlb(info);
5277 
5278 	ret = 0;
5279 
5280  out:
5281 	spin_unlock(&iommu->lock);
5282 	spin_unlock_irqrestore(&device_domain_lock, flags);
5283 
5284 	return ret;
5285 }
5286 
5287 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5288 {
5289 	if (dev_is_pci(dev))
5290 		return pci_device_group(dev);
5291 	return generic_device_group(dev);
5292 }
5293 
5294 static int intel_iommu_enable_auxd(struct device *dev)
5295 {
5296 	struct device_domain_info *info;
5297 	struct intel_iommu *iommu;
5298 	unsigned long flags;
5299 	int ret;
5300 
5301 	iommu = device_to_iommu(dev, NULL, NULL);
5302 	if (!iommu || dmar_disabled)
5303 		return -EINVAL;
5304 
5305 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5306 		return -EINVAL;
5307 
5308 	ret = intel_iommu_enable_pasid(iommu, dev);
5309 	if (ret)
5310 		return -ENODEV;
5311 
5312 	spin_lock_irqsave(&device_domain_lock, flags);
5313 	info = get_domain_info(dev);
5314 	info->auxd_enabled = 1;
5315 	spin_unlock_irqrestore(&device_domain_lock, flags);
5316 
5317 	return 0;
5318 }
5319 
5320 static int intel_iommu_disable_auxd(struct device *dev)
5321 {
5322 	struct device_domain_info *info;
5323 	unsigned long flags;
5324 
5325 	spin_lock_irqsave(&device_domain_lock, flags);
5326 	info = get_domain_info(dev);
5327 	if (!WARN_ON(!info))
5328 		info->auxd_enabled = 0;
5329 	spin_unlock_irqrestore(&device_domain_lock, flags);
5330 
5331 	return 0;
5332 }
5333 
5334 /*
5335  * A PCI Express Designated Vendor-Specific Extended Capability is defined
5336  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5337  * for system software and tools to detect endpoint devices supporting
5338  * Intel Scalable I/O Virtualization without a host driver dependency.
5339  *
5340  * Returns the address of the matching extended capability structure within
5341  * the device's PCI configuration space or 0 if the device does not support
5342  * it.
5343  */
5344 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5345 {
5346 	int pos;
5347 	u16 vendor, id;
5348 
5349 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5350 	while (pos) {
5351 		pci_read_config_word(pdev, pos + 4, &vendor);
5352 		pci_read_config_word(pdev, pos + 8, &id);
5353 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5354 			return pos;
5355 
5356 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5357 	}
5358 
5359 	return 0;
5360 }
5361 
5362 static bool
5363 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5364 {
5365 	struct device_domain_info *info = get_domain_info(dev);
5366 
5367 	if (feat == IOMMU_DEV_FEAT_AUX) {
5368 		int ret;
5369 
5370 		if (!dev_is_pci(dev) || dmar_disabled ||
5371 		    !scalable_mode_support() || !pasid_mode_support())
5372 			return false;
5373 
5374 		ret = pci_pasid_features(to_pci_dev(dev));
5375 		if (ret < 0)
5376 			return false;
5377 
5378 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5379 	}
5380 
5381 	if (feat == IOMMU_DEV_FEAT_IOPF)
5382 		return info && info->pri_supported;
5383 
5384 	if (feat == IOMMU_DEV_FEAT_SVA)
5385 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5386 			info->pasid_supported && info->pri_supported &&
5387 			info->ats_supported;
5388 
5389 	return false;
5390 }
5391 
5392 static int
5393 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5394 {
5395 	if (feat == IOMMU_DEV_FEAT_AUX)
5396 		return intel_iommu_enable_auxd(dev);
5397 
5398 	if (feat == IOMMU_DEV_FEAT_IOPF)
5399 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5400 
5401 	if (feat == IOMMU_DEV_FEAT_SVA) {
5402 		struct device_domain_info *info = get_domain_info(dev);
5403 
5404 		if (!info)
5405 			return -EINVAL;
5406 
5407 		if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5408 			return -EINVAL;
5409 
5410 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5411 			return 0;
5412 	}
5413 
5414 	return -ENODEV;
5415 }
5416 
5417 static int
5418 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5419 {
5420 	if (feat == IOMMU_DEV_FEAT_AUX)
5421 		return intel_iommu_disable_auxd(dev);
5422 
5423 	return -ENODEV;
5424 }
5425 
5426 static bool
5427 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5428 {
5429 	struct device_domain_info *info = get_domain_info(dev);
5430 
5431 	if (feat == IOMMU_DEV_FEAT_AUX)
5432 		return scalable_mode_support() && info && info->auxd_enabled;
5433 
5434 	return false;
5435 }
5436 
5437 static int
5438 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5439 {
5440 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5441 
5442 	return dmar_domain->default_pasid > 0 ?
5443 			dmar_domain->default_pasid : -EINVAL;
5444 }
5445 
5446 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5447 					   struct device *dev)
5448 {
5449 	return attach_deferred(dev);
5450 }
5451 
5452 static int
5453 intel_iommu_enable_nesting(struct iommu_domain *domain)
5454 {
5455 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5456 	unsigned long flags;
5457 	int ret = -ENODEV;
5458 
5459 	spin_lock_irqsave(&device_domain_lock, flags);
5460 	if (nested_mode_support() && list_empty(&dmar_domain->devices)) {
5461 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5462 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5463 		ret = 0;
5464 	}
5465 	spin_unlock_irqrestore(&device_domain_lock, flags);
5466 
5467 	return ret;
5468 }
5469 
5470 /*
5471  * Check that the device does not live on an external facing PCI port that is
5472  * marked as untrusted. Such devices should not be able to apply quirks and
5473  * thus not be able to bypass the IOMMU restrictions.
5474  */
5475 static bool risky_device(struct pci_dev *pdev)
5476 {
5477 	if (pdev->untrusted) {
5478 		pci_info(pdev,
5479 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5480 			 pdev->vendor, pdev->device);
5481 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5482 		return true;
5483 	}
5484 	return false;
5485 }
5486 
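/*
 * For IOMMUs whose page-table walks are not cache coherent, flush the
 * CPU cache lines covering the PTEs that map
 * [clf_pfn, clf_pfn + clf_pages) so the hardware sees the new entries.
 */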
5487 static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
5488 			     unsigned long clf_pages)
5489 {
5490 	struct dma_pte *first_pte = NULL, *pte = NULL;
5491 	unsigned long lvl_pages = 0;
5492 	int level = 0;
5493 
5494 	while (clf_pages > 0) {
5495 		if (!pte) {
5496 			level = 0;
5497 			pte = pfn_to_dma_pte(domain, clf_pfn, &level);
5498 			if (WARN_ON(!pte))
5499 				return;
5500 			first_pte = pte;
5501 			lvl_pages = lvl_to_nr_pages(level);
5502 		}
5503 
5504 		if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
5505 			return;
5506 
5507 		clf_pages -= lvl_pages;
5508 		clf_pfn += lvl_pages;
5509 		pte++;
5510 
5511 		if (!clf_pages || first_pte_in_page(pte) ||
5512 		    (level > 1 && clf_pages < lvl_pages)) {
5513 			domain_flush_cache(domain, first_pte,
5514 					   (void *)pte - (void *)first_pte);
5515 			pte = NULL;
5516 		}
5517 	}
5518 }
5519 
5520 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5521 				       unsigned long iova, size_t size)
5522 {
5523 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524 	unsigned long pages = aligned_nrpages(iova, size);
5525 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5526 	struct intel_iommu *iommu;
5527 	int iommu_id;
5528 
5529 	if (!dmar_domain->iommu_coherency)
5530 		clflush_sync_map(dmar_domain, pfn, pages);
5531 
5532 	for_each_domain_iommu(iommu_id, dmar_domain) {
5533 		iommu = g_iommus[iommu_id];
5534 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5535 	}
5536 }
5537 
5538 const struct iommu_ops intel_iommu_ops = {
5539 	.capable		= intel_iommu_capable,
5540 	.domain_alloc		= intel_iommu_domain_alloc,
5541 	.domain_free		= intel_iommu_domain_free,
5542 	.enable_nesting		= intel_iommu_enable_nesting,
5543 	.attach_dev		= intel_iommu_attach_device,
5544 	.detach_dev		= intel_iommu_detach_device,
5545 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5546 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5547 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5548 	.map			= intel_iommu_map,
5549 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5550 	.unmap			= intel_iommu_unmap,
5551 	.flush_iotlb_all        = intel_flush_iotlb_all,
5552 	.iotlb_sync		= intel_iommu_tlb_sync,
5553 	.iova_to_phys		= intel_iommu_iova_to_phys,
5554 	.probe_device		= intel_iommu_probe_device,
5555 	.probe_finalize		= intel_iommu_probe_finalize,
5556 	.release_device		= intel_iommu_release_device,
5557 	.get_resv_regions	= intel_iommu_get_resv_regions,
5558 	.put_resv_regions	= generic_iommu_put_resv_regions,
5559 	.device_group		= intel_iommu_device_group,
5560 	.dev_has_feat		= intel_iommu_dev_has_feat,
5561 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5562 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5563 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5564 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5565 	.def_domain_type	= device_def_domain_type,
5566 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5567 #ifdef CONFIG_INTEL_IOMMU_SVM
5568 	.cache_invalidate	= intel_iommu_sva_invalidate,
5569 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5570 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5571 	.sva_bind		= intel_svm_bind,
5572 	.sva_unbind		= intel_svm_unbind,
5573 	.sva_get_pasid		= intel_svm_get_pasid,
5574 	.page_response		= intel_svm_page_response,
5575 #endif
5576 };
5577 
5578 static void quirk_iommu_igfx(struct pci_dev *dev)
5579 {
5580 	if (risky_device(dev))
5581 		return;
5582 
5583 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5584 	dmar_map_gfx = 0;
5585 }
5586 
5587 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5588 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5589 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5590 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5591 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5592 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5593 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5594 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5595 
5596 /* Broadwell igfx malfunctions with dmar */
5597 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5598 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5599 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5600 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5601 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5602 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5603 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5604 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5605 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5606 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5607 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5608 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5609 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5610 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5611 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5612 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5613 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5614 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5615 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5616 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5617 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5618 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5619 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5620 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5621 
5622 static void quirk_iommu_rwbf(struct pci_dev *dev)
5623 {
5624 	if (risky_device(dev))
5625 		return;
5626 
5627 	/*
5628 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5629 	 * but needs it. Same seems to hold for the desktop versions.
5630 	 */
5631 	pci_info(dev, "Forcing write-buffer flush capability\n");
5632 	rwbf_quirk = 1;
5633 }
5634 
5635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5642 
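/*
 * GGC is the GMCH Graphics Control register in the host bridge's PCI
 * config space (read from offset 0x52 below). Bits 11:8 report how much
 * stolen memory the BIOS set aside for the graphics GTT; as the names and
 * the quirk message below suggest, the *_VT encodings indicate that a
 * separate shadow GTT was also allocated for VT-d use.
 */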
5643 #define GGC 0x52
5644 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5645 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5646 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5647 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5648 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5649 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5650 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5651 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5652 
5653 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5654 {
5655 	unsigned short ggc;
5656 
5657 	if (risky_device(dev))
5658 		return;
5659 
5660 	if (pci_read_config_word(dev, GGC, &ggc))
5661 		return;
5662 
5663 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5664 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5665 		dmar_map_gfx = 0;
5666 	} else if (dmar_map_gfx) {
5667 		/* we have to ensure the gfx device is idle before we flush */
5668 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5669 		intel_iommu_strict = 1;
5670 	}
5671 }
5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5676 
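/*
 * On some platforms the graphics-dedicated DMAR unit has been reported
 * never to complete a translation-disable (TE) transition, leaving the
 * kernel stuck waiting on the status bit at shutdown or kexec time. For
 * the integrated graphics device IDs matched below, leave translation
 * enabled (iommu_skip_te_disable) instead of disabling it.
 */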
5677 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5678 {
5679 	unsigned short ver;
5680 
5681 	if (!IS_GFX_DEVICE(dev))
5682 		return;
5683 
5684 	ver = (dev->device >> 8) & 0xff;
5685 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5686 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5687 	    ver != 0x9a)
5688 		return;
5689 
5690 	if (risky_device(dev))
5691 		return;
5692 
5693 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5694 	iommu_skip_te_disable = 1;
5695 }
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5697 
5698 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5699  * ISOCH DMAR unit for the Azalia sound device, but not give it any
5700  * TLB entries, which causes it to deadlock. Check for that.  We do
5701  * this in a function called from init_dmars(), instead of in a PCI
5702  * quirk, because we don't want to print the obnoxious "BIOS broken"
5703  * message if VT-d is actually disabled.
5704  */
5705 static void __init check_tylersburg_isoch(void)
5706 {
5707 	struct pci_dev *pdev;
5708 	uint32_t vtisochctrl;
5709 
5710 	/* If there's no Azalia in the system anyway, forget it. */
5711 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5712 	if (!pdev)
5713 		return;
5714 
5715 	if (risky_device(pdev)) {
5716 		pci_dev_put(pdev);
5717 		return;
5718 	}
5719 
5720 	pci_dev_put(pdev);
5721 
5722 	/* System Management Registers. Might be hidden, in which case
5723 	 * we can't do the sanity check. But that's OK, because the
5724 	 * known-broken BIOSes _don't_ actually hide it, so far. */
5725 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5726 	if (!pdev)
5727 		return;
5728 
5729 	if (risky_device(pdev)) {
5730 		pci_dev_put(pdev);
5731 		return;
5732 	}
5733 
5734 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5735 		pci_dev_put(pdev);
5736 		return;
5737 	}
5738 
5739 	pci_dev_put(pdev);
5740 
5741 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5742 	if (vtisochctrl & 1)
5743 		return;
5744 
5745 	/* Drop all bits other than the number of TLB entries */
5746 	vtisochctrl &= 0x1c;
5747 
5748 	/* If we have the recommended number of TLB entries (16), fine. */
5749 	if (vtisochctrl == 0x10)
5750 		return;
5751 
5752 	/* No TLB entries at all is simply broken; warn and identity-map Azalia. */
5753 	if (!vtisochctrl) {
5754 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5755 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5756 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5757 		     dmi_get_system_info(DMI_BIOS_VERSION),
5758 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5759 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5760 		return;
5761 	}
5762 
5763 	pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5764 	       vtisochctrl);
5765 }
5766