xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision d15cb3da)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/intel-svm.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 
49 #include "../irq_remapping.h"
50 #include "../iommu-sva-lib.h"
51 #include "pasid.h"
52 #include "cap_audit.h"
53 
54 #define ROOT_SIZE		VTD_PAGE_SIZE
55 #define CONTEXT_SIZE		VTD_PAGE_SIZE
56 
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 
62 #define IOAPIC_RANGE_START	(0xfee00000)
63 #define IOAPIC_RANGE_END	(0xfeefffff)
64 #define IOVA_START_ADDR		(0x1000)
65 
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
77 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN		(1)
82 
83 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
84 
85 /* page table handling */
86 #define LEVEL_STRIDE		(9)
87 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
88 
89 static inline int agaw_to_level(int agaw)
90 {
91 	return agaw + 2;
92 }
93 
94 static inline int agaw_to_width(int agaw)
95 {
96 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
97 }
98 
99 static inline int width_to_agaw(int width)
100 {
101 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
102 }
103 
104 static inline unsigned int level_to_offset_bits(int level)
105 {
106 	return (level - 1) * LEVEL_STRIDE;
107 }
108 
109 static inline int pfn_level_offset(u64 pfn, int level)
110 {
111 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
112 }
113 
114 static inline u64 level_mask(int level)
115 {
116 	return -1ULL << level_to_offset_bits(level);
117 }
118 
119 static inline u64 level_size(int level)
120 {
121 	return 1ULL << level_to_offset_bits(level);
122 }
123 
124 static inline u64 align_to_level(u64 pfn, int level)
125 {
126 	return (pfn + level_size(level) - 1) & level_mask(level);
127 }
128 
129 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
130 {
131 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
132 }
133 
134 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
135    are never going to work. */
136 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
137 {
138 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
139 }
140 static inline unsigned long page_to_dma_pfn(struct page *pg)
141 {
142 	return mm_to_dma_pfn(page_to_pfn(pg));
143 }
144 static inline unsigned long virt_to_dma_pfn(void *p)
145 {
146 	return page_to_dma_pfn(virt_to_page(p));
147 }
148 
149 /* global iommu list, set NULL for ignored DMAR units */
150 static struct intel_iommu **g_iommus;
151 
152 static void __init check_tylersburg_isoch(void);
153 static int rwbf_quirk;
154 static inline struct device_domain_info *
155 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
156 
157 /*
158  * set to 1 to panic kernel if can't successfully enable VT-d
159  * (used when kernel is launched w/ TXT)
160  */
161 static int force_on = 0;
162 static int intel_iommu_tboot_noforce;
163 static int no_platform_optin;
164 
165 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
166 
167 /*
168  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
169  * if marked present.
170  */
171 static phys_addr_t root_entry_lctp(struct root_entry *re)
172 {
173 	if (!(re->lo & 1))
174 		return 0;
175 
176 	return re->lo & VTD_PAGE_MASK;
177 }
178 
179 /*
180  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
181  * if marked present.
182  */
183 static phys_addr_t root_entry_uctp(struct root_entry *re)
184 {
185 	if (!(re->hi & 1))
186 		return 0;
187 
188 	return re->hi & VTD_PAGE_MASK;
189 }
190 
191 static inline void context_clear_pasid_enable(struct context_entry *context)
192 {
193 	context->lo &= ~(1ULL << 11);
194 }
195 
196 static inline bool context_pasid_enabled(struct context_entry *context)
197 {
198 	return !!(context->lo & (1ULL << 11));
199 }
200 
201 static inline void context_set_copied(struct context_entry *context)
202 {
203 	context->hi |= (1ull << 3);
204 }
205 
206 static inline bool context_copied(struct context_entry *context)
207 {
208 	return !!(context->hi & (1ULL << 3));
209 }
210 
211 static inline bool __context_present(struct context_entry *context)
212 {
213 	return (context->lo & 1);
214 }
215 
216 bool context_present(struct context_entry *context)
217 {
218 	return context_pasid_enabled(context) ?
219 	     __context_present(context) :
220 	     __context_present(context) && !context_copied(context);
221 }
222 
223 static inline void context_set_present(struct context_entry *context)
224 {
225 	context->lo |= 1;
226 }
227 
228 static inline void context_set_fault_enable(struct context_entry *context)
229 {
230 	context->lo &= (((u64)-1) << 2) | 1;
231 }
232 
233 static inline void context_set_translation_type(struct context_entry *context,
234 						unsigned long value)
235 {
236 	context->lo &= (((u64)-1) << 4) | 3;
237 	context->lo |= (value & 3) << 2;
238 }
239 
240 static inline void context_set_address_root(struct context_entry *context,
241 					    unsigned long value)
242 {
243 	context->lo &= ~VTD_PAGE_MASK;
244 	context->lo |= value & VTD_PAGE_MASK;
245 }
246 
247 static inline void context_set_address_width(struct context_entry *context,
248 					     unsigned long value)
249 {
250 	context->hi |= value & 7;
251 }
252 
253 static inline void context_set_domain_id(struct context_entry *context,
254 					 unsigned long value)
255 {
256 	context->hi |= (value & ((1 << 16) - 1)) << 8;
257 }
258 
259 static inline int context_domain_id(struct context_entry *c)
260 {
261 	return((c->hi >> 8) & 0xffff);
262 }
263 
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266 	context->lo = 0;
267 	context->hi = 0;
268 }
269 
270 /*
271  * This domain is a statically identity mapping domain.
272  *	1. This domain creats a static 1:1 mapping to all usable memory.
273  * 	2. It maps to each iommu if successful.
274  *	3. Each iommu mapps to this domain if successful.
275  */
276 static struct dmar_domain *si_domain;
277 static int hw_pass_through = 1;
278 
279 #define for_each_domain_iommu(idx, domain)			\
280 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
281 		if (domain->iommu_refcnt[idx])
282 
283 struct dmar_rmrr_unit {
284 	struct list_head list;		/* list of rmrr units	*/
285 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
286 	u64	base_address;		/* reserved base address*/
287 	u64	end_address;		/* reserved end address */
288 	struct dmar_dev_scope *devices;	/* target devices */
289 	int	devices_cnt;		/* target device count */
290 };
291 
292 struct dmar_atsr_unit {
293 	struct list_head list;		/* list of ATSR units */
294 	struct acpi_dmar_header *hdr;	/* ACPI header */
295 	struct dmar_dev_scope *devices;	/* target devices */
296 	int devices_cnt;		/* target device count */
297 	u8 include_all:1;		/* include all ports */
298 };
299 
300 struct dmar_satc_unit {
301 	struct list_head list;		/* list of SATC units */
302 	struct acpi_dmar_header *hdr;	/* ACPI header */
303 	struct dmar_dev_scope *devices;	/* target devices */
304 	struct intel_iommu *iommu;	/* the corresponding iommu */
305 	int devices_cnt;		/* target device count */
306 	u8 atc_required:1;		/* ATS is required */
307 };
308 
309 static LIST_HEAD(dmar_atsr_units);
310 static LIST_HEAD(dmar_rmrr_units);
311 static LIST_HEAD(dmar_satc_units);
312 
313 #define for_each_rmrr_units(rmrr) \
314 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
315 
316 /* bitmap for indexing intel_iommus */
317 static int g_num_of_iommus;
318 
319 static void domain_exit(struct dmar_domain *domain);
320 static void domain_remove_dev_info(struct dmar_domain *domain);
321 static void dmar_remove_one_dev_info(struct device *dev);
322 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
323 static int intel_iommu_attach_device(struct iommu_domain *domain,
324 				     struct device *dev);
325 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
326 					    dma_addr_t iova);
327 
328 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
329 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
330 
331 int intel_iommu_enabled = 0;
332 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
333 
334 static int dmar_map_gfx = 1;
335 static int intel_iommu_superpage = 1;
336 static int iommu_identity_mapping;
337 static int iommu_skip_te_disable;
338 
339 #define IDENTMAP_GFX		2
340 #define IDENTMAP_AZALIA		4
341 
342 int intel_iommu_gfx_mapped;
343 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
344 
345 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
346 struct device_domain_info *get_domain_info(struct device *dev)
347 {
348 	struct device_domain_info *info;
349 
350 	if (!dev)
351 		return NULL;
352 
353 	info = dev_iommu_priv_get(dev);
354 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
355 		return NULL;
356 
357 	return info;
358 }
359 
360 DEFINE_SPINLOCK(device_domain_lock);
361 static LIST_HEAD(device_domain_list);
362 
363 /*
364  * Iterate over elements in device_domain_list and call the specified
365  * callback @fn against each element.
366  */
367 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
368 				     void *data), void *data)
369 {
370 	int ret = 0;
371 	unsigned long flags;
372 	struct device_domain_info *info;
373 
374 	spin_lock_irqsave(&device_domain_lock, flags);
375 	list_for_each_entry(info, &device_domain_list, global) {
376 		ret = fn(info, data);
377 		if (ret) {
378 			spin_unlock_irqrestore(&device_domain_lock, flags);
379 			return ret;
380 		}
381 	}
382 	spin_unlock_irqrestore(&device_domain_lock, flags);
383 
384 	return 0;
385 }
386 
387 const struct iommu_ops intel_iommu_ops;
388 
389 static bool translation_pre_enabled(struct intel_iommu *iommu)
390 {
391 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
392 }
393 
394 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
395 {
396 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
397 }
398 
399 static void init_translation_status(struct intel_iommu *iommu)
400 {
401 	u32 gsts;
402 
403 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
404 	if (gsts & DMA_GSTS_TES)
405 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
406 }
407 
408 static int __init intel_iommu_setup(char *str)
409 {
410 	if (!str)
411 		return -EINVAL;
412 
413 	while (*str) {
414 		if (!strncmp(str, "on", 2)) {
415 			dmar_disabled = 0;
416 			pr_info("IOMMU enabled\n");
417 		} else if (!strncmp(str, "off", 3)) {
418 			dmar_disabled = 1;
419 			no_platform_optin = 1;
420 			pr_info("IOMMU disabled\n");
421 		} else if (!strncmp(str, "igfx_off", 8)) {
422 			dmar_map_gfx = 0;
423 			pr_info("Disable GFX device mapping\n");
424 		} else if (!strncmp(str, "forcedac", 8)) {
425 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
426 			iommu_dma_forcedac = true;
427 		} else if (!strncmp(str, "strict", 6)) {
428 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
429 			iommu_set_dma_strict();
430 		} else if (!strncmp(str, "sp_off", 6)) {
431 			pr_info("Disable supported super page\n");
432 			intel_iommu_superpage = 0;
433 		} else if (!strncmp(str, "sm_on", 5)) {
434 			pr_info("Enable scalable mode if hardware supports\n");
435 			intel_iommu_sm = 1;
436 		} else if (!strncmp(str, "sm_off", 6)) {
437 			pr_info("Scalable mode is disallowed\n");
438 			intel_iommu_sm = 0;
439 		} else if (!strncmp(str, "tboot_noforce", 13)) {
440 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
441 			intel_iommu_tboot_noforce = 1;
442 		} else {
443 			pr_notice("Unknown option - '%s'\n", str);
444 		}
445 
446 		str += strcspn(str, ",");
447 		while (*str == ',')
448 			str++;
449 	}
450 
451 	return 1;
452 }
453 __setup("intel_iommu=", intel_iommu_setup);
454 
455 static struct kmem_cache *iommu_domain_cache;
456 static struct kmem_cache *iommu_devinfo_cache;
457 
458 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
459 {
460 	struct dmar_domain **domains;
461 	int idx = did >> 8;
462 
463 	domains = iommu->domains[idx];
464 	if (!domains)
465 		return NULL;
466 
467 	return domains[did & 0xff];
468 }
469 
470 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
471 			     struct dmar_domain *domain)
472 {
473 	struct dmar_domain **domains;
474 	int idx = did >> 8;
475 
476 	if (!iommu->domains[idx]) {
477 		size_t size = 256 * sizeof(struct dmar_domain *);
478 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
479 	}
480 
481 	domains = iommu->domains[idx];
482 	if (WARN_ON(!domains))
483 		return;
484 	else
485 		domains[did & 0xff] = domain;
486 }
487 
488 void *alloc_pgtable_page(int node)
489 {
490 	struct page *page;
491 	void *vaddr = NULL;
492 
493 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
494 	if (page)
495 		vaddr = page_address(page);
496 	return vaddr;
497 }
498 
499 void free_pgtable_page(void *vaddr)
500 {
501 	free_page((unsigned long)vaddr);
502 }
503 
504 static inline void *alloc_domain_mem(void)
505 {
506 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
507 }
508 
509 static void free_domain_mem(void *vaddr)
510 {
511 	kmem_cache_free(iommu_domain_cache, vaddr);
512 }
513 
514 static inline void * alloc_devinfo_mem(void)
515 {
516 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
517 }
518 
519 static inline void free_devinfo_mem(void *vaddr)
520 {
521 	kmem_cache_free(iommu_devinfo_cache, vaddr);
522 }
523 
524 static inline int domain_type_is_si(struct dmar_domain *domain)
525 {
526 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
527 }
528 
529 static inline bool domain_use_first_level(struct dmar_domain *domain)
530 {
531 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
532 }
533 
534 static inline int domain_pfn_supported(struct dmar_domain *domain,
535 				       unsigned long pfn)
536 {
537 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
538 
539 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
540 }
541 
542 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
543 {
544 	unsigned long sagaw;
545 	int agaw;
546 
547 	sagaw = cap_sagaw(iommu->cap);
548 	for (agaw = width_to_agaw(max_gaw);
549 	     agaw >= 0; agaw--) {
550 		if (test_bit(agaw, &sagaw))
551 			break;
552 	}
553 
554 	return agaw;
555 }
556 
557 /*
558  * Calculate max SAGAW for each iommu.
559  */
560 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
561 {
562 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
563 }
564 
565 /*
566  * calculate agaw for each iommu.
567  * "SAGAW" may be different across iommus, use a default agaw, and
568  * get a supported less agaw for iommus that don't support the default agaw.
569  */
570 int iommu_calculate_agaw(struct intel_iommu *iommu)
571 {
572 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
573 }
574 
575 /* This functionin only returns single iommu in a domain */
576 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
577 {
578 	int iommu_id;
579 
580 	/* si_domain and vm domain should not get here. */
581 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
582 		return NULL;
583 
584 	for_each_domain_iommu(iommu_id, domain)
585 		break;
586 
587 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
588 		return NULL;
589 
590 	return g_iommus[iommu_id];
591 }
592 
593 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
594 {
595 	return sm_supported(iommu) ?
596 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
597 }
598 
599 static void domain_update_iommu_coherency(struct dmar_domain *domain)
600 {
601 	struct dmar_drhd_unit *drhd;
602 	struct intel_iommu *iommu;
603 	bool found = false;
604 	int i;
605 
606 	domain->iommu_coherency = true;
607 
608 	for_each_domain_iommu(i, domain) {
609 		found = true;
610 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
611 			domain->iommu_coherency = false;
612 			break;
613 		}
614 	}
615 	if (found)
616 		return;
617 
618 	/* No hardware attached; use lowest common denominator */
619 	rcu_read_lock();
620 	for_each_active_iommu(iommu, drhd) {
621 		if (!iommu_paging_structure_coherency(iommu)) {
622 			domain->iommu_coherency = false;
623 			break;
624 		}
625 	}
626 	rcu_read_unlock();
627 }
628 
629 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
630 {
631 	struct dmar_drhd_unit *drhd;
632 	struct intel_iommu *iommu;
633 	bool ret = true;
634 
635 	rcu_read_lock();
636 	for_each_active_iommu(iommu, drhd) {
637 		if (iommu != skip) {
638 			/*
639 			 * If the hardware is operating in the scalable mode,
640 			 * the snooping control is always supported since we
641 			 * always set PASID-table-entry.PGSNP bit if the domain
642 			 * is managed outside (UNMANAGED).
643 			 */
644 			if (!sm_supported(iommu) &&
645 			    !ecap_sc_support(iommu->ecap)) {
646 				ret = false;
647 				break;
648 			}
649 		}
650 	}
651 	rcu_read_unlock();
652 
653 	return ret;
654 }
655 
656 static int domain_update_iommu_superpage(struct dmar_domain *domain,
657 					 struct intel_iommu *skip)
658 {
659 	struct dmar_drhd_unit *drhd;
660 	struct intel_iommu *iommu;
661 	int mask = 0x3;
662 
663 	if (!intel_iommu_superpage)
664 		return 0;
665 
666 	/* set iommu_superpage to the smallest common denominator */
667 	rcu_read_lock();
668 	for_each_active_iommu(iommu, drhd) {
669 		if (iommu != skip) {
670 			if (domain && domain_use_first_level(domain)) {
671 				if (!cap_fl1gp_support(iommu->cap))
672 					mask = 0x1;
673 			} else {
674 				mask &= cap_super_page_val(iommu->cap);
675 			}
676 
677 			if (!mask)
678 				break;
679 		}
680 	}
681 	rcu_read_unlock();
682 
683 	return fls(mask);
684 }
685 
686 static int domain_update_device_node(struct dmar_domain *domain)
687 {
688 	struct device_domain_info *info;
689 	int nid = NUMA_NO_NODE;
690 
691 	assert_spin_locked(&device_domain_lock);
692 
693 	if (list_empty(&domain->devices))
694 		return NUMA_NO_NODE;
695 
696 	list_for_each_entry(info, &domain->devices, link) {
697 		if (!info->dev)
698 			continue;
699 
700 		/*
701 		 * There could possibly be multiple device numa nodes as devices
702 		 * within the same domain may sit behind different IOMMUs. There
703 		 * isn't perfect answer in such situation, so we select first
704 		 * come first served policy.
705 		 */
706 		nid = dev_to_node(info->dev);
707 		if (nid != NUMA_NO_NODE)
708 			break;
709 	}
710 
711 	return nid;
712 }
713 
714 static void domain_update_iotlb(struct dmar_domain *domain);
715 
716 /* Return the super pagesize bitmap if supported. */
717 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
718 {
719 	unsigned long bitmap = 0;
720 
721 	/*
722 	 * 1-level super page supports page size of 2MiB, 2-level super page
723 	 * supports page size of both 2MiB and 1GiB.
724 	 */
725 	if (domain->iommu_superpage == 1)
726 		bitmap |= SZ_2M;
727 	else if (domain->iommu_superpage == 2)
728 		bitmap |= SZ_2M | SZ_1G;
729 
730 	return bitmap;
731 }
732 
733 /* Some capabilities may be different across iommus */
734 static void domain_update_iommu_cap(struct dmar_domain *domain)
735 {
736 	domain_update_iommu_coherency(domain);
737 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
738 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
739 
740 	/*
741 	 * If RHSA is missing, we should default to the device numa domain
742 	 * as fall back.
743 	 */
744 	if (domain->nid == NUMA_NO_NODE)
745 		domain->nid = domain_update_device_node(domain);
746 
747 	/*
748 	 * First-level translation restricts the input-address to a
749 	 * canonical address (i.e., address bits 63:N have the same
750 	 * value as address bit [N-1], where N is 48-bits with 4-level
751 	 * paging and 57-bits with 5-level paging). Hence, skip bit
752 	 * [N-1].
753 	 */
754 	if (domain_use_first_level(domain))
755 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
756 	else
757 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
758 
759 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
760 	domain_update_iotlb(domain);
761 }
762 
763 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
764 					 u8 devfn, int alloc)
765 {
766 	struct root_entry *root = &iommu->root_entry[bus];
767 	struct context_entry *context;
768 	u64 *entry;
769 
770 	entry = &root->lo;
771 	if (sm_supported(iommu)) {
772 		if (devfn >= 0x80) {
773 			devfn -= 0x80;
774 			entry = &root->hi;
775 		}
776 		devfn *= 2;
777 	}
778 	if (*entry & 1)
779 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
780 	else {
781 		unsigned long phy_addr;
782 		if (!alloc)
783 			return NULL;
784 
785 		context = alloc_pgtable_page(iommu->node);
786 		if (!context)
787 			return NULL;
788 
789 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
790 		phy_addr = virt_to_phys((void *)context);
791 		*entry = phy_addr | 1;
792 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
793 	}
794 	return &context[devfn];
795 }
796 
797 static bool attach_deferred(struct device *dev)
798 {
799 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
800 }
801 
802 /**
803  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
804  *				 sub-hierarchy of a candidate PCI-PCI bridge
805  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
806  * @bridge: the candidate PCI-PCI bridge
807  *
808  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
809  */
810 static bool
811 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
812 {
813 	struct pci_dev *pdev, *pbridge;
814 
815 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
816 		return false;
817 
818 	pdev = to_pci_dev(dev);
819 	pbridge = to_pci_dev(bridge);
820 
821 	if (pbridge->subordinate &&
822 	    pbridge->subordinate->number <= pdev->bus->number &&
823 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
824 		return true;
825 
826 	return false;
827 }
828 
829 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
830 {
831 	struct dmar_drhd_unit *drhd;
832 	u32 vtbar;
833 	int rc;
834 
835 	/* We know that this device on this chipset has its own IOMMU.
836 	 * If we find it under a different IOMMU, then the BIOS is lying
837 	 * to us. Hope that the IOMMU for this device is actually
838 	 * disabled, and it needs no translation...
839 	 */
840 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
841 	if (rc) {
842 		/* "can't" happen */
843 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
844 		return false;
845 	}
846 	vtbar &= 0xffff0000;
847 
848 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
849 	drhd = dmar_find_matched_drhd_unit(pdev);
850 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
851 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
852 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
853 		return true;
854 	}
855 
856 	return false;
857 }
858 
859 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
860 {
861 	if (!iommu || iommu->drhd->ignored)
862 		return true;
863 
864 	if (dev_is_pci(dev)) {
865 		struct pci_dev *pdev = to_pci_dev(dev);
866 
867 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
868 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
869 		    quirk_ioat_snb_local_iommu(pdev))
870 			return true;
871 	}
872 
873 	return false;
874 }
875 
876 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
877 {
878 	struct dmar_drhd_unit *drhd = NULL;
879 	struct pci_dev *pdev = NULL;
880 	struct intel_iommu *iommu;
881 	struct device *tmp;
882 	u16 segment = 0;
883 	int i;
884 
885 	if (!dev)
886 		return NULL;
887 
888 	if (dev_is_pci(dev)) {
889 		struct pci_dev *pf_pdev;
890 
891 		pdev = pci_real_dma_dev(to_pci_dev(dev));
892 
893 		/* VFs aren't listed in scope tables; we need to look up
894 		 * the PF instead to find the IOMMU. */
895 		pf_pdev = pci_physfn(pdev);
896 		dev = &pf_pdev->dev;
897 		segment = pci_domain_nr(pdev->bus);
898 	} else if (has_acpi_companion(dev))
899 		dev = &ACPI_COMPANION(dev)->dev;
900 
901 	rcu_read_lock();
902 	for_each_iommu(iommu, drhd) {
903 		if (pdev && segment != drhd->segment)
904 			continue;
905 
906 		for_each_active_dev_scope(drhd->devices,
907 					  drhd->devices_cnt, i, tmp) {
908 			if (tmp == dev) {
909 				/* For a VF use its original BDF# not that of the PF
910 				 * which we used for the IOMMU lookup. Strictly speaking
911 				 * we could do this for all PCI devices; we only need to
912 				 * get the BDF# from the scope table for ACPI matches. */
913 				if (pdev && pdev->is_virtfn)
914 					goto got_pdev;
915 
916 				if (bus && devfn) {
917 					*bus = drhd->devices[i].bus;
918 					*devfn = drhd->devices[i].devfn;
919 				}
920 				goto out;
921 			}
922 
923 			if (is_downstream_to_pci_bridge(dev, tmp))
924 				goto got_pdev;
925 		}
926 
927 		if (pdev && drhd->include_all) {
928 		got_pdev:
929 			if (bus && devfn) {
930 				*bus = pdev->bus->number;
931 				*devfn = pdev->devfn;
932 			}
933 			goto out;
934 		}
935 	}
936 	iommu = NULL;
937  out:
938 	if (iommu_is_dummy(iommu, dev))
939 		iommu = NULL;
940 
941 	rcu_read_unlock();
942 
943 	return iommu;
944 }
945 
946 static void domain_flush_cache(struct dmar_domain *domain,
947 			       void *addr, int size)
948 {
949 	if (!domain->iommu_coherency)
950 		clflush_cache_range(addr, size);
951 }
952 
953 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
954 {
955 	struct context_entry *context;
956 	int ret = 0;
957 	unsigned long flags;
958 
959 	spin_lock_irqsave(&iommu->lock, flags);
960 	context = iommu_context_addr(iommu, bus, devfn, 0);
961 	if (context)
962 		ret = context_present(context);
963 	spin_unlock_irqrestore(&iommu->lock, flags);
964 	return ret;
965 }
966 
967 static void free_context_table(struct intel_iommu *iommu)
968 {
969 	int i;
970 	unsigned long flags;
971 	struct context_entry *context;
972 
973 	spin_lock_irqsave(&iommu->lock, flags);
974 	if (!iommu->root_entry) {
975 		goto out;
976 	}
977 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
978 		context = iommu_context_addr(iommu, i, 0, 0);
979 		if (context)
980 			free_pgtable_page(context);
981 
982 		if (!sm_supported(iommu))
983 			continue;
984 
985 		context = iommu_context_addr(iommu, i, 0x80, 0);
986 		if (context)
987 			free_pgtable_page(context);
988 
989 	}
990 	free_pgtable_page(iommu->root_entry);
991 	iommu->root_entry = NULL;
992 out:
993 	spin_unlock_irqrestore(&iommu->lock, flags);
994 }
995 
996 #ifdef CONFIG_DMAR_DEBUG
997 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
998 {
999 	struct device_domain_info *info;
1000 	struct dma_pte *parent, *pte;
1001 	struct dmar_domain *domain;
1002 	int offset, level;
1003 
1004 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
1005 	if (!info || !info->domain) {
1006 		pr_info("device [%02x:%02x.%d] not probed\n",
1007 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1008 		return;
1009 	}
1010 
1011 	domain = info->domain;
1012 	level = agaw_to_level(domain->agaw);
1013 	parent = domain->pgd;
1014 	if (!parent) {
1015 		pr_info("no page table setup\n");
1016 		return;
1017 	}
1018 
1019 	while (1) {
1020 		offset = pfn_level_offset(pfn, level);
1021 		pte = &parent[offset];
1022 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
1023 			pr_info("PTE not present at level %d\n", level);
1024 			break;
1025 		}
1026 
1027 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
1028 
1029 		if (level == 1)
1030 			break;
1031 
1032 		parent = phys_to_virt(dma_pte_addr(pte));
1033 		level--;
1034 	}
1035 }
1036 
1037 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
1038 			  unsigned long long addr, u32 pasid)
1039 {
1040 	struct pasid_dir_entry *dir, *pde;
1041 	struct pasid_entry *entries, *pte;
1042 	struct context_entry *ctx_entry;
1043 	struct root_entry *rt_entry;
1044 	u8 devfn = source_id & 0xff;
1045 	u8 bus = source_id >> 8;
1046 	int i, dir_index, index;
1047 
1048 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
1049 
1050 	/* root entry dump */
1051 	rt_entry = &iommu->root_entry[bus];
1052 	if (!rt_entry) {
1053 		pr_info("root table entry is not present\n");
1054 		return;
1055 	}
1056 
1057 	if (sm_supported(iommu))
1058 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
1059 			rt_entry->hi, rt_entry->lo);
1060 	else
1061 		pr_info("root entry: 0x%016llx", rt_entry->lo);
1062 
1063 	/* context entry dump */
1064 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
1065 	if (!ctx_entry) {
1066 		pr_info("context table entry is not present\n");
1067 		return;
1068 	}
1069 
1070 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
1071 		ctx_entry->hi, ctx_entry->lo);
1072 
1073 	/* legacy mode does not require PASID entries */
1074 	if (!sm_supported(iommu))
1075 		goto pgtable_walk;
1076 
1077 	/* get the pointer to pasid directory entry */
1078 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
1079 	if (!dir) {
1080 		pr_info("pasid directory entry is not present\n");
1081 		return;
1082 	}
1083 	/* For request-without-pasid, get the pasid from context entry */
1084 	if (intel_iommu_sm && pasid == INVALID_IOASID)
1085 		pasid = PASID_RID2PASID;
1086 
1087 	dir_index = pasid >> PASID_PDE_SHIFT;
1088 	pde = &dir[dir_index];
1089 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
1090 
1091 	/* get the pointer to the pasid table entry */
1092 	entries = get_pasid_table_from_pde(pde);
1093 	if (!entries) {
1094 		pr_info("pasid table entry is not present\n");
1095 		return;
1096 	}
1097 	index = pasid & PASID_PTE_MASK;
1098 	pte = &entries[index];
1099 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1100 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1101 
1102 pgtable_walk:
1103 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1104 }
1105 #endif
1106 
1107 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1108 				      unsigned long pfn, int *target_level)
1109 {
1110 	struct dma_pte *parent, *pte;
1111 	int level = agaw_to_level(domain->agaw);
1112 	int offset;
1113 
1114 	BUG_ON(!domain->pgd);
1115 
1116 	if (!domain_pfn_supported(domain, pfn))
1117 		/* Address beyond IOMMU's addressing capabilities. */
1118 		return NULL;
1119 
1120 	parent = domain->pgd;
1121 
1122 	while (1) {
1123 		void *tmp_page;
1124 
1125 		offset = pfn_level_offset(pfn, level);
1126 		pte = &parent[offset];
1127 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1128 			break;
1129 		if (level == *target_level)
1130 			break;
1131 
1132 		if (!dma_pte_present(pte)) {
1133 			uint64_t pteval;
1134 
1135 			tmp_page = alloc_pgtable_page(domain->nid);
1136 
1137 			if (!tmp_page)
1138 				return NULL;
1139 
1140 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1141 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1142 			if (domain_use_first_level(domain)) {
1143 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1144 				if (iommu_is_dma_domain(&domain->domain))
1145 					pteval |= DMA_FL_PTE_ACCESS;
1146 			}
1147 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1148 				/* Someone else set it while we were thinking; use theirs. */
1149 				free_pgtable_page(tmp_page);
1150 			else
1151 				domain_flush_cache(domain, pte, sizeof(*pte));
1152 		}
1153 		if (level == 1)
1154 			break;
1155 
1156 		parent = phys_to_virt(dma_pte_addr(pte));
1157 		level--;
1158 	}
1159 
1160 	if (!*target_level)
1161 		*target_level = level;
1162 
1163 	return pte;
1164 }
1165 
1166 /* return address's pte at specific level */
1167 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1168 					 unsigned long pfn,
1169 					 int level, int *large_page)
1170 {
1171 	struct dma_pte *parent, *pte;
1172 	int total = agaw_to_level(domain->agaw);
1173 	int offset;
1174 
1175 	parent = domain->pgd;
1176 	while (level <= total) {
1177 		offset = pfn_level_offset(pfn, total);
1178 		pte = &parent[offset];
1179 		if (level == total)
1180 			return pte;
1181 
1182 		if (!dma_pte_present(pte)) {
1183 			*large_page = total;
1184 			break;
1185 		}
1186 
1187 		if (dma_pte_superpage(pte)) {
1188 			*large_page = total;
1189 			return pte;
1190 		}
1191 
1192 		parent = phys_to_virt(dma_pte_addr(pte));
1193 		total--;
1194 	}
1195 	return NULL;
1196 }
1197 
1198 /* clear last level pte, a tlb flush should be followed */
1199 static void dma_pte_clear_range(struct dmar_domain *domain,
1200 				unsigned long start_pfn,
1201 				unsigned long last_pfn)
1202 {
1203 	unsigned int large_page;
1204 	struct dma_pte *first_pte, *pte;
1205 
1206 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1207 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1208 	BUG_ON(start_pfn > last_pfn);
1209 
1210 	/* we don't need lock here; nobody else touches the iova range */
1211 	do {
1212 		large_page = 1;
1213 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1214 		if (!pte) {
1215 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1216 			continue;
1217 		}
1218 		do {
1219 			dma_clear_pte(pte);
1220 			start_pfn += lvl_to_nr_pages(large_page);
1221 			pte++;
1222 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1223 
1224 		domain_flush_cache(domain, first_pte,
1225 				   (void *)pte - (void *)first_pte);
1226 
1227 	} while (start_pfn && start_pfn <= last_pfn);
1228 }
1229 
1230 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1231 			       int retain_level, struct dma_pte *pte,
1232 			       unsigned long pfn, unsigned long start_pfn,
1233 			       unsigned long last_pfn)
1234 {
1235 	pfn = max(start_pfn, pfn);
1236 	pte = &pte[pfn_level_offset(pfn, level)];
1237 
1238 	do {
1239 		unsigned long level_pfn;
1240 		struct dma_pte *level_pte;
1241 
1242 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1243 			goto next;
1244 
1245 		level_pfn = pfn & level_mask(level);
1246 		level_pte = phys_to_virt(dma_pte_addr(pte));
1247 
1248 		if (level > 2) {
1249 			dma_pte_free_level(domain, level - 1, retain_level,
1250 					   level_pte, level_pfn, start_pfn,
1251 					   last_pfn);
1252 		}
1253 
1254 		/*
1255 		 * Free the page table if we're below the level we want to
1256 		 * retain and the range covers the entire table.
1257 		 */
1258 		if (level < retain_level && !(start_pfn > level_pfn ||
1259 		      last_pfn < level_pfn + level_size(level) - 1)) {
1260 			dma_clear_pte(pte);
1261 			domain_flush_cache(domain, pte, sizeof(*pte));
1262 			free_pgtable_page(level_pte);
1263 		}
1264 next:
1265 		pfn += level_size(level);
1266 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1267 }
1268 
1269 /*
1270  * clear last level (leaf) ptes and free page table pages below the
1271  * level we wish to keep intact.
1272  */
1273 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1274 				   unsigned long start_pfn,
1275 				   unsigned long last_pfn,
1276 				   int retain_level)
1277 {
1278 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1279 
1280 	/* We don't need lock here; nobody else touches the iova range */
1281 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1282 			   domain->pgd, 0, start_pfn, last_pfn);
1283 
1284 	/* free pgd */
1285 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1286 		free_pgtable_page(domain->pgd);
1287 		domain->pgd = NULL;
1288 	}
1289 }
1290 
1291 /* When a page at a given level is being unlinked from its parent, we don't
1292    need to *modify* it at all. All we need to do is make a list of all the
1293    pages which can be freed just as soon as we've flushed the IOTLB and we
1294    know the hardware page-walk will no longer touch them.
1295    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1296    be freed. */
1297 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1298 				    int level, struct dma_pte *pte,
1299 				    struct list_head *freelist)
1300 {
1301 	struct page *pg;
1302 
1303 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1304 	list_add_tail(&pg->lru, freelist);
1305 
1306 	if (level == 1)
1307 		return;
1308 
1309 	pte = page_address(pg);
1310 	do {
1311 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1312 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1313 		pte++;
1314 	} while (!first_pte_in_page(pte));
1315 }
1316 
1317 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1318 				struct dma_pte *pte, unsigned long pfn,
1319 				unsigned long start_pfn, unsigned long last_pfn,
1320 				struct list_head *freelist)
1321 {
1322 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1323 
1324 	pfn = max(start_pfn, pfn);
1325 	pte = &pte[pfn_level_offset(pfn, level)];
1326 
1327 	do {
1328 		unsigned long level_pfn = pfn & level_mask(level);
1329 
1330 		if (!dma_pte_present(pte))
1331 			goto next;
1332 
1333 		/* If range covers entire pagetable, free it */
1334 		if (start_pfn <= level_pfn &&
1335 		    last_pfn >= level_pfn + level_size(level) - 1) {
1336 			/* These suborbinate page tables are going away entirely. Don't
1337 			   bother to clear them; we're just going to *free* them. */
1338 			if (level > 1 && !dma_pte_superpage(pte))
1339 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1340 
1341 			dma_clear_pte(pte);
1342 			if (!first_pte)
1343 				first_pte = pte;
1344 			last_pte = pte;
1345 		} else if (level > 1) {
1346 			/* Recurse down into a level that isn't *entirely* obsolete */
1347 			dma_pte_clear_level(domain, level - 1,
1348 					    phys_to_virt(dma_pte_addr(pte)),
1349 					    level_pfn, start_pfn, last_pfn,
1350 					    freelist);
1351 		}
1352 next:
1353 		pfn = level_pfn + level_size(level);
1354 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1355 
1356 	if (first_pte)
1357 		domain_flush_cache(domain, first_pte,
1358 				   (void *)++last_pte - (void *)first_pte);
1359 }
1360 
1361 /* We can't just free the pages because the IOMMU may still be walking
1362    the page tables, and may have cached the intermediate levels. The
1363    pages can only be freed after the IOTLB flush has been done. */
1364 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1365 			 unsigned long last_pfn, struct list_head *freelist)
1366 {
1367 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1368 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1369 	BUG_ON(start_pfn > last_pfn);
1370 
1371 	/* we don't need lock here; nobody else touches the iova range */
1372 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1373 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1374 
1375 	/* free pgd */
1376 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1377 		struct page *pgd_page = virt_to_page(domain->pgd);
1378 		list_add_tail(&pgd_page->lru, freelist);
1379 		domain->pgd = NULL;
1380 	}
1381 }
1382 
1383 /* iommu handling */
1384 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1385 {
1386 	struct root_entry *root;
1387 	unsigned long flags;
1388 
1389 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1390 	if (!root) {
1391 		pr_err("Allocating root entry for %s failed\n",
1392 			iommu->name);
1393 		return -ENOMEM;
1394 	}
1395 
1396 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1397 
1398 	spin_lock_irqsave(&iommu->lock, flags);
1399 	iommu->root_entry = root;
1400 	spin_unlock_irqrestore(&iommu->lock, flags);
1401 
1402 	return 0;
1403 }
1404 
1405 static void iommu_set_root_entry(struct intel_iommu *iommu)
1406 {
1407 	u64 addr;
1408 	u32 sts;
1409 	unsigned long flag;
1410 
1411 	addr = virt_to_phys(iommu->root_entry);
1412 	if (sm_supported(iommu))
1413 		addr |= DMA_RTADDR_SMT;
1414 
1415 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1416 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1417 
1418 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1419 
1420 	/* Make sure hardware complete it */
1421 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1422 		      readl, (sts & DMA_GSTS_RTPS), sts);
1423 
1424 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1425 
1426 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1427 	if (sm_supported(iommu))
1428 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1429 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1430 }
1431 
1432 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1433 {
1434 	u32 val;
1435 	unsigned long flag;
1436 
1437 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1438 		return;
1439 
1440 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1441 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1442 
1443 	/* Make sure hardware complete it */
1444 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1445 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1446 
1447 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1448 }
1449 
1450 /* return value determine if we need a write buffer flush */
1451 static void __iommu_flush_context(struct intel_iommu *iommu,
1452 				  u16 did, u16 source_id, u8 function_mask,
1453 				  u64 type)
1454 {
1455 	u64 val = 0;
1456 	unsigned long flag;
1457 
1458 	switch (type) {
1459 	case DMA_CCMD_GLOBAL_INVL:
1460 		val = DMA_CCMD_GLOBAL_INVL;
1461 		break;
1462 	case DMA_CCMD_DOMAIN_INVL:
1463 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1464 		break;
1465 	case DMA_CCMD_DEVICE_INVL:
1466 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1467 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1468 		break;
1469 	default:
1470 		BUG();
1471 	}
1472 	val |= DMA_CCMD_ICC;
1473 
1474 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1475 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1476 
1477 	/* Make sure hardware complete it */
1478 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1479 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1480 
1481 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1482 }
1483 
1484 /* return value determine if we need a write buffer flush */
1485 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1486 				u64 addr, unsigned int size_order, u64 type)
1487 {
1488 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1489 	u64 val = 0, val_iva = 0;
1490 	unsigned long flag;
1491 
1492 	switch (type) {
1493 	case DMA_TLB_GLOBAL_FLUSH:
1494 		/* global flush doesn't need set IVA_REG */
1495 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1496 		break;
1497 	case DMA_TLB_DSI_FLUSH:
1498 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1499 		break;
1500 	case DMA_TLB_PSI_FLUSH:
1501 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1502 		/* IH bit is passed in as part of address */
1503 		val_iva = size_order | addr;
1504 		break;
1505 	default:
1506 		BUG();
1507 	}
1508 	/* Note: set drain read/write */
1509 #if 0
1510 	/*
1511 	 * This is probably to be super secure.. Looks like we can
1512 	 * ignore it without any impact.
1513 	 */
1514 	if (cap_read_drain(iommu->cap))
1515 		val |= DMA_TLB_READ_DRAIN;
1516 #endif
1517 	if (cap_write_drain(iommu->cap))
1518 		val |= DMA_TLB_WRITE_DRAIN;
1519 
1520 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1521 	/* Note: Only uses first TLB reg currently */
1522 	if (val_iva)
1523 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1524 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1525 
1526 	/* Make sure hardware complete it */
1527 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1528 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1529 
1530 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1531 
1532 	/* check IOTLB invalidation granularity */
1533 	if (DMA_TLB_IAIG(val) == 0)
1534 		pr_err("Flush IOTLB failed\n");
1535 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1536 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1537 			(unsigned long long)DMA_TLB_IIRG(type),
1538 			(unsigned long long)DMA_TLB_IAIG(val));
1539 }
1540 
1541 static struct device_domain_info *
1542 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1543 			 u8 bus, u8 devfn)
1544 {
1545 	struct device_domain_info *info;
1546 
1547 	assert_spin_locked(&device_domain_lock);
1548 
1549 	if (!iommu->qi)
1550 		return NULL;
1551 
1552 	list_for_each_entry(info, &domain->devices, link)
1553 		if (info->iommu == iommu && info->bus == bus &&
1554 		    info->devfn == devfn) {
1555 			if (info->ats_supported && info->dev)
1556 				return info;
1557 			break;
1558 		}
1559 
1560 	return NULL;
1561 }
1562 
1563 static void domain_update_iotlb(struct dmar_domain *domain)
1564 {
1565 	struct device_domain_info *info;
1566 	bool has_iotlb_device = false;
1567 
1568 	assert_spin_locked(&device_domain_lock);
1569 
1570 	list_for_each_entry(info, &domain->devices, link)
1571 		if (info->ats_enabled) {
1572 			has_iotlb_device = true;
1573 			break;
1574 		}
1575 
1576 	if (!has_iotlb_device) {
1577 		struct subdev_domain_info *sinfo;
1578 
1579 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1580 			info = get_domain_info(sinfo->pdev);
1581 			if (info && info->ats_enabled) {
1582 				has_iotlb_device = true;
1583 				break;
1584 			}
1585 		}
1586 	}
1587 
1588 	domain->has_iotlb_device = has_iotlb_device;
1589 }
1590 
1591 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1592 {
1593 	struct pci_dev *pdev;
1594 
1595 	assert_spin_locked(&device_domain_lock);
1596 
1597 	if (!info || !dev_is_pci(info->dev))
1598 		return;
1599 
1600 	pdev = to_pci_dev(info->dev);
1601 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1602 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1603 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1604 	 * reserved, which should be set to 0.
1605 	 */
1606 	if (!ecap_dit(info->iommu->ecap))
1607 		info->pfsid = 0;
1608 	else {
1609 		struct pci_dev *pf_pdev;
1610 
1611 		/* pdev will be returned if device is not a vf */
1612 		pf_pdev = pci_physfn(pdev);
1613 		info->pfsid = pci_dev_id(pf_pdev);
1614 	}
1615 
1616 #ifdef CONFIG_INTEL_IOMMU_SVM
1617 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1618 	   the device if you enable PASID support after ATS support is
1619 	   undefined. So always enable PASID support on devices which
1620 	   have it, even if we can't yet know if we're ever going to
1621 	   use it. */
1622 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1623 		info->pasid_enabled = 1;
1624 
1625 	if (info->pri_supported &&
1626 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1627 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1628 		info->pri_enabled = 1;
1629 #endif
1630 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1631 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1632 		info->ats_enabled = 1;
1633 		domain_update_iotlb(info->domain);
1634 		info->ats_qdep = pci_ats_queue_depth(pdev);
1635 	}
1636 }
1637 
1638 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1639 {
1640 	struct pci_dev *pdev;
1641 
1642 	assert_spin_locked(&device_domain_lock);
1643 
1644 	if (!dev_is_pci(info->dev))
1645 		return;
1646 
1647 	pdev = to_pci_dev(info->dev);
1648 
1649 	if (info->ats_enabled) {
1650 		pci_disable_ats(pdev);
1651 		info->ats_enabled = 0;
1652 		domain_update_iotlb(info->domain);
1653 	}
1654 #ifdef CONFIG_INTEL_IOMMU_SVM
1655 	if (info->pri_enabled) {
1656 		pci_disable_pri(pdev);
1657 		info->pri_enabled = 0;
1658 	}
1659 	if (info->pasid_enabled) {
1660 		pci_disable_pasid(pdev);
1661 		info->pasid_enabled = 0;
1662 	}
1663 #endif
1664 }
1665 
1666 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1667 				    u64 addr, unsigned int mask)
1668 {
1669 	u16 sid, qdep;
1670 
1671 	if (!info || !info->ats_enabled)
1672 		return;
1673 
1674 	sid = info->bus << 8 | info->devfn;
1675 	qdep = info->ats_qdep;
1676 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1677 			   qdep, addr, mask);
1678 }
1679 
1680 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1681 				  u64 addr, unsigned mask)
1682 {
1683 	unsigned long flags;
1684 	struct device_domain_info *info;
1685 	struct subdev_domain_info *sinfo;
1686 
1687 	if (!domain->has_iotlb_device)
1688 		return;
1689 
1690 	spin_lock_irqsave(&device_domain_lock, flags);
1691 	list_for_each_entry(info, &domain->devices, link)
1692 		__iommu_flush_dev_iotlb(info, addr, mask);
1693 
1694 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1695 		info = get_domain_info(sinfo->pdev);
1696 		__iommu_flush_dev_iotlb(info, addr, mask);
1697 	}
1698 	spin_unlock_irqrestore(&device_domain_lock, flags);
1699 }
1700 
1701 static void domain_flush_piotlb(struct intel_iommu *iommu,
1702 				struct dmar_domain *domain,
1703 				u64 addr, unsigned long npages, bool ih)
1704 {
1705 	u16 did = domain->iommu_did[iommu->seq_id];
1706 
1707 	if (domain->default_pasid)
1708 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1709 				addr, npages, ih);
1710 
1711 	if (!list_empty(&domain->devices))
1712 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1713 }
1714 
1715 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1716 				  struct dmar_domain *domain,
1717 				  unsigned long pfn, unsigned int pages,
1718 				  int ih, int map)
1719 {
1720 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1721 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1722 	u16 did = domain->iommu_did[iommu->seq_id];
1723 
1724 	BUG_ON(pages == 0);
1725 
1726 	if (ih)
1727 		ih = 1 << 6;
1728 
1729 	if (domain_use_first_level(domain)) {
1730 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1731 	} else {
1732 		/*
1733 		 * Fallback to domain selective flush if no PSI support or
1734 		 * the size is too big. PSI requires page size to be 2 ^ x,
1735 		 * and the base address is naturally aligned to the size.
1736 		 */
1737 		if (!cap_pgsel_inv(iommu->cap) ||
1738 		    mask > cap_max_amask_val(iommu->cap))
1739 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1740 							DMA_TLB_DSI_FLUSH);
1741 		else
1742 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1743 							DMA_TLB_PSI_FLUSH);
1744 	}
1745 
1746 	/*
1747 	 * In caching mode, changes of pages from non-present to present require
1748 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1749 	 */
1750 	if (!cap_caching_mode(iommu->cap) || !map)
1751 		iommu_flush_dev_iotlb(domain, addr, mask);
1752 }
1753 
1754 /* Notification for newly created mappings */
1755 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1756 					struct dmar_domain *domain,
1757 					unsigned long pfn, unsigned int pages)
1758 {
1759 	/*
1760 	 * It's a non-present to present mapping. Only flush if caching mode
1761 	 * and second level.
1762 	 */
1763 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1764 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1765 	else
1766 		iommu_flush_write_buffer(iommu);
1767 }
1768 
1769 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1770 {
1771 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1772 	int idx;
1773 
1774 	for_each_domain_iommu(idx, dmar_domain) {
1775 		struct intel_iommu *iommu = g_iommus[idx];
1776 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1777 
1778 		if (domain_use_first_level(dmar_domain))
1779 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1780 		else
1781 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1782 						 DMA_TLB_DSI_FLUSH);
1783 
1784 		if (!cap_caching_mode(iommu->cap))
1785 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1786 					      0, MAX_AGAW_PFN_WIDTH);
1787 	}
1788 }
1789 
1790 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1791 {
1792 	u32 pmen;
1793 	unsigned long flags;
1794 
1795 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1796 		return;
1797 
1798 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1799 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1800 	pmen &= ~DMA_PMEN_EPM;
1801 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1802 
1803 	/* wait for the protected region status bit to clear */
1804 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1805 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1806 
1807 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1808 }
1809 
1810 static void iommu_enable_translation(struct intel_iommu *iommu)
1811 {
1812 	u32 sts;
1813 	unsigned long flags;
1814 
1815 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1816 	iommu->gcmd |= DMA_GCMD_TE;
1817 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1818 
1819 	/* Make sure hardware complete it */
1820 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1821 		      readl, (sts & DMA_GSTS_TES), sts);
1822 
1823 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1824 }
1825 
1826 static void iommu_disable_translation(struct intel_iommu *iommu)
1827 {
1828 	u32 sts;
1829 	unsigned long flag;
1830 
1831 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1832 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1833 		return;
1834 
1835 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1836 	iommu->gcmd &= ~DMA_GCMD_TE;
1837 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1838 
1839 	/* Make sure hardware complete it */
1840 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1841 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1842 
1843 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1844 }
1845 
1846 static int iommu_init_domains(struct intel_iommu *iommu)
1847 {
1848 	u32 ndomains;
1849 	size_t size;
1850 
1851 	ndomains = cap_ndoms(iommu->cap);
1852 	pr_debug("%s: Number of Domains supported <%d>\n",
1853 		 iommu->name, ndomains);
1854 
1855 	spin_lock_init(&iommu->lock);
1856 
1857 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1858 	if (!iommu->domain_ids)
1859 		return -ENOMEM;
1860 
1861 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1862 	iommu->domains = kzalloc(size, GFP_KERNEL);
1863 
1864 	if (iommu->domains) {
1865 		size = 256 * sizeof(struct dmar_domain *);
1866 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1867 	}
1868 
1869 	if (!iommu->domains || !iommu->domains[0]) {
1870 		pr_err("%s: Allocating domain array failed\n",
1871 		       iommu->name);
1872 		bitmap_free(iommu->domain_ids);
1873 		kfree(iommu->domains);
1874 		iommu->domain_ids = NULL;
1875 		iommu->domains    = NULL;
1876 		return -ENOMEM;
1877 	}
1878 
1879 	/*
1880 	 * If Caching mode is set, then invalid translations are tagged
1881 	 * with domain-id 0, hence we need to pre-allocate it. We also
1882 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1883 	 * make sure it is not used for a real domain.
1884 	 */
1885 	set_bit(0, iommu->domain_ids);
1886 
1887 	/*
1888 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1889 	 * entry for first-level or pass-through translation modes should
1890 	 * be programmed with a domain id different from those used for
1891 	 * second-level or nested translation. We reserve a domain id for
1892 	 * this purpose.
1893 	 */
1894 	if (sm_supported(iommu))
1895 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1896 
1897 	return 0;
1898 }
1899 
1900 static void disable_dmar_iommu(struct intel_iommu *iommu)
1901 {
1902 	struct device_domain_info *info, *tmp;
1903 	unsigned long flags;
1904 
1905 	if (!iommu->domains || !iommu->domain_ids)
1906 		return;
1907 
1908 	spin_lock_irqsave(&device_domain_lock, flags);
1909 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1910 		if (info->iommu != iommu)
1911 			continue;
1912 
1913 		if (!info->dev || !info->domain)
1914 			continue;
1915 
1916 		__dmar_remove_one_dev_info(info);
1917 	}
1918 	spin_unlock_irqrestore(&device_domain_lock, flags);
1919 
1920 	if (iommu->gcmd & DMA_GCMD_TE)
1921 		iommu_disable_translation(iommu);
1922 }
1923 
1924 static void free_dmar_iommu(struct intel_iommu *iommu)
1925 {
1926 	if ((iommu->domains) && (iommu->domain_ids)) {
1927 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1928 		int i;
1929 
1930 		for (i = 0; i < elems; i++)
1931 			kfree(iommu->domains[i]);
1932 		kfree(iommu->domains);
1933 		bitmap_free(iommu->domain_ids);
1934 		iommu->domains = NULL;
1935 		iommu->domain_ids = NULL;
1936 	}
1937 
1938 	g_iommus[iommu->seq_id] = NULL;
1939 
1940 	/* free context mapping */
1941 	free_context_table(iommu);
1942 
1943 #ifdef CONFIG_INTEL_IOMMU_SVM
1944 	if (pasid_supported(iommu)) {
1945 		if (ecap_prs(iommu->ecap))
1946 			intel_svm_finish_prq(iommu);
1947 	}
1948 	if (vccap_pasid(iommu->vccap))
1949 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1950 
1951 #endif
1952 }
1953 
1954 /*
1955  * Check and return whether first level is used by default for
1956  * DMA translation.
1957  */
1958 static bool first_level_by_default(unsigned int type)
1959 {
1960 	/* Only SL is available in legacy mode */
1961 	if (!scalable_mode_support())
1962 		return false;
1963 
1964 	/* Only level (either FL or SL) is available, just use it */
1965 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1966 		return intel_cap_flts_sanity();
1967 
1968 	/* Both levels are available, decide it based on domain type */
1969 	return type != IOMMU_DOMAIN_UNMANAGED;
1970 }
1971 
1972 static struct dmar_domain *alloc_domain(unsigned int type)
1973 {
1974 	struct dmar_domain *domain;
1975 
1976 	domain = alloc_domain_mem();
1977 	if (!domain)
1978 		return NULL;
1979 
1980 	memset(domain, 0, sizeof(*domain));
1981 	domain->nid = NUMA_NO_NODE;
1982 	if (first_level_by_default(type))
1983 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1984 	domain->has_iotlb_device = false;
1985 	INIT_LIST_HEAD(&domain->devices);
1986 	INIT_LIST_HEAD(&domain->subdevices);
1987 
1988 	return domain;
1989 }
1990 
1991 /* Must be called with iommu->lock */
1992 static int domain_attach_iommu(struct dmar_domain *domain,
1993 			       struct intel_iommu *iommu)
1994 {
1995 	unsigned long ndomains;
1996 	int num;
1997 
1998 	assert_spin_locked(&device_domain_lock);
1999 	assert_spin_locked(&iommu->lock);
2000 
2001 	domain->iommu_refcnt[iommu->seq_id] += 1;
2002 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
2003 		ndomains = cap_ndoms(iommu->cap);
2004 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
2005 
2006 		if (num >= ndomains) {
2007 			pr_err("%s: No free domain ids\n", iommu->name);
2008 			domain->iommu_refcnt[iommu->seq_id] -= 1;
2009 			return -ENOSPC;
2010 		}
2011 
2012 		set_bit(num, iommu->domain_ids);
2013 		set_iommu_domain(iommu, num, domain);
2014 
2015 		domain->iommu_did[iommu->seq_id] = num;
2016 		domain->nid			 = iommu->node;
2017 
2018 		domain_update_iommu_cap(domain);
2019 	}
2020 
2021 	return 0;
2022 }
2023 
2024 static void domain_detach_iommu(struct dmar_domain *domain,
2025 				struct intel_iommu *iommu)
2026 {
2027 	int num;
2028 
2029 	assert_spin_locked(&device_domain_lock);
2030 	assert_spin_locked(&iommu->lock);
2031 
2032 	domain->iommu_refcnt[iommu->seq_id] -= 1;
2033 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2034 		num = domain->iommu_did[iommu->seq_id];
2035 		clear_bit(num, iommu->domain_ids);
2036 		set_iommu_domain(iommu, num, NULL);
2037 
2038 		domain_update_iommu_cap(domain);
2039 		domain->iommu_did[iommu->seq_id] = 0;
2040 	}
2041 }
2042 
2043 static inline int guestwidth_to_adjustwidth(int gaw)
2044 {
2045 	int agaw;
2046 	int r = (gaw - 12) % 9;
2047 
2048 	if (r == 0)
2049 		agaw = gaw;
2050 	else
2051 		agaw = gaw + 9 - r;
2052 	if (agaw > 64)
2053 		agaw = 64;
2054 	return agaw;
2055 }
2056 
2057 static void domain_exit(struct dmar_domain *domain)
2058 {
2059 
2060 	/* Remove associated devices and clear attached or cached domains */
2061 	domain_remove_dev_info(domain);
2062 
2063 	if (domain->pgd) {
2064 		LIST_HEAD(freelist);
2065 
2066 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
2067 		put_pages_list(&freelist);
2068 	}
2069 
2070 	free_domain_mem(domain);
2071 }
2072 
2073 /*
2074  * Get the PASID directory size for scalable mode context entry.
2075  * Value of X in the PDTS field of a scalable mode context entry
2076  * indicates PASID directory with 2^(X + 7) entries.
2077  */
2078 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2079 {
2080 	unsigned long pds, max_pde;
2081 
2082 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2083 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
2084 	if (pds < 7)
2085 		return 0;
2086 
2087 	return pds - 7;
2088 }
2089 
2090 /*
2091  * Set the RID_PASID field of a scalable mode context entry. The
2092  * IOMMU hardware will use the PASID value set in this field for
2093  * DMA translations of DMA requests without PASID.
2094  */
2095 static inline void
2096 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2097 {
2098 	context->hi |= pasid & ((1 << 20) - 1);
2099 }
2100 
2101 /*
2102  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2103  * entry.
2104  */
2105 static inline void context_set_sm_dte(struct context_entry *context)
2106 {
2107 	context->lo |= (1 << 2);
2108 }
2109 
2110 /*
2111  * Set the PRE(Page Request Enable) field of a scalable mode context
2112  * entry.
2113  */
2114 static inline void context_set_sm_pre(struct context_entry *context)
2115 {
2116 	context->lo |= (1 << 4);
2117 }
2118 
2119 /* Convert value to context PASID directory size field coding. */
2120 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2121 
2122 static int domain_context_mapping_one(struct dmar_domain *domain,
2123 				      struct intel_iommu *iommu,
2124 				      struct pasid_table *table,
2125 				      u8 bus, u8 devfn)
2126 {
2127 	u16 did = domain->iommu_did[iommu->seq_id];
2128 	int translation = CONTEXT_TT_MULTI_LEVEL;
2129 	struct device_domain_info *info = NULL;
2130 	struct context_entry *context;
2131 	unsigned long flags;
2132 	int ret;
2133 
2134 	WARN_ON(did == 0);
2135 
2136 	if (hw_pass_through && domain_type_is_si(domain))
2137 		translation = CONTEXT_TT_PASS_THROUGH;
2138 
2139 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2140 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2141 
2142 	BUG_ON(!domain->pgd);
2143 
2144 	spin_lock_irqsave(&device_domain_lock, flags);
2145 	spin_lock(&iommu->lock);
2146 
2147 	ret = -ENOMEM;
2148 	context = iommu_context_addr(iommu, bus, devfn, 1);
2149 	if (!context)
2150 		goto out_unlock;
2151 
2152 	ret = 0;
2153 	if (context_present(context))
2154 		goto out_unlock;
2155 
2156 	/*
2157 	 * For kdump cases, old valid entries may be cached due to the
2158 	 * in-flight DMA and copied pgtable, but there is no unmapping
2159 	 * behaviour for them, thus we need an explicit cache flush for
2160 	 * the newly-mapped device. For kdump, at this point, the device
2161 	 * is supposed to finish reset at its driver probe stage, so no
2162 	 * in-flight DMA will exist, and we don't need to worry anymore
2163 	 * hereafter.
2164 	 */
2165 	if (context_copied(context)) {
2166 		u16 did_old = context_domain_id(context);
2167 
2168 		if (did_old < cap_ndoms(iommu->cap)) {
2169 			iommu->flush.flush_context(iommu, did_old,
2170 						   (((u16)bus) << 8) | devfn,
2171 						   DMA_CCMD_MASK_NOBIT,
2172 						   DMA_CCMD_DEVICE_INVL);
2173 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2174 						 DMA_TLB_DSI_FLUSH);
2175 		}
2176 	}
2177 
2178 	context_clear_entry(context);
2179 
2180 	if (sm_supported(iommu)) {
2181 		unsigned long pds;
2182 
2183 		WARN_ON(!table);
2184 
2185 		/* Setup the PASID DIR pointer: */
2186 		pds = context_get_sm_pds(table);
2187 		context->lo = (u64)virt_to_phys(table->table) |
2188 				context_pdts(pds);
2189 
2190 		/* Setup the RID_PASID field: */
2191 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2192 
2193 		/*
2194 		 * Setup the Device-TLB enable bit and Page request
2195 		 * Enable bit:
2196 		 */
2197 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2198 		if (info && info->ats_supported)
2199 			context_set_sm_dte(context);
2200 		if (info && info->pri_supported)
2201 			context_set_sm_pre(context);
2202 	} else {
2203 		struct dma_pte *pgd = domain->pgd;
2204 		int agaw;
2205 
2206 		context_set_domain_id(context, did);
2207 
2208 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2209 			/*
2210 			 * Skip top levels of page tables for iommu which has
2211 			 * less agaw than default. Unnecessary for PT mode.
2212 			 */
2213 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2214 				ret = -ENOMEM;
2215 				pgd = phys_to_virt(dma_pte_addr(pgd));
2216 				if (!dma_pte_present(pgd))
2217 					goto out_unlock;
2218 			}
2219 
2220 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2221 			if (info && info->ats_supported)
2222 				translation = CONTEXT_TT_DEV_IOTLB;
2223 			else
2224 				translation = CONTEXT_TT_MULTI_LEVEL;
2225 
2226 			context_set_address_root(context, virt_to_phys(pgd));
2227 			context_set_address_width(context, agaw);
2228 		} else {
2229 			/*
2230 			 * In pass through mode, AW must be programmed to
2231 			 * indicate the largest AGAW value supported by
2232 			 * hardware. And ASR is ignored by hardware.
2233 			 */
2234 			context_set_address_width(context, iommu->msagaw);
2235 		}
2236 
2237 		context_set_translation_type(context, translation);
2238 	}
2239 
2240 	context_set_fault_enable(context);
2241 	context_set_present(context);
2242 	if (!ecap_coherent(iommu->ecap))
2243 		clflush_cache_range(context, sizeof(*context));
2244 
2245 	/*
2246 	 * It's a non-present to present mapping. If hardware doesn't cache
2247 	 * non-present entry we only need to flush the write-buffer. If the
2248 	 * _does_ cache non-present entries, then it does so in the special
2249 	 * domain #0, which we have to flush:
2250 	 */
2251 	if (cap_caching_mode(iommu->cap)) {
2252 		iommu->flush.flush_context(iommu, 0,
2253 					   (((u16)bus) << 8) | devfn,
2254 					   DMA_CCMD_MASK_NOBIT,
2255 					   DMA_CCMD_DEVICE_INVL);
2256 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2257 	} else {
2258 		iommu_flush_write_buffer(iommu);
2259 	}
2260 	iommu_enable_dev_iotlb(info);
2261 
2262 	ret = 0;
2263 
2264 out_unlock:
2265 	spin_unlock(&iommu->lock);
2266 	spin_unlock_irqrestore(&device_domain_lock, flags);
2267 
2268 	return ret;
2269 }
2270 
2271 struct domain_context_mapping_data {
2272 	struct dmar_domain *domain;
2273 	struct intel_iommu *iommu;
2274 	struct pasid_table *table;
2275 };
2276 
2277 static int domain_context_mapping_cb(struct pci_dev *pdev,
2278 				     u16 alias, void *opaque)
2279 {
2280 	struct domain_context_mapping_data *data = opaque;
2281 
2282 	return domain_context_mapping_one(data->domain, data->iommu,
2283 					  data->table, PCI_BUS_NUM(alias),
2284 					  alias & 0xff);
2285 }
2286 
2287 static int
2288 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2289 {
2290 	struct domain_context_mapping_data data;
2291 	struct pasid_table *table;
2292 	struct intel_iommu *iommu;
2293 	u8 bus, devfn;
2294 
2295 	iommu = device_to_iommu(dev, &bus, &devfn);
2296 	if (!iommu)
2297 		return -ENODEV;
2298 
2299 	table = intel_pasid_get_table(dev);
2300 
2301 	if (!dev_is_pci(dev))
2302 		return domain_context_mapping_one(domain, iommu, table,
2303 						  bus, devfn);
2304 
2305 	data.domain = domain;
2306 	data.iommu = iommu;
2307 	data.table = table;
2308 
2309 	return pci_for_each_dma_alias(to_pci_dev(dev),
2310 				      &domain_context_mapping_cb, &data);
2311 }
2312 
2313 static int domain_context_mapped_cb(struct pci_dev *pdev,
2314 				    u16 alias, void *opaque)
2315 {
2316 	struct intel_iommu *iommu = opaque;
2317 
2318 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2319 }
2320 
2321 static int domain_context_mapped(struct device *dev)
2322 {
2323 	struct intel_iommu *iommu;
2324 	u8 bus, devfn;
2325 
2326 	iommu = device_to_iommu(dev, &bus, &devfn);
2327 	if (!iommu)
2328 		return -ENODEV;
2329 
2330 	if (!dev_is_pci(dev))
2331 		return device_context_mapped(iommu, bus, devfn);
2332 
2333 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2334 				       domain_context_mapped_cb, iommu);
2335 }
2336 
2337 /* Returns a number of VTD pages, but aligned to MM page size */
2338 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2339 					    size_t size)
2340 {
2341 	host_addr &= ~PAGE_MASK;
2342 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2343 }
2344 
2345 /* Return largest possible superpage level for a given mapping */
2346 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2347 					  unsigned long iov_pfn,
2348 					  unsigned long phy_pfn,
2349 					  unsigned long pages)
2350 {
2351 	int support, level = 1;
2352 	unsigned long pfnmerge;
2353 
2354 	support = domain->iommu_superpage;
2355 
2356 	/* To use a large page, the virtual *and* physical addresses
2357 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2358 	   of them will mean we have to use smaller pages. So just
2359 	   merge them and check both at once. */
2360 	pfnmerge = iov_pfn | phy_pfn;
2361 
2362 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2363 		pages >>= VTD_STRIDE_SHIFT;
2364 		if (!pages)
2365 			break;
2366 		pfnmerge >>= VTD_STRIDE_SHIFT;
2367 		level++;
2368 		support--;
2369 	}
2370 	return level;
2371 }
2372 
2373 /*
2374  * Ensure that old small page tables are removed to make room for superpage(s).
2375  * We're going to add new large pages, so make sure we don't remove their parent
2376  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2377  */
2378 static void switch_to_super_page(struct dmar_domain *domain,
2379 				 unsigned long start_pfn,
2380 				 unsigned long end_pfn, int level)
2381 {
2382 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2383 	struct dma_pte *pte = NULL;
2384 	int i;
2385 
2386 	while (start_pfn <= end_pfn) {
2387 		if (!pte)
2388 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2389 
2390 		if (dma_pte_present(pte)) {
2391 			dma_pte_free_pagetable(domain, start_pfn,
2392 					       start_pfn + lvl_pages - 1,
2393 					       level + 1);
2394 
2395 			for_each_domain_iommu(i, domain)
2396 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2397 						      start_pfn, lvl_pages,
2398 						      0, 0);
2399 		}
2400 
2401 		pte++;
2402 		start_pfn += lvl_pages;
2403 		if (first_pte_in_page(pte))
2404 			pte = NULL;
2405 	}
2406 }
2407 
2408 static int
2409 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2410 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2411 {
2412 	struct dma_pte *first_pte = NULL, *pte = NULL;
2413 	unsigned int largepage_lvl = 0;
2414 	unsigned long lvl_pages = 0;
2415 	phys_addr_t pteval;
2416 	u64 attr;
2417 
2418 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2419 
2420 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2421 		return -EINVAL;
2422 
2423 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2424 	attr |= DMA_FL_PTE_PRESENT;
2425 	if (domain_use_first_level(domain)) {
2426 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2427 		if (prot & DMA_PTE_WRITE)
2428 			attr |= DMA_FL_PTE_DIRTY;
2429 	}
2430 
2431 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2432 
2433 	while (nr_pages > 0) {
2434 		uint64_t tmp;
2435 
2436 		if (!pte) {
2437 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2438 					phys_pfn, nr_pages);
2439 
2440 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2441 			if (!pte)
2442 				return -ENOMEM;
2443 			first_pte = pte;
2444 
2445 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2446 
2447 			/* It is large page*/
2448 			if (largepage_lvl > 1) {
2449 				unsigned long end_pfn;
2450 				unsigned long pages_to_remove;
2451 
2452 				pteval |= DMA_PTE_LARGE_PAGE;
2453 				pages_to_remove = min_t(unsigned long, nr_pages,
2454 							nr_pte_to_next_page(pte) * lvl_pages);
2455 				end_pfn = iov_pfn + pages_to_remove - 1;
2456 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2457 			} else {
2458 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2459 			}
2460 
2461 		}
2462 		/* We don't need lock here, nobody else
2463 		 * touches the iova range
2464 		 */
2465 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2466 		if (tmp) {
2467 			static int dumps = 5;
2468 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2469 				iov_pfn, tmp, (unsigned long long)pteval);
2470 			if (dumps) {
2471 				dumps--;
2472 				debug_dma_dump_mappings(NULL);
2473 			}
2474 			WARN_ON(1);
2475 		}
2476 
2477 		nr_pages -= lvl_pages;
2478 		iov_pfn += lvl_pages;
2479 		phys_pfn += lvl_pages;
2480 		pteval += lvl_pages * VTD_PAGE_SIZE;
2481 
2482 		/* If the next PTE would be the first in a new page, then we
2483 		 * need to flush the cache on the entries we've just written.
2484 		 * And then we'll need to recalculate 'pte', so clear it and
2485 		 * let it get set again in the if (!pte) block above.
2486 		 *
2487 		 * If we're done (!nr_pages) we need to flush the cache too.
2488 		 *
2489 		 * Also if we've been setting superpages, we may need to
2490 		 * recalculate 'pte' and switch back to smaller pages for the
2491 		 * end of the mapping, if the trailing size is not enough to
2492 		 * use another superpage (i.e. nr_pages < lvl_pages).
2493 		 */
2494 		pte++;
2495 		if (!nr_pages || first_pte_in_page(pte) ||
2496 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2497 			domain_flush_cache(domain, first_pte,
2498 					   (void *)pte - (void *)first_pte);
2499 			pte = NULL;
2500 		}
2501 	}
2502 
2503 	return 0;
2504 }
2505 
2506 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2507 {
2508 	struct intel_iommu *iommu = info->iommu;
2509 	struct context_entry *context;
2510 	unsigned long flags;
2511 	u16 did_old;
2512 
2513 	if (!iommu)
2514 		return;
2515 
2516 	spin_lock_irqsave(&iommu->lock, flags);
2517 	context = iommu_context_addr(iommu, bus, devfn, 0);
2518 	if (!context) {
2519 		spin_unlock_irqrestore(&iommu->lock, flags);
2520 		return;
2521 	}
2522 
2523 	if (sm_supported(iommu)) {
2524 		if (hw_pass_through && domain_type_is_si(info->domain))
2525 			did_old = FLPT_DEFAULT_DID;
2526 		else
2527 			did_old = info->domain->iommu_did[iommu->seq_id];
2528 	} else {
2529 		did_old = context_domain_id(context);
2530 	}
2531 
2532 	context_clear_entry(context);
2533 	__iommu_flush_cache(iommu, context, sizeof(*context));
2534 	spin_unlock_irqrestore(&iommu->lock, flags);
2535 	iommu->flush.flush_context(iommu,
2536 				   did_old,
2537 				   (((u16)bus) << 8) | devfn,
2538 				   DMA_CCMD_MASK_NOBIT,
2539 				   DMA_CCMD_DEVICE_INVL);
2540 
2541 	if (sm_supported(iommu))
2542 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2543 
2544 	iommu->flush.flush_iotlb(iommu,
2545 				 did_old,
2546 				 0,
2547 				 0,
2548 				 DMA_TLB_DSI_FLUSH);
2549 
2550 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2551 }
2552 
2553 static inline void unlink_domain_info(struct device_domain_info *info)
2554 {
2555 	assert_spin_locked(&device_domain_lock);
2556 	list_del(&info->link);
2557 	list_del(&info->global);
2558 	if (info->dev)
2559 		dev_iommu_priv_set(info->dev, NULL);
2560 }
2561 
2562 static void domain_remove_dev_info(struct dmar_domain *domain)
2563 {
2564 	struct device_domain_info *info, *tmp;
2565 	unsigned long flags;
2566 
2567 	spin_lock_irqsave(&device_domain_lock, flags);
2568 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2569 		__dmar_remove_one_dev_info(info);
2570 	spin_unlock_irqrestore(&device_domain_lock, flags);
2571 }
2572 
2573 struct dmar_domain *find_domain(struct device *dev)
2574 {
2575 	struct device_domain_info *info;
2576 
2577 	if (unlikely(!dev || !dev->iommu))
2578 		return NULL;
2579 
2580 	if (unlikely(attach_deferred(dev)))
2581 		return NULL;
2582 
2583 	/* No lock here, assumes no domain exit in normal case */
2584 	info = get_domain_info(dev);
2585 	if (likely(info))
2586 		return info->domain;
2587 
2588 	return NULL;
2589 }
2590 
2591 static inline struct device_domain_info *
2592 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2593 {
2594 	struct device_domain_info *info;
2595 
2596 	list_for_each_entry(info, &device_domain_list, global)
2597 		if (info->segment == segment && info->bus == bus &&
2598 		    info->devfn == devfn)
2599 			return info;
2600 
2601 	return NULL;
2602 }
2603 
2604 static int domain_setup_first_level(struct intel_iommu *iommu,
2605 				    struct dmar_domain *domain,
2606 				    struct device *dev,
2607 				    u32 pasid)
2608 {
2609 	struct dma_pte *pgd = domain->pgd;
2610 	int agaw, level;
2611 	int flags = 0;
2612 
2613 	/*
2614 	 * Skip top levels of page tables for iommu which has
2615 	 * less agaw than default. Unnecessary for PT mode.
2616 	 */
2617 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2618 		pgd = phys_to_virt(dma_pte_addr(pgd));
2619 		if (!dma_pte_present(pgd))
2620 			return -ENOMEM;
2621 	}
2622 
2623 	level = agaw_to_level(agaw);
2624 	if (level != 4 && level != 5)
2625 		return -EINVAL;
2626 
2627 	if (pasid != PASID_RID2PASID)
2628 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2629 	if (level == 5)
2630 		flags |= PASID_FLAG_FL5LP;
2631 
2632 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2633 		flags |= PASID_FLAG_PAGE_SNOOP;
2634 
2635 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2636 					     domain->iommu_did[iommu->seq_id],
2637 					     flags);
2638 }
2639 
2640 static bool dev_is_real_dma_subdevice(struct device *dev)
2641 {
2642 	return dev && dev_is_pci(dev) &&
2643 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2644 }
2645 
2646 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2647 						    int bus, int devfn,
2648 						    struct device *dev,
2649 						    struct dmar_domain *domain)
2650 {
2651 	struct dmar_domain *found = NULL;
2652 	struct device_domain_info *info;
2653 	unsigned long flags;
2654 	int ret;
2655 
2656 	info = alloc_devinfo_mem();
2657 	if (!info)
2658 		return NULL;
2659 
2660 	if (!dev_is_real_dma_subdevice(dev)) {
2661 		info->bus = bus;
2662 		info->devfn = devfn;
2663 		info->segment = iommu->segment;
2664 	} else {
2665 		struct pci_dev *pdev = to_pci_dev(dev);
2666 
2667 		info->bus = pdev->bus->number;
2668 		info->devfn = pdev->devfn;
2669 		info->segment = pci_domain_nr(pdev->bus);
2670 	}
2671 
2672 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2673 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2674 	info->ats_qdep = 0;
2675 	info->dev = dev;
2676 	info->domain = domain;
2677 	info->iommu = iommu;
2678 	info->pasid_table = NULL;
2679 	info->auxd_enabled = 0;
2680 	INIT_LIST_HEAD(&info->subdevices);
2681 
2682 	if (dev && dev_is_pci(dev)) {
2683 		struct pci_dev *pdev = to_pci_dev(info->dev);
2684 
2685 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2686 		    pci_ats_supported(pdev) &&
2687 		    dmar_find_matched_atsr_unit(pdev))
2688 			info->ats_supported = 1;
2689 
2690 		if (sm_supported(iommu)) {
2691 			if (pasid_supported(iommu)) {
2692 				int features = pci_pasid_features(pdev);
2693 				if (features >= 0)
2694 					info->pasid_supported = features | 1;
2695 			}
2696 
2697 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2698 			    pci_pri_supported(pdev))
2699 				info->pri_supported = 1;
2700 		}
2701 	}
2702 
2703 	spin_lock_irqsave(&device_domain_lock, flags);
2704 	if (dev)
2705 		found = find_domain(dev);
2706 
2707 	if (!found) {
2708 		struct device_domain_info *info2;
2709 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2710 						       info->devfn);
2711 		if (info2) {
2712 			found      = info2->domain;
2713 			info2->dev = dev;
2714 		}
2715 	}
2716 
2717 	if (found) {
2718 		spin_unlock_irqrestore(&device_domain_lock, flags);
2719 		free_devinfo_mem(info);
2720 		/* Caller must free the original domain */
2721 		return found;
2722 	}
2723 
2724 	spin_lock(&iommu->lock);
2725 	ret = domain_attach_iommu(domain, iommu);
2726 	spin_unlock(&iommu->lock);
2727 
2728 	if (ret) {
2729 		spin_unlock_irqrestore(&device_domain_lock, flags);
2730 		free_devinfo_mem(info);
2731 		return NULL;
2732 	}
2733 
2734 	list_add(&info->link, &domain->devices);
2735 	list_add(&info->global, &device_domain_list);
2736 	if (dev)
2737 		dev_iommu_priv_set(dev, info);
2738 	spin_unlock_irqrestore(&device_domain_lock, flags);
2739 
2740 	/* PASID table is mandatory for a PCI device in scalable mode. */
2741 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2742 		ret = intel_pasid_alloc_table(dev);
2743 		if (ret) {
2744 			dev_err(dev, "PASID table allocation failed\n");
2745 			dmar_remove_one_dev_info(dev);
2746 			return NULL;
2747 		}
2748 
2749 		/* Setup the PASID entry for requests without PASID: */
2750 		spin_lock_irqsave(&iommu->lock, flags);
2751 		if (hw_pass_through && domain_type_is_si(domain))
2752 			ret = intel_pasid_setup_pass_through(iommu, domain,
2753 					dev, PASID_RID2PASID);
2754 		else if (domain_use_first_level(domain))
2755 			ret = domain_setup_first_level(iommu, domain, dev,
2756 					PASID_RID2PASID);
2757 		else
2758 			ret = intel_pasid_setup_second_level(iommu, domain,
2759 					dev, PASID_RID2PASID);
2760 		spin_unlock_irqrestore(&iommu->lock, flags);
2761 		if (ret) {
2762 			dev_err(dev, "Setup RID2PASID failed\n");
2763 			dmar_remove_one_dev_info(dev);
2764 			return NULL;
2765 		}
2766 	}
2767 
2768 	if (dev && domain_context_mapping(domain, dev)) {
2769 		dev_err(dev, "Domain context map failed\n");
2770 		dmar_remove_one_dev_info(dev);
2771 		return NULL;
2772 	}
2773 
2774 	return domain;
2775 }
2776 
2777 static int iommu_domain_identity_map(struct dmar_domain *domain,
2778 				     unsigned long first_vpfn,
2779 				     unsigned long last_vpfn)
2780 {
2781 	/*
2782 	 * RMRR range might have overlap with physical memory range,
2783 	 * clear it first
2784 	 */
2785 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2786 
2787 	return __domain_mapping(domain, first_vpfn,
2788 				first_vpfn, last_vpfn - first_vpfn + 1,
2789 				DMA_PTE_READ|DMA_PTE_WRITE);
2790 }
2791 
2792 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2793 
2794 static int __init si_domain_init(int hw)
2795 {
2796 	struct dmar_rmrr_unit *rmrr;
2797 	struct device *dev;
2798 	int i, nid, ret;
2799 
2800 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2801 	if (!si_domain)
2802 		return -EFAULT;
2803 
2804 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2805 		domain_exit(si_domain);
2806 		return -EFAULT;
2807 	}
2808 
2809 	if (hw)
2810 		return 0;
2811 
2812 	for_each_online_node(nid) {
2813 		unsigned long start_pfn, end_pfn;
2814 		int i;
2815 
2816 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2817 			ret = iommu_domain_identity_map(si_domain,
2818 					mm_to_dma_pfn(start_pfn),
2819 					mm_to_dma_pfn(end_pfn));
2820 			if (ret)
2821 				return ret;
2822 		}
2823 	}
2824 
2825 	/*
2826 	 * Identity map the RMRRs so that devices with RMRRs could also use
2827 	 * the si_domain.
2828 	 */
2829 	for_each_rmrr_units(rmrr) {
2830 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2831 					  i, dev) {
2832 			unsigned long long start = rmrr->base_address;
2833 			unsigned long long end = rmrr->end_address;
2834 
2835 			if (WARN_ON(end < start ||
2836 				    end >> agaw_to_width(si_domain->agaw)))
2837 				continue;
2838 
2839 			ret = iommu_domain_identity_map(si_domain,
2840 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2841 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2842 			if (ret)
2843 				return ret;
2844 		}
2845 	}
2846 
2847 	return 0;
2848 }
2849 
2850 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2851 {
2852 	struct dmar_domain *ndomain;
2853 	struct intel_iommu *iommu;
2854 	u8 bus, devfn;
2855 
2856 	iommu = device_to_iommu(dev, &bus, &devfn);
2857 	if (!iommu)
2858 		return -ENODEV;
2859 
2860 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2861 	if (ndomain != domain)
2862 		return -EBUSY;
2863 
2864 	return 0;
2865 }
2866 
2867 static bool device_has_rmrr(struct device *dev)
2868 {
2869 	struct dmar_rmrr_unit *rmrr;
2870 	struct device *tmp;
2871 	int i;
2872 
2873 	rcu_read_lock();
2874 	for_each_rmrr_units(rmrr) {
2875 		/*
2876 		 * Return TRUE if this RMRR contains the device that
2877 		 * is passed in.
2878 		 */
2879 		for_each_active_dev_scope(rmrr->devices,
2880 					  rmrr->devices_cnt, i, tmp)
2881 			if (tmp == dev ||
2882 			    is_downstream_to_pci_bridge(dev, tmp)) {
2883 				rcu_read_unlock();
2884 				return true;
2885 			}
2886 	}
2887 	rcu_read_unlock();
2888 	return false;
2889 }
2890 
2891 /**
2892  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2893  * is relaxable (ie. is allowed to be not enforced under some conditions)
2894  * @dev: device handle
2895  *
2896  * We assume that PCI USB devices with RMRRs have them largely
2897  * for historical reasons and that the RMRR space is not actively used post
2898  * boot.  This exclusion may change if vendors begin to abuse it.
2899  *
2900  * The same exception is made for graphics devices, with the requirement that
2901  * any use of the RMRR regions will be torn down before assigning the device
2902  * to a guest.
2903  *
2904  * Return: true if the RMRR is relaxable, false otherwise
2905  */
2906 static bool device_rmrr_is_relaxable(struct device *dev)
2907 {
2908 	struct pci_dev *pdev;
2909 
2910 	if (!dev_is_pci(dev))
2911 		return false;
2912 
2913 	pdev = to_pci_dev(dev);
2914 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2915 		return true;
2916 	else
2917 		return false;
2918 }
2919 
2920 /*
2921  * There are a couple cases where we need to restrict the functionality of
2922  * devices associated with RMRRs.  The first is when evaluating a device for
2923  * identity mapping because problems exist when devices are moved in and out
2924  * of domains and their respective RMRR information is lost.  This means that
2925  * a device with associated RMRRs will never be in a "passthrough" domain.
2926  * The second is use of the device through the IOMMU API.  This interface
2927  * expects to have full control of the IOVA space for the device.  We cannot
2928  * satisfy both the requirement that RMRR access is maintained and have an
2929  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2930  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2931  * We therefore prevent devices associated with an RMRR from participating in
2932  * the IOMMU API, which eliminates them from device assignment.
2933  *
2934  * In both cases, devices which have relaxable RMRRs are not concerned by this
2935  * restriction. See device_rmrr_is_relaxable comment.
2936  */
2937 static bool device_is_rmrr_locked(struct device *dev)
2938 {
2939 	if (!device_has_rmrr(dev))
2940 		return false;
2941 
2942 	if (device_rmrr_is_relaxable(dev))
2943 		return false;
2944 
2945 	return true;
2946 }
2947 
2948 /*
2949  * Return the required default domain type for a specific device.
2950  *
2951  * @dev: the device in query
2952  * @startup: true if this is during early boot
2953  *
2954  * Returns:
2955  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2956  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2957  *  - 0: both identity and dynamic domains work for this device
2958  */
2959 static int device_def_domain_type(struct device *dev)
2960 {
2961 	if (dev_is_pci(dev)) {
2962 		struct pci_dev *pdev = to_pci_dev(dev);
2963 
2964 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2965 			return IOMMU_DOMAIN_IDENTITY;
2966 
2967 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2968 			return IOMMU_DOMAIN_IDENTITY;
2969 	}
2970 
2971 	return 0;
2972 }
2973 
2974 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2975 {
2976 	/*
2977 	 * Start from the sane iommu hardware state.
2978 	 * If the queued invalidation is already initialized by us
2979 	 * (for example, while enabling interrupt-remapping) then
2980 	 * we got the things already rolling from a sane state.
2981 	 */
2982 	if (!iommu->qi) {
2983 		/*
2984 		 * Clear any previous faults.
2985 		 */
2986 		dmar_fault(-1, iommu);
2987 		/*
2988 		 * Disable queued invalidation if supported and already enabled
2989 		 * before OS handover.
2990 		 */
2991 		dmar_disable_qi(iommu);
2992 	}
2993 
2994 	if (dmar_enable_qi(iommu)) {
2995 		/*
2996 		 * Queued Invalidate not enabled, use Register Based Invalidate
2997 		 */
2998 		iommu->flush.flush_context = __iommu_flush_context;
2999 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3000 		pr_info("%s: Using Register based invalidation\n",
3001 			iommu->name);
3002 	} else {
3003 		iommu->flush.flush_context = qi_flush_context;
3004 		iommu->flush.flush_iotlb = qi_flush_iotlb;
3005 		pr_info("%s: Using Queued invalidation\n", iommu->name);
3006 	}
3007 }
3008 
3009 static int copy_context_table(struct intel_iommu *iommu,
3010 			      struct root_entry *old_re,
3011 			      struct context_entry **tbl,
3012 			      int bus, bool ext)
3013 {
3014 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3015 	struct context_entry *new_ce = NULL, ce;
3016 	struct context_entry *old_ce = NULL;
3017 	struct root_entry re;
3018 	phys_addr_t old_ce_phys;
3019 
3020 	tbl_idx = ext ? bus * 2 : bus;
3021 	memcpy(&re, old_re, sizeof(re));
3022 
3023 	for (devfn = 0; devfn < 256; devfn++) {
3024 		/* First calculate the correct index */
3025 		idx = (ext ? devfn * 2 : devfn) % 256;
3026 
3027 		if (idx == 0) {
3028 			/* First save what we may have and clean up */
3029 			if (new_ce) {
3030 				tbl[tbl_idx] = new_ce;
3031 				__iommu_flush_cache(iommu, new_ce,
3032 						    VTD_PAGE_SIZE);
3033 				pos = 1;
3034 			}
3035 
3036 			if (old_ce)
3037 				memunmap(old_ce);
3038 
3039 			ret = 0;
3040 			if (devfn < 0x80)
3041 				old_ce_phys = root_entry_lctp(&re);
3042 			else
3043 				old_ce_phys = root_entry_uctp(&re);
3044 
3045 			if (!old_ce_phys) {
3046 				if (ext && devfn == 0) {
3047 					/* No LCTP, try UCTP */
3048 					devfn = 0x7f;
3049 					continue;
3050 				} else {
3051 					goto out;
3052 				}
3053 			}
3054 
3055 			ret = -ENOMEM;
3056 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3057 					MEMREMAP_WB);
3058 			if (!old_ce)
3059 				goto out;
3060 
3061 			new_ce = alloc_pgtable_page(iommu->node);
3062 			if (!new_ce)
3063 				goto out_unmap;
3064 
3065 			ret = 0;
3066 		}
3067 
3068 		/* Now copy the context entry */
3069 		memcpy(&ce, old_ce + idx, sizeof(ce));
3070 
3071 		if (!__context_present(&ce))
3072 			continue;
3073 
3074 		did = context_domain_id(&ce);
3075 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3076 			set_bit(did, iommu->domain_ids);
3077 
3078 		/*
3079 		 * We need a marker for copied context entries. This
3080 		 * marker needs to work for the old format as well as
3081 		 * for extended context entries.
3082 		 *
3083 		 * Bit 67 of the context entry is used. In the old
3084 		 * format this bit is available to software, in the
3085 		 * extended format it is the PGE bit, but PGE is ignored
3086 		 * by HW if PASIDs are disabled (and thus still
3087 		 * available).
3088 		 *
3089 		 * So disable PASIDs first and then mark the entry
3090 		 * copied. This means that we don't copy PASID
3091 		 * translations from the old kernel, but this is fine as
3092 		 * faults there are not fatal.
3093 		 */
3094 		context_clear_pasid_enable(&ce);
3095 		context_set_copied(&ce);
3096 
3097 		new_ce[idx] = ce;
3098 	}
3099 
3100 	tbl[tbl_idx + pos] = new_ce;
3101 
3102 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3103 
3104 out_unmap:
3105 	memunmap(old_ce);
3106 
3107 out:
3108 	return ret;
3109 }
3110 
3111 static int copy_translation_tables(struct intel_iommu *iommu)
3112 {
3113 	struct context_entry **ctxt_tbls;
3114 	struct root_entry *old_rt;
3115 	phys_addr_t old_rt_phys;
3116 	int ctxt_table_entries;
3117 	unsigned long flags;
3118 	u64 rtaddr_reg;
3119 	int bus, ret;
3120 	bool new_ext, ext;
3121 
3122 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3123 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3124 	new_ext    = !!ecap_ecs(iommu->ecap);
3125 
3126 	/*
3127 	 * The RTT bit can only be changed when translation is disabled,
3128 	 * but disabling translation means to open a window for data
3129 	 * corruption. So bail out and don't copy anything if we would
3130 	 * have to change the bit.
3131 	 */
3132 	if (new_ext != ext)
3133 		return -EINVAL;
3134 
3135 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3136 	if (!old_rt_phys)
3137 		return -EINVAL;
3138 
3139 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3140 	if (!old_rt)
3141 		return -ENOMEM;
3142 
3143 	/* This is too big for the stack - allocate it from slab */
3144 	ctxt_table_entries = ext ? 512 : 256;
3145 	ret = -ENOMEM;
3146 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3147 	if (!ctxt_tbls)
3148 		goto out_unmap;
3149 
3150 	for (bus = 0; bus < 256; bus++) {
3151 		ret = copy_context_table(iommu, &old_rt[bus],
3152 					 ctxt_tbls, bus, ext);
3153 		if (ret) {
3154 			pr_err("%s: Failed to copy context table for bus %d\n",
3155 				iommu->name, bus);
3156 			continue;
3157 		}
3158 	}
3159 
3160 	spin_lock_irqsave(&iommu->lock, flags);
3161 
3162 	/* Context tables are copied, now write them to the root_entry table */
3163 	for (bus = 0; bus < 256; bus++) {
3164 		int idx = ext ? bus * 2 : bus;
3165 		u64 val;
3166 
3167 		if (ctxt_tbls[idx]) {
3168 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3169 			iommu->root_entry[bus].lo = val;
3170 		}
3171 
3172 		if (!ext || !ctxt_tbls[idx + 1])
3173 			continue;
3174 
3175 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3176 		iommu->root_entry[bus].hi = val;
3177 	}
3178 
3179 	spin_unlock_irqrestore(&iommu->lock, flags);
3180 
3181 	kfree(ctxt_tbls);
3182 
3183 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3184 
3185 	ret = 0;
3186 
3187 out_unmap:
3188 	memunmap(old_rt);
3189 
3190 	return ret;
3191 }
3192 
3193 #ifdef CONFIG_INTEL_IOMMU_SVM
3194 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3195 {
3196 	struct intel_iommu *iommu = data;
3197 	ioasid_t ioasid;
3198 
3199 	if (!iommu)
3200 		return INVALID_IOASID;
3201 	/*
3202 	 * VT-d virtual command interface always uses the full 20 bit
3203 	 * PASID range. Host can partition guest PASID range based on
3204 	 * policies but it is out of guest's control.
3205 	 */
3206 	if (min < PASID_MIN || max > intel_pasid_max_id)
3207 		return INVALID_IOASID;
3208 
3209 	if (vcmd_alloc_pasid(iommu, &ioasid))
3210 		return INVALID_IOASID;
3211 
3212 	return ioasid;
3213 }
3214 
3215 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3216 {
3217 	struct intel_iommu *iommu = data;
3218 
3219 	if (!iommu)
3220 		return;
3221 	/*
3222 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3223 	 * We can only free the PASID when all the devices are unbound.
3224 	 */
3225 	if (ioasid_find(NULL, ioasid, NULL)) {
3226 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3227 		return;
3228 	}
3229 	vcmd_free_pasid(iommu, ioasid);
3230 }
3231 
3232 static void register_pasid_allocator(struct intel_iommu *iommu)
3233 {
3234 	/*
3235 	 * If we are running in the host, no need for custom allocator
3236 	 * in that PASIDs are allocated from the host system-wide.
3237 	 */
3238 	if (!cap_caching_mode(iommu->cap))
3239 		return;
3240 
3241 	if (!sm_supported(iommu)) {
3242 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3243 		return;
3244 	}
3245 
3246 	/*
3247 	 * Register a custom PASID allocator if we are running in a guest,
3248 	 * guest PASID must be obtained via virtual command interface.
3249 	 * There can be multiple vIOMMUs in each guest but only one allocator
3250 	 * is active. All vIOMMU allocators will eventually be calling the same
3251 	 * host allocator.
3252 	 */
3253 	if (!vccap_pasid(iommu->vccap))
3254 		return;
3255 
3256 	pr_info("Register custom PASID allocator\n");
3257 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3258 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3259 	iommu->pasid_allocator.pdata = (void *)iommu;
3260 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3261 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3262 		/*
3263 		 * Disable scalable mode on this IOMMU if there
3264 		 * is no custom allocator. Mixing SM capable vIOMMU
3265 		 * and non-SM vIOMMU are not supported.
3266 		 */
3267 		intel_iommu_sm = 0;
3268 	}
3269 }
3270 #endif
3271 
3272 static int __init init_dmars(void)
3273 {
3274 	struct dmar_drhd_unit *drhd;
3275 	struct intel_iommu *iommu;
3276 	int ret;
3277 
3278 	/*
3279 	 * for each drhd
3280 	 *    allocate root
3281 	 *    initialize and program root entry to not present
3282 	 * endfor
3283 	 */
3284 	for_each_drhd_unit(drhd) {
3285 		/*
3286 		 * lock not needed as this is only incremented in the single
3287 		 * threaded kernel __init code path all other access are read
3288 		 * only
3289 		 */
3290 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3291 			g_num_of_iommus++;
3292 			continue;
3293 		}
3294 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3295 	}
3296 
3297 	/* Preallocate enough resources for IOMMU hot-addition */
3298 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3299 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3300 
3301 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3302 			GFP_KERNEL);
3303 	if (!g_iommus) {
3304 		ret = -ENOMEM;
3305 		goto error;
3306 	}
3307 
3308 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3309 	if (ret)
3310 		goto free_iommu;
3311 
3312 	for_each_iommu(iommu, drhd) {
3313 		if (drhd->ignored) {
3314 			iommu_disable_translation(iommu);
3315 			continue;
3316 		}
3317 
3318 		/*
3319 		 * Find the max pasid size of all IOMMU's in the system.
3320 		 * We need to ensure the system pasid table is no bigger
3321 		 * than the smallest supported.
3322 		 */
3323 		if (pasid_supported(iommu)) {
3324 			u32 temp = 2 << ecap_pss(iommu->ecap);
3325 
3326 			intel_pasid_max_id = min_t(u32, temp,
3327 						   intel_pasid_max_id);
3328 		}
3329 
3330 		g_iommus[iommu->seq_id] = iommu;
3331 
3332 		intel_iommu_init_qi(iommu);
3333 
3334 		ret = iommu_init_domains(iommu);
3335 		if (ret)
3336 			goto free_iommu;
3337 
3338 		init_translation_status(iommu);
3339 
3340 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3341 			iommu_disable_translation(iommu);
3342 			clear_translation_pre_enabled(iommu);
3343 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3344 				iommu->name);
3345 		}
3346 
3347 		/*
3348 		 * TBD:
3349 		 * we could share the same root & context tables
3350 		 * among all IOMMU's. Need to Split it later.
3351 		 */
3352 		ret = iommu_alloc_root_entry(iommu);
3353 		if (ret)
3354 			goto free_iommu;
3355 
3356 		if (translation_pre_enabled(iommu)) {
3357 			pr_info("Translation already enabled - trying to copy translation structures\n");
3358 
3359 			ret = copy_translation_tables(iommu);
3360 			if (ret) {
3361 				/*
3362 				 * We found the IOMMU with translation
3363 				 * enabled - but failed to copy over the
3364 				 * old root-entry table. Try to proceed
3365 				 * by disabling translation now and
3366 				 * allocating a clean root-entry table.
3367 				 * This might cause DMAR faults, but
3368 				 * probably the dump will still succeed.
3369 				 */
3370 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3371 				       iommu->name);
3372 				iommu_disable_translation(iommu);
3373 				clear_translation_pre_enabled(iommu);
3374 			} else {
3375 				pr_info("Copied translation tables from previous kernel for %s\n",
3376 					iommu->name);
3377 			}
3378 		}
3379 
3380 		if (!ecap_pass_through(iommu->ecap))
3381 			hw_pass_through = 0;
3382 		intel_svm_check(iommu);
3383 	}
3384 
3385 	/*
3386 	 * Now that qi is enabled on all iommus, set the root entry and flush
3387 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3388 	 * flush_context function will loop forever and the boot hangs.
3389 	 */
3390 	for_each_active_iommu(iommu, drhd) {
3391 		iommu_flush_write_buffer(iommu);
3392 #ifdef CONFIG_INTEL_IOMMU_SVM
3393 		register_pasid_allocator(iommu);
3394 #endif
3395 		iommu_set_root_entry(iommu);
3396 	}
3397 
3398 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3399 	dmar_map_gfx = 0;
3400 #endif
3401 
3402 	if (!dmar_map_gfx)
3403 		iommu_identity_mapping |= IDENTMAP_GFX;
3404 
3405 	check_tylersburg_isoch();
3406 
3407 	ret = si_domain_init(hw_pass_through);
3408 	if (ret)
3409 		goto free_iommu;
3410 
3411 	/*
3412 	 * for each drhd
3413 	 *   enable fault log
3414 	 *   global invalidate context cache
3415 	 *   global invalidate iotlb
3416 	 *   enable translation
3417 	 */
3418 	for_each_iommu(iommu, drhd) {
3419 		if (drhd->ignored) {
3420 			/*
3421 			 * we always have to disable PMRs or DMA may fail on
3422 			 * this device
3423 			 */
3424 			if (force_on)
3425 				iommu_disable_protect_mem_regions(iommu);
3426 			continue;
3427 		}
3428 
3429 		iommu_flush_write_buffer(iommu);
3430 
3431 #ifdef CONFIG_INTEL_IOMMU_SVM
3432 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3433 			/*
3434 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3435 			 * could cause possible lock race condition.
3436 			 */
3437 			up_write(&dmar_global_lock);
3438 			ret = intel_svm_enable_prq(iommu);
3439 			down_write(&dmar_global_lock);
3440 			if (ret)
3441 				goto free_iommu;
3442 		}
3443 #endif
3444 		ret = dmar_set_interrupt(iommu);
3445 		if (ret)
3446 			goto free_iommu;
3447 	}
3448 
3449 	return 0;
3450 
3451 free_iommu:
3452 	for_each_active_iommu(iommu, drhd) {
3453 		disable_dmar_iommu(iommu);
3454 		free_dmar_iommu(iommu);
3455 	}
3456 
3457 	kfree(g_iommus);
3458 
3459 error:
3460 	return ret;
3461 }
3462 
3463 static inline int iommu_domain_cache_init(void)
3464 {
3465 	int ret = 0;
3466 
3467 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3468 					 sizeof(struct dmar_domain),
3469 					 0,
3470 					 SLAB_HWCACHE_ALIGN,
3471 
3472 					 NULL);
3473 	if (!iommu_domain_cache) {
3474 		pr_err("Couldn't create iommu_domain cache\n");
3475 		ret = -ENOMEM;
3476 	}
3477 
3478 	return ret;
3479 }
3480 
3481 static inline int iommu_devinfo_cache_init(void)
3482 {
3483 	int ret = 0;
3484 
3485 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3486 					 sizeof(struct device_domain_info),
3487 					 0,
3488 					 SLAB_HWCACHE_ALIGN,
3489 					 NULL);
3490 	if (!iommu_devinfo_cache) {
3491 		pr_err("Couldn't create devinfo cache\n");
3492 		ret = -ENOMEM;
3493 	}
3494 
3495 	return ret;
3496 }
3497 
3498 static int __init iommu_init_mempool(void)
3499 {
3500 	int ret;
3501 	ret = iova_cache_get();
3502 	if (ret)
3503 		return ret;
3504 
3505 	ret = iommu_domain_cache_init();
3506 	if (ret)
3507 		goto domain_error;
3508 
3509 	ret = iommu_devinfo_cache_init();
3510 	if (!ret)
3511 		return ret;
3512 
3513 	kmem_cache_destroy(iommu_domain_cache);
3514 domain_error:
3515 	iova_cache_put();
3516 
3517 	return -ENOMEM;
3518 }
3519 
3520 static void __init iommu_exit_mempool(void)
3521 {
3522 	kmem_cache_destroy(iommu_devinfo_cache);
3523 	kmem_cache_destroy(iommu_domain_cache);
3524 	iova_cache_put();
3525 }
3526 
3527 static void __init init_no_remapping_devices(void)
3528 {
3529 	struct dmar_drhd_unit *drhd;
3530 	struct device *dev;
3531 	int i;
3532 
3533 	for_each_drhd_unit(drhd) {
3534 		if (!drhd->include_all) {
3535 			for_each_active_dev_scope(drhd->devices,
3536 						  drhd->devices_cnt, i, dev)
3537 				break;
3538 			/* ignore DMAR unit if no devices exist */
3539 			if (i == drhd->devices_cnt)
3540 				drhd->ignored = 1;
3541 		}
3542 	}
3543 
3544 	for_each_active_drhd_unit(drhd) {
3545 		if (drhd->include_all)
3546 			continue;
3547 
3548 		for_each_active_dev_scope(drhd->devices,
3549 					  drhd->devices_cnt, i, dev)
3550 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3551 				break;
3552 		if (i < drhd->devices_cnt)
3553 			continue;
3554 
3555 		/* This IOMMU has *only* gfx devices. Either bypass it or
3556 		   set the gfx_mapped flag, as appropriate */
3557 		drhd->gfx_dedicated = 1;
3558 		if (!dmar_map_gfx)
3559 			drhd->ignored = 1;
3560 	}
3561 }
3562 
3563 #ifdef CONFIG_SUSPEND
3564 static int init_iommu_hw(void)
3565 {
3566 	struct dmar_drhd_unit *drhd;
3567 	struct intel_iommu *iommu = NULL;
3568 
3569 	for_each_active_iommu(iommu, drhd)
3570 		if (iommu->qi)
3571 			dmar_reenable_qi(iommu);
3572 
3573 	for_each_iommu(iommu, drhd) {
3574 		if (drhd->ignored) {
3575 			/*
3576 			 * we always have to disable PMRs or DMA may fail on
3577 			 * this device
3578 			 */
3579 			if (force_on)
3580 				iommu_disable_protect_mem_regions(iommu);
3581 			continue;
3582 		}
3583 
3584 		iommu_flush_write_buffer(iommu);
3585 		iommu_set_root_entry(iommu);
3586 		iommu_enable_translation(iommu);
3587 		iommu_disable_protect_mem_regions(iommu);
3588 	}
3589 
3590 	return 0;
3591 }
3592 
3593 static void iommu_flush_all(void)
3594 {
3595 	struct dmar_drhd_unit *drhd;
3596 	struct intel_iommu *iommu;
3597 
3598 	for_each_active_iommu(iommu, drhd) {
3599 		iommu->flush.flush_context(iommu, 0, 0, 0,
3600 					   DMA_CCMD_GLOBAL_INVL);
3601 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3602 					 DMA_TLB_GLOBAL_FLUSH);
3603 	}
3604 }
3605 
3606 static int iommu_suspend(void)
3607 {
3608 	struct dmar_drhd_unit *drhd;
3609 	struct intel_iommu *iommu = NULL;
3610 	unsigned long flag;
3611 
3612 	for_each_active_iommu(iommu, drhd) {
3613 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3614 					     GFP_KERNEL);
3615 		if (!iommu->iommu_state)
3616 			goto nomem;
3617 	}
3618 
3619 	iommu_flush_all();
3620 
3621 	for_each_active_iommu(iommu, drhd) {
3622 		iommu_disable_translation(iommu);
3623 
3624 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3625 
3626 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3627 			readl(iommu->reg + DMAR_FECTL_REG);
3628 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3629 			readl(iommu->reg + DMAR_FEDATA_REG);
3630 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3631 			readl(iommu->reg + DMAR_FEADDR_REG);
3632 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3633 			readl(iommu->reg + DMAR_FEUADDR_REG);
3634 
3635 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3636 	}
3637 	return 0;
3638 
3639 nomem:
3640 	for_each_active_iommu(iommu, drhd)
3641 		kfree(iommu->iommu_state);
3642 
3643 	return -ENOMEM;
3644 }
3645 
3646 static void iommu_resume(void)
3647 {
3648 	struct dmar_drhd_unit *drhd;
3649 	struct intel_iommu *iommu = NULL;
3650 	unsigned long flag;
3651 
3652 	if (init_iommu_hw()) {
3653 		if (force_on)
3654 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3655 		else
3656 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3657 		return;
3658 	}
3659 
3660 	for_each_active_iommu(iommu, drhd) {
3661 
3662 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3663 
3664 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3665 			iommu->reg + DMAR_FECTL_REG);
3666 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3667 			iommu->reg + DMAR_FEDATA_REG);
3668 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3669 			iommu->reg + DMAR_FEADDR_REG);
3670 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3671 			iommu->reg + DMAR_FEUADDR_REG);
3672 
3673 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3674 	}
3675 
3676 	for_each_active_iommu(iommu, drhd)
3677 		kfree(iommu->iommu_state);
3678 }
3679 
3680 static struct syscore_ops iommu_syscore_ops = {
3681 	.resume		= iommu_resume,
3682 	.suspend	= iommu_suspend,
3683 };
3684 
3685 static void __init init_iommu_pm_ops(void)
3686 {
3687 	register_syscore_ops(&iommu_syscore_ops);
3688 }
3689 
3690 #else
3691 static inline void init_iommu_pm_ops(void) {}
3692 #endif	/* CONFIG_PM */
3693 
3694 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3695 {
3696 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3697 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3698 	    rmrr->end_address <= rmrr->base_address ||
3699 	    arch_rmrr_sanity_check(rmrr))
3700 		return -EINVAL;
3701 
3702 	return 0;
3703 }
3704 
3705 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3706 {
3707 	struct acpi_dmar_reserved_memory *rmrr;
3708 	struct dmar_rmrr_unit *rmrru;
3709 
3710 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3711 	if (rmrr_sanity_check(rmrr)) {
3712 		pr_warn(FW_BUG
3713 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3714 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3715 			   rmrr->base_address, rmrr->end_address,
3716 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3717 			   dmi_get_system_info(DMI_BIOS_VERSION),
3718 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3719 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3720 	}
3721 
3722 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3723 	if (!rmrru)
3724 		goto out;
3725 
3726 	rmrru->hdr = header;
3727 
3728 	rmrru->base_address = rmrr->base_address;
3729 	rmrru->end_address = rmrr->end_address;
3730 
3731 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3732 				((void *)rmrr) + rmrr->header.length,
3733 				&rmrru->devices_cnt);
3734 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3735 		goto free_rmrru;
3736 
3737 	list_add(&rmrru->list, &dmar_rmrr_units);
3738 
3739 	return 0;
3740 free_rmrru:
3741 	kfree(rmrru);
3742 out:
3743 	return -ENOMEM;
3744 }
3745 
3746 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3747 {
3748 	struct dmar_atsr_unit *atsru;
3749 	struct acpi_dmar_atsr *tmp;
3750 
3751 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3752 				dmar_rcu_check()) {
3753 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3754 		if (atsr->segment != tmp->segment)
3755 			continue;
3756 		if (atsr->header.length != tmp->header.length)
3757 			continue;
3758 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3759 			return atsru;
3760 	}
3761 
3762 	return NULL;
3763 }
3764 
3765 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3766 {
3767 	struct acpi_dmar_atsr *atsr;
3768 	struct dmar_atsr_unit *atsru;
3769 
3770 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3771 		return 0;
3772 
3773 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3774 	atsru = dmar_find_atsr(atsr);
3775 	if (atsru)
3776 		return 0;
3777 
3778 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3779 	if (!atsru)
3780 		return -ENOMEM;
3781 
3782 	/*
3783 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3784 	 * copy the memory content because the memory buffer will be freed
3785 	 * on return.
3786 	 */
3787 	atsru->hdr = (void *)(atsru + 1);
3788 	memcpy(atsru->hdr, hdr, hdr->length);
3789 	atsru->include_all = atsr->flags & 0x1;
3790 	if (!atsru->include_all) {
3791 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3792 				(void *)atsr + atsr->header.length,
3793 				&atsru->devices_cnt);
3794 		if (atsru->devices_cnt && atsru->devices == NULL) {
3795 			kfree(atsru);
3796 			return -ENOMEM;
3797 		}
3798 	}
3799 
3800 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3801 
3802 	return 0;
3803 }
3804 
3805 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3806 {
3807 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3808 	kfree(atsru);
3809 }
3810 
3811 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3812 {
3813 	struct acpi_dmar_atsr *atsr;
3814 	struct dmar_atsr_unit *atsru;
3815 
3816 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3817 	atsru = dmar_find_atsr(atsr);
3818 	if (atsru) {
3819 		list_del_rcu(&atsru->list);
3820 		synchronize_rcu();
3821 		intel_iommu_free_atsr(atsru);
3822 	}
3823 
3824 	return 0;
3825 }
3826 
3827 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3828 {
3829 	int i;
3830 	struct device *dev;
3831 	struct acpi_dmar_atsr *atsr;
3832 	struct dmar_atsr_unit *atsru;
3833 
3834 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3835 	atsru = dmar_find_atsr(atsr);
3836 	if (!atsru)
3837 		return 0;
3838 
3839 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3840 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3841 					  i, dev)
3842 			return -EBUSY;
3843 	}
3844 
3845 	return 0;
3846 }
3847 
3848 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3849 {
3850 	struct dmar_satc_unit *satcu;
3851 	struct acpi_dmar_satc *tmp;
3852 
3853 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3854 				dmar_rcu_check()) {
3855 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3856 		if (satc->segment != tmp->segment)
3857 			continue;
3858 		if (satc->header.length != tmp->header.length)
3859 			continue;
3860 		if (memcmp(satc, tmp, satc->header.length) == 0)
3861 			return satcu;
3862 	}
3863 
3864 	return NULL;
3865 }
3866 
3867 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3868 {
3869 	struct acpi_dmar_satc *satc;
3870 	struct dmar_satc_unit *satcu;
3871 
3872 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3873 		return 0;
3874 
3875 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3876 	satcu = dmar_find_satc(satc);
3877 	if (satcu)
3878 		return 0;
3879 
3880 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3881 	if (!satcu)
3882 		return -ENOMEM;
3883 
3884 	satcu->hdr = (void *)(satcu + 1);
3885 	memcpy(satcu->hdr, hdr, hdr->length);
3886 	satcu->atc_required = satc->flags & 0x1;
3887 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3888 					      (void *)satc + satc->header.length,
3889 					      &satcu->devices_cnt);
3890 	if (satcu->devices_cnt && !satcu->devices) {
3891 		kfree(satcu);
3892 		return -ENOMEM;
3893 	}
3894 	list_add_rcu(&satcu->list, &dmar_satc_units);
3895 
3896 	return 0;
3897 }
3898 
3899 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3900 {
3901 	int sp, ret;
3902 	struct intel_iommu *iommu = dmaru->iommu;
3903 
3904 	if (g_iommus[iommu->seq_id])
3905 		return 0;
3906 
3907 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3908 	if (ret)
3909 		goto out;
3910 
3911 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3912 		pr_warn("%s: Doesn't support hardware pass through.\n",
3913 			iommu->name);
3914 		return -ENXIO;
3915 	}
3916 	if (!ecap_sc_support(iommu->ecap) &&
3917 	    domain_update_iommu_snooping(iommu)) {
3918 		pr_warn("%s: Doesn't support snooping.\n",
3919 			iommu->name);
3920 		return -ENXIO;
3921 	}
3922 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3923 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3924 		pr_warn("%s: Doesn't support large page.\n",
3925 			iommu->name);
3926 		return -ENXIO;
3927 	}
3928 
3929 	/*
3930 	 * Disable translation if already enabled prior to OS handover.
3931 	 */
3932 	if (iommu->gcmd & DMA_GCMD_TE)
3933 		iommu_disable_translation(iommu);
3934 
3935 	g_iommus[iommu->seq_id] = iommu;
3936 	ret = iommu_init_domains(iommu);
3937 	if (ret == 0)
3938 		ret = iommu_alloc_root_entry(iommu);
3939 	if (ret)
3940 		goto out;
3941 
3942 	intel_svm_check(iommu);
3943 
3944 	if (dmaru->ignored) {
3945 		/*
3946 		 * we always have to disable PMRs or DMA may fail on this device
3947 		 */
3948 		if (force_on)
3949 			iommu_disable_protect_mem_regions(iommu);
3950 		return 0;
3951 	}
3952 
3953 	intel_iommu_init_qi(iommu);
3954 	iommu_flush_write_buffer(iommu);
3955 
3956 #ifdef CONFIG_INTEL_IOMMU_SVM
3957 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3958 		ret = intel_svm_enable_prq(iommu);
3959 		if (ret)
3960 			goto disable_iommu;
3961 	}
3962 #endif
3963 	ret = dmar_set_interrupt(iommu);
3964 	if (ret)
3965 		goto disable_iommu;
3966 
3967 	iommu_set_root_entry(iommu);
3968 	iommu_enable_translation(iommu);
3969 
3970 	iommu_disable_protect_mem_regions(iommu);
3971 	return 0;
3972 
3973 disable_iommu:
3974 	disable_dmar_iommu(iommu);
3975 out:
3976 	free_dmar_iommu(iommu);
3977 	return ret;
3978 }
3979 
3980 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3981 {
3982 	int ret = 0;
3983 	struct intel_iommu *iommu = dmaru->iommu;
3984 
3985 	if (!intel_iommu_enabled)
3986 		return 0;
3987 	if (iommu == NULL)
3988 		return -EINVAL;
3989 
3990 	if (insert) {
3991 		ret = intel_iommu_add(dmaru);
3992 	} else {
3993 		disable_dmar_iommu(iommu);
3994 		free_dmar_iommu(iommu);
3995 	}
3996 
3997 	return ret;
3998 }
3999 
4000 static void intel_iommu_free_dmars(void)
4001 {
4002 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4003 	struct dmar_atsr_unit *atsru, *atsr_n;
4004 	struct dmar_satc_unit *satcu, *satc_n;
4005 
4006 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4007 		list_del(&rmrru->list);
4008 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4009 		kfree(rmrru);
4010 	}
4011 
4012 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4013 		list_del(&atsru->list);
4014 		intel_iommu_free_atsr(atsru);
4015 	}
4016 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
4017 		list_del(&satcu->list);
4018 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
4019 		kfree(satcu);
4020 	}
4021 }
4022 
4023 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4024 {
4025 	int i, ret = 1;
4026 	struct pci_bus *bus;
4027 	struct pci_dev *bridge = NULL;
4028 	struct device *tmp;
4029 	struct acpi_dmar_atsr *atsr;
4030 	struct dmar_atsr_unit *atsru;
4031 
4032 	dev = pci_physfn(dev);
4033 	for (bus = dev->bus; bus; bus = bus->parent) {
4034 		bridge = bus->self;
4035 		/* If it's an integrated device, allow ATS */
4036 		if (!bridge)
4037 			return 1;
4038 		/* Connected via non-PCIe: no ATS */
4039 		if (!pci_is_pcie(bridge) ||
4040 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4041 			return 0;
4042 		/* If we found the root port, look it up in the ATSR */
4043 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4044 			break;
4045 	}
4046 
4047 	rcu_read_lock();
4048 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4049 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4050 		if (atsr->segment != pci_domain_nr(dev->bus))
4051 			continue;
4052 
4053 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4054 			if (tmp == &bridge->dev)
4055 				goto out;
4056 
4057 		if (atsru->include_all)
4058 			goto out;
4059 	}
4060 	ret = 0;
4061 out:
4062 	rcu_read_unlock();
4063 
4064 	return ret;
4065 }
4066 
4067 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4068 {
4069 	int ret;
4070 	struct dmar_rmrr_unit *rmrru;
4071 	struct dmar_atsr_unit *atsru;
4072 	struct dmar_satc_unit *satcu;
4073 	struct acpi_dmar_atsr *atsr;
4074 	struct acpi_dmar_reserved_memory *rmrr;
4075 	struct acpi_dmar_satc *satc;
4076 
4077 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4078 		return 0;
4079 
4080 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4081 		rmrr = container_of(rmrru->hdr,
4082 				    struct acpi_dmar_reserved_memory, header);
4083 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4084 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4085 				((void *)rmrr) + rmrr->header.length,
4086 				rmrr->segment, rmrru->devices,
4087 				rmrru->devices_cnt);
4088 			if (ret < 0)
4089 				return ret;
4090 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4091 			dmar_remove_dev_scope(info, rmrr->segment,
4092 				rmrru->devices, rmrru->devices_cnt);
4093 		}
4094 	}
4095 
4096 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4097 		if (atsru->include_all)
4098 			continue;
4099 
4100 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4101 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4102 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4103 					(void *)atsr + atsr->header.length,
4104 					atsr->segment, atsru->devices,
4105 					atsru->devices_cnt);
4106 			if (ret > 0)
4107 				break;
4108 			else if (ret < 0)
4109 				return ret;
4110 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4111 			if (dmar_remove_dev_scope(info, atsr->segment,
4112 					atsru->devices, atsru->devices_cnt))
4113 				break;
4114 		}
4115 	}
4116 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4117 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4118 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4119 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4120 					(void *)satc + satc->header.length,
4121 					satc->segment, satcu->devices,
4122 					satcu->devices_cnt);
4123 			if (ret > 0)
4124 				break;
4125 			else if (ret < 0)
4126 				return ret;
4127 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4128 			if (dmar_remove_dev_scope(info, satc->segment,
4129 					satcu->devices, satcu->devices_cnt))
4130 				break;
4131 		}
4132 	}
4133 
4134 	return 0;
4135 }
4136 
4137 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4138 				       unsigned long val, void *v)
4139 {
4140 	struct memory_notify *mhp = v;
4141 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4142 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4143 			mhp->nr_pages - 1);
4144 
4145 	switch (val) {
4146 	case MEM_GOING_ONLINE:
4147 		if (iommu_domain_identity_map(si_domain,
4148 					      start_vpfn, last_vpfn)) {
4149 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4150 				start_vpfn, last_vpfn);
4151 			return NOTIFY_BAD;
4152 		}
4153 		break;
4154 
4155 	case MEM_OFFLINE:
4156 	case MEM_CANCEL_ONLINE:
4157 		{
4158 			struct dmar_drhd_unit *drhd;
4159 			struct intel_iommu *iommu;
4160 			LIST_HEAD(freelist);
4161 
4162 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
4163 
4164 			rcu_read_lock();
4165 			for_each_active_iommu(iommu, drhd)
4166 				iommu_flush_iotlb_psi(iommu, si_domain,
4167 					start_vpfn, mhp->nr_pages,
4168 					list_empty(&freelist), 0);
4169 			rcu_read_unlock();
4170 			put_pages_list(&freelist);
4171 		}
4172 		break;
4173 	}
4174 
4175 	return NOTIFY_OK;
4176 }
4177 
4178 static struct notifier_block intel_iommu_memory_nb = {
4179 	.notifier_call = intel_iommu_memory_notifier,
4180 	.priority = 0
4181 };
4182 
4183 static void intel_disable_iommus(void)
4184 {
4185 	struct intel_iommu *iommu = NULL;
4186 	struct dmar_drhd_unit *drhd;
4187 
4188 	for_each_iommu(iommu, drhd)
4189 		iommu_disable_translation(iommu);
4190 }
4191 
4192 void intel_iommu_shutdown(void)
4193 {
4194 	struct dmar_drhd_unit *drhd;
4195 	struct intel_iommu *iommu = NULL;
4196 
4197 	if (no_iommu || dmar_disabled)
4198 		return;
4199 
4200 	down_write(&dmar_global_lock);
4201 
4202 	/* Disable PMRs explicitly here. */
4203 	for_each_iommu(iommu, drhd)
4204 		iommu_disable_protect_mem_regions(iommu);
4205 
4206 	/* Make sure the IOMMUs are switched off */
4207 	intel_disable_iommus();
4208 
4209 	up_write(&dmar_global_lock);
4210 }
4211 
4212 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4213 {
4214 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4215 
4216 	return container_of(iommu_dev, struct intel_iommu, iommu);
4217 }
4218 
4219 static ssize_t version_show(struct device *dev,
4220 			    struct device_attribute *attr, char *buf)
4221 {
4222 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4223 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4224 	return sprintf(buf, "%d:%d\n",
4225 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4226 }
4227 static DEVICE_ATTR_RO(version);
4228 
4229 static ssize_t address_show(struct device *dev,
4230 			    struct device_attribute *attr, char *buf)
4231 {
4232 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4233 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4234 }
4235 static DEVICE_ATTR_RO(address);
4236 
4237 static ssize_t cap_show(struct device *dev,
4238 			struct device_attribute *attr, char *buf)
4239 {
4240 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4241 	return sprintf(buf, "%llx\n", iommu->cap);
4242 }
4243 static DEVICE_ATTR_RO(cap);
4244 
4245 static ssize_t ecap_show(struct device *dev,
4246 			 struct device_attribute *attr, char *buf)
4247 {
4248 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4249 	return sprintf(buf, "%llx\n", iommu->ecap);
4250 }
4251 static DEVICE_ATTR_RO(ecap);
4252 
4253 static ssize_t domains_supported_show(struct device *dev,
4254 				      struct device_attribute *attr, char *buf)
4255 {
4256 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4257 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4258 }
4259 static DEVICE_ATTR_RO(domains_supported);
4260 
4261 static ssize_t domains_used_show(struct device *dev,
4262 				 struct device_attribute *attr, char *buf)
4263 {
4264 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4265 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4266 						  cap_ndoms(iommu->cap)));
4267 }
4268 static DEVICE_ATTR_RO(domains_used);
4269 
4270 static struct attribute *intel_iommu_attrs[] = {
4271 	&dev_attr_version.attr,
4272 	&dev_attr_address.attr,
4273 	&dev_attr_cap.attr,
4274 	&dev_attr_ecap.attr,
4275 	&dev_attr_domains_supported.attr,
4276 	&dev_attr_domains_used.attr,
4277 	NULL,
4278 };
4279 
4280 static struct attribute_group intel_iommu_group = {
4281 	.name = "intel-iommu",
4282 	.attrs = intel_iommu_attrs,
4283 };
4284 
4285 const struct attribute_group *intel_iommu_groups[] = {
4286 	&intel_iommu_group,
4287 	NULL,
4288 };
4289 
4290 static inline bool has_external_pci(void)
4291 {
4292 	struct pci_dev *pdev = NULL;
4293 
4294 	for_each_pci_dev(pdev)
4295 		if (pdev->external_facing)
4296 			return true;
4297 
4298 	return false;
4299 }
4300 
4301 static int __init platform_optin_force_iommu(void)
4302 {
4303 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4304 		return 0;
4305 
4306 	if (no_iommu || dmar_disabled)
4307 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4308 
4309 	/*
4310 	 * If Intel-IOMMU is disabled by default, we will apply identity
4311 	 * map for all devices except those marked as being untrusted.
4312 	 */
4313 	if (dmar_disabled)
4314 		iommu_set_default_passthrough(false);
4315 
4316 	dmar_disabled = 0;
4317 	no_iommu = 0;
4318 
4319 	return 1;
4320 }
4321 
4322 static int __init probe_acpi_namespace_devices(void)
4323 {
4324 	struct dmar_drhd_unit *drhd;
4325 	/* To avoid a -Wunused-but-set-variable warning. */
4326 	struct intel_iommu *iommu __maybe_unused;
4327 	struct device *dev;
4328 	int i, ret = 0;
4329 
4330 	for_each_active_iommu(iommu, drhd) {
4331 		for_each_active_dev_scope(drhd->devices,
4332 					  drhd->devices_cnt, i, dev) {
4333 			struct acpi_device_physical_node *pn;
4334 			struct iommu_group *group;
4335 			struct acpi_device *adev;
4336 
4337 			if (dev->bus != &acpi_bus_type)
4338 				continue;
4339 
4340 			adev = to_acpi_device(dev);
4341 			mutex_lock(&adev->physical_node_lock);
4342 			list_for_each_entry(pn,
4343 					    &adev->physical_node_list, node) {
4344 				group = iommu_group_get(pn->dev);
4345 				if (group) {
4346 					iommu_group_put(group);
4347 					continue;
4348 				}
4349 
4350 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4351 				ret = iommu_probe_device(pn->dev);
4352 				if (ret)
4353 					break;
4354 			}
4355 			mutex_unlock(&adev->physical_node_lock);
4356 
4357 			if (ret)
4358 				return ret;
4359 		}
4360 	}
4361 
4362 	return 0;
4363 }
4364 
4365 int __init intel_iommu_init(void)
4366 {
4367 	int ret = -ENODEV;
4368 	struct dmar_drhd_unit *drhd;
4369 	struct intel_iommu *iommu;
4370 
4371 	/*
4372 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4373 	 * opt in, so enforce that.
4374 	 */
4375 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4376 		    platform_optin_force_iommu();
4377 
4378 	if (iommu_init_mempool()) {
4379 		if (force_on)
4380 			panic("tboot: Failed to initialize iommu memory\n");
4381 		return -ENOMEM;
4382 	}
4383 
4384 	down_write(&dmar_global_lock);
4385 	if (dmar_table_init()) {
4386 		if (force_on)
4387 			panic("tboot: Failed to initialize DMAR table\n");
4388 		goto out_free_dmar;
4389 	}
4390 
4391 	if (dmar_dev_scope_init() < 0) {
4392 		if (force_on)
4393 			panic("tboot: Failed to initialize DMAR device scope\n");
4394 		goto out_free_dmar;
4395 	}
4396 
4397 	up_write(&dmar_global_lock);
4398 
4399 	/*
4400 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4401 	 * complain later when we register it under the lock.
4402 	 */
4403 	dmar_register_bus_notifier();
4404 
4405 	down_write(&dmar_global_lock);
4406 
4407 	if (!no_iommu)
4408 		intel_iommu_debugfs_init();
4409 
4410 	if (no_iommu || dmar_disabled) {
4411 		/*
4412 		 * We exit the function here to ensure IOMMU's remapping and
4413 		 * mempool aren't setup, which means that the IOMMU's PMRs
4414 		 * won't be disabled via the call to init_dmars(). So disable
4415 		 * it explicitly here. The PMRs were setup by tboot prior to
4416 		 * calling SENTER, but the kernel is expected to reset/tear
4417 		 * down the PMRs.
4418 		 */
4419 		if (intel_iommu_tboot_noforce) {
4420 			for_each_iommu(iommu, drhd)
4421 				iommu_disable_protect_mem_regions(iommu);
4422 		}
4423 
4424 		/*
4425 		 * Make sure the IOMMUs are switched off, even when we
4426 		 * boot into a kexec kernel and the previous kernel left
4427 		 * them enabled
4428 		 */
4429 		intel_disable_iommus();
4430 		goto out_free_dmar;
4431 	}
4432 
4433 	if (list_empty(&dmar_rmrr_units))
4434 		pr_info("No RMRR found\n");
4435 
4436 	if (list_empty(&dmar_atsr_units))
4437 		pr_info("No ATSR found\n");
4438 
4439 	if (list_empty(&dmar_satc_units))
4440 		pr_info("No SATC found\n");
4441 
4442 	if (dmar_map_gfx)
4443 		intel_iommu_gfx_mapped = 1;
4444 
4445 	init_no_remapping_devices();
4446 
4447 	ret = init_dmars();
4448 	if (ret) {
4449 		if (force_on)
4450 			panic("tboot: Failed to initialize DMARs\n");
4451 		pr_err("Initialization failed\n");
4452 		goto out_free_dmar;
4453 	}
4454 	up_write(&dmar_global_lock);
4455 
4456 	init_iommu_pm_ops();
4457 
4458 	down_read(&dmar_global_lock);
4459 	for_each_active_iommu(iommu, drhd) {
4460 		/*
4461 		 * The flush queue implementation does not perform
4462 		 * page-selective invalidations that are required for efficient
4463 		 * TLB flushes in virtual environments.  The benefit of batching
4464 		 * is likely to be much lower than the overhead of synchronizing
4465 		 * the virtual and physical IOMMU page-tables.
4466 		 */
4467 		if (cap_caching_mode(iommu->cap)) {
4468 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4469 			iommu_set_dma_strict();
4470 		}
4471 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4472 				       intel_iommu_groups,
4473 				       "%s", iommu->name);
4474 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4475 	}
4476 	up_read(&dmar_global_lock);
4477 
4478 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4479 	if (si_domain && !hw_pass_through)
4480 		register_memory_notifier(&intel_iommu_memory_nb);
4481 
4482 	down_read(&dmar_global_lock);
4483 	if (probe_acpi_namespace_devices())
4484 		pr_warn("ACPI name space devices didn't probe correctly\n");
4485 
4486 	/* Finally, we enable the DMA remapping hardware. */
4487 	for_each_iommu(iommu, drhd) {
4488 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4489 			iommu_enable_translation(iommu);
4490 
4491 		iommu_disable_protect_mem_regions(iommu);
4492 	}
4493 	up_read(&dmar_global_lock);
4494 
4495 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4496 
4497 	intel_iommu_enabled = 1;
4498 
4499 	return 0;
4500 
4501 out_free_dmar:
4502 	intel_iommu_free_dmars();
4503 	up_write(&dmar_global_lock);
4504 	iommu_exit_mempool();
4505 	return ret;
4506 }
4507 
4508 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4509 {
4510 	struct device_domain_info *info = opaque;
4511 
4512 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4513 	return 0;
4514 }
4515 
4516 /*
4517  * NB - intel-iommu lacks any sort of reference counting for the users of
4518  * dependent devices.  If multiple endpoints have intersecting dependent
4519  * devices, unbinding the driver from any one of them will possibly leave
4520  * the others unable to operate.
4521  */
4522 static void domain_context_clear(struct device_domain_info *info)
4523 {
4524 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4525 		return;
4526 
4527 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4528 			       &domain_context_clear_one_cb, info);
4529 }
4530 
4531 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4532 {
4533 	struct dmar_domain *domain;
4534 	struct intel_iommu *iommu;
4535 	unsigned long flags;
4536 
4537 	assert_spin_locked(&device_domain_lock);
4538 
4539 	if (WARN_ON(!info))
4540 		return;
4541 
4542 	iommu = info->iommu;
4543 	domain = info->domain;
4544 
4545 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4546 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4547 			intel_pasid_tear_down_entry(iommu, info->dev,
4548 					PASID_RID2PASID, false);
4549 
4550 		iommu_disable_dev_iotlb(info);
4551 		domain_context_clear(info);
4552 		intel_pasid_free_table(info->dev);
4553 	}
4554 
4555 	unlink_domain_info(info);
4556 
4557 	spin_lock_irqsave(&iommu->lock, flags);
4558 	domain_detach_iommu(domain, iommu);
4559 	spin_unlock_irqrestore(&iommu->lock, flags);
4560 
4561 	free_devinfo_mem(info);
4562 }
4563 
4564 static void dmar_remove_one_dev_info(struct device *dev)
4565 {
4566 	struct device_domain_info *info;
4567 	unsigned long flags;
4568 
4569 	spin_lock_irqsave(&device_domain_lock, flags);
4570 	info = get_domain_info(dev);
4571 	if (info)
4572 		__dmar_remove_one_dev_info(info);
4573 	spin_unlock_irqrestore(&device_domain_lock, flags);
4574 }
4575 
4576 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4577 {
4578 	int adjust_width;
4579 
4580 	/* calculate AGAW */
4581 	domain->gaw = guest_width;
4582 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4583 	domain->agaw = width_to_agaw(adjust_width);
4584 
4585 	domain->iommu_coherency = false;
4586 	domain->iommu_snooping = false;
4587 	domain->iommu_superpage = 0;
4588 	domain->max_addr = 0;
4589 
4590 	/* always allocate the top pgd */
4591 	domain->pgd = alloc_pgtable_page(domain->nid);
4592 	if (!domain->pgd)
4593 		return -ENOMEM;
4594 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4595 	return 0;
4596 }
4597 
4598 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4599 {
4600 	struct dmar_domain *dmar_domain;
4601 	struct iommu_domain *domain;
4602 
4603 	switch (type) {
4604 	case IOMMU_DOMAIN_DMA:
4605 	case IOMMU_DOMAIN_DMA_FQ:
4606 	case IOMMU_DOMAIN_UNMANAGED:
4607 		dmar_domain = alloc_domain(type);
4608 		if (!dmar_domain) {
4609 			pr_err("Can't allocate dmar_domain\n");
4610 			return NULL;
4611 		}
4612 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4613 			pr_err("Domain initialization failed\n");
4614 			domain_exit(dmar_domain);
4615 			return NULL;
4616 		}
4617 
4618 		domain = &dmar_domain->domain;
4619 		domain->geometry.aperture_start = 0;
4620 		domain->geometry.aperture_end   =
4621 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4622 		domain->geometry.force_aperture = true;
4623 
4624 		return domain;
4625 	case IOMMU_DOMAIN_IDENTITY:
4626 		return &si_domain->domain;
4627 	default:
4628 		return NULL;
4629 	}
4630 
4631 	return NULL;
4632 }
4633 
4634 static void intel_iommu_domain_free(struct iommu_domain *domain)
4635 {
4636 	if (domain != &si_domain->domain)
4637 		domain_exit(to_dmar_domain(domain));
4638 }
4639 
4640 /*
4641  * Check whether a @domain could be attached to the @dev through the
4642  * aux-domain attach/detach APIs.
4643  */
4644 static inline bool
4645 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4646 {
4647 	struct device_domain_info *info = get_domain_info(dev);
4648 
4649 	return info && info->auxd_enabled &&
4650 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4651 }
4652 
4653 static inline struct subdev_domain_info *
4654 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4655 {
4656 	struct subdev_domain_info *sinfo;
4657 
4658 	if (!list_empty(&domain->subdevices)) {
4659 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4660 			if (sinfo->pdev == dev)
4661 				return sinfo;
4662 		}
4663 	}
4664 
4665 	return NULL;
4666 }
4667 
4668 static int auxiliary_link_device(struct dmar_domain *domain,
4669 				 struct device *dev)
4670 {
4671 	struct device_domain_info *info = get_domain_info(dev);
4672 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4673 
4674 	assert_spin_locked(&device_domain_lock);
4675 	if (WARN_ON(!info))
4676 		return -EINVAL;
4677 
4678 	if (!sinfo) {
4679 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4680 		if (!sinfo)
4681 			return -ENOMEM;
4682 		sinfo->domain = domain;
4683 		sinfo->pdev = dev;
4684 		list_add(&sinfo->link_phys, &info->subdevices);
4685 		list_add(&sinfo->link_domain, &domain->subdevices);
4686 	}
4687 
4688 	return ++sinfo->users;
4689 }
4690 
4691 static int auxiliary_unlink_device(struct dmar_domain *domain,
4692 				   struct device *dev)
4693 {
4694 	struct device_domain_info *info = get_domain_info(dev);
4695 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4696 	int ret;
4697 
4698 	assert_spin_locked(&device_domain_lock);
4699 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4700 		return -EINVAL;
4701 
4702 	ret = --sinfo->users;
4703 	if (!ret) {
4704 		list_del(&sinfo->link_phys);
4705 		list_del(&sinfo->link_domain);
4706 		kfree(sinfo);
4707 	}
4708 
4709 	return ret;
4710 }
4711 
4712 static int aux_domain_add_dev(struct dmar_domain *domain,
4713 			      struct device *dev)
4714 {
4715 	int ret;
4716 	unsigned long flags;
4717 	struct intel_iommu *iommu;
4718 
4719 	iommu = device_to_iommu(dev, NULL, NULL);
4720 	if (!iommu)
4721 		return -ENODEV;
4722 
4723 	if (domain->default_pasid <= 0) {
4724 		u32 pasid;
4725 
4726 		/* No private data needed for the default pasid */
4727 		pasid = ioasid_alloc(NULL, PASID_MIN,
4728 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4729 				     NULL);
4730 		if (pasid == INVALID_IOASID) {
4731 			pr_err("Can't allocate default pasid\n");
4732 			return -ENODEV;
4733 		}
4734 		domain->default_pasid = pasid;
4735 	}
4736 
4737 	spin_lock_irqsave(&device_domain_lock, flags);
4738 	ret = auxiliary_link_device(domain, dev);
4739 	if (ret <= 0)
4740 		goto link_failed;
4741 
4742 	/*
4743 	 * Subdevices from the same physical device can be attached to the
4744 	 * same domain. For such cases, only the first subdevice attachment
4745 	 * needs to go through the full steps in this function. So if ret >
4746 	 * 1, just goto out.
4747 	 */
4748 	if (ret > 1)
4749 		goto out;
4750 
4751 	/*
4752 	 * iommu->lock must be held to attach domain to iommu and setup the
4753 	 * pasid entry for second level translation.
4754 	 */
4755 	spin_lock(&iommu->lock);
4756 	ret = domain_attach_iommu(domain, iommu);
4757 	if (ret)
4758 		goto attach_failed;
4759 
4760 	/* Setup the PASID entry for mediated devices: */
4761 	if (domain_use_first_level(domain))
4762 		ret = domain_setup_first_level(iommu, domain, dev,
4763 					       domain->default_pasid);
4764 	else
4765 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4766 						     domain->default_pasid);
4767 	if (ret)
4768 		goto table_failed;
4769 
4770 	spin_unlock(&iommu->lock);
4771 out:
4772 	spin_unlock_irqrestore(&device_domain_lock, flags);
4773 
4774 	return 0;
4775 
4776 table_failed:
4777 	domain_detach_iommu(domain, iommu);
4778 attach_failed:
4779 	spin_unlock(&iommu->lock);
4780 	auxiliary_unlink_device(domain, dev);
4781 link_failed:
4782 	spin_unlock_irqrestore(&device_domain_lock, flags);
4783 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4784 		ioasid_put(domain->default_pasid);
4785 
4786 	return ret;
4787 }
4788 
4789 static void aux_domain_remove_dev(struct dmar_domain *domain,
4790 				  struct device *dev)
4791 {
4792 	struct device_domain_info *info;
4793 	struct intel_iommu *iommu;
4794 	unsigned long flags;
4795 
4796 	if (!is_aux_domain(dev, &domain->domain))
4797 		return;
4798 
4799 	spin_lock_irqsave(&device_domain_lock, flags);
4800 	info = get_domain_info(dev);
4801 	iommu = info->iommu;
4802 
4803 	if (!auxiliary_unlink_device(domain, dev)) {
4804 		spin_lock(&iommu->lock);
4805 		intel_pasid_tear_down_entry(iommu, dev,
4806 					    domain->default_pasid, false);
4807 		domain_detach_iommu(domain, iommu);
4808 		spin_unlock(&iommu->lock);
4809 	}
4810 
4811 	spin_unlock_irqrestore(&device_domain_lock, flags);
4812 
4813 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4814 		ioasid_put(domain->default_pasid);
4815 }
4816 
4817 static int prepare_domain_attach_device(struct iommu_domain *domain,
4818 					struct device *dev)
4819 {
4820 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4821 	struct intel_iommu *iommu;
4822 	int addr_width;
4823 
4824 	iommu = device_to_iommu(dev, NULL, NULL);
4825 	if (!iommu)
4826 		return -ENODEV;
4827 
4828 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4829 	    !ecap_nest(iommu->ecap)) {
4830 		dev_err(dev, "%s: iommu not support nested translation\n",
4831 			iommu->name);
4832 		return -EINVAL;
4833 	}
4834 
4835 	/* check if this iommu agaw is sufficient for max mapped address */
4836 	addr_width = agaw_to_width(iommu->agaw);
4837 	if (addr_width > cap_mgaw(iommu->cap))
4838 		addr_width = cap_mgaw(iommu->cap);
4839 
4840 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4841 		dev_err(dev, "%s: iommu width (%d) is not "
4842 		        "sufficient for the mapped address (%llx)\n",
4843 		        __func__, addr_width, dmar_domain->max_addr);
4844 		return -EFAULT;
4845 	}
4846 	dmar_domain->gaw = addr_width;
4847 
4848 	/*
4849 	 * Knock out extra levels of page tables if necessary
4850 	 */
4851 	while (iommu->agaw < dmar_domain->agaw) {
4852 		struct dma_pte *pte;
4853 
4854 		pte = dmar_domain->pgd;
4855 		if (dma_pte_present(pte)) {
4856 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4857 			free_pgtable_page(pte);
4858 		}
4859 		dmar_domain->agaw--;
4860 	}
4861 
4862 	return 0;
4863 }
4864 
4865 static int intel_iommu_attach_device(struct iommu_domain *domain,
4866 				     struct device *dev)
4867 {
4868 	int ret;
4869 
4870 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4871 	    device_is_rmrr_locked(dev)) {
4872 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4873 		return -EPERM;
4874 	}
4875 
4876 	if (is_aux_domain(dev, domain))
4877 		return -EPERM;
4878 
4879 	/* normally dev is not mapped */
4880 	if (unlikely(domain_context_mapped(dev))) {
4881 		struct dmar_domain *old_domain;
4882 
4883 		old_domain = find_domain(dev);
4884 		if (old_domain)
4885 			dmar_remove_one_dev_info(dev);
4886 	}
4887 
4888 	ret = prepare_domain_attach_device(domain, dev);
4889 	if (ret)
4890 		return ret;
4891 
4892 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4893 }
4894 
4895 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4896 					 struct device *dev)
4897 {
4898 	int ret;
4899 
4900 	if (!is_aux_domain(dev, domain))
4901 		return -EPERM;
4902 
4903 	ret = prepare_domain_attach_device(domain, dev);
4904 	if (ret)
4905 		return ret;
4906 
4907 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4908 }
4909 
4910 static void intel_iommu_detach_device(struct iommu_domain *domain,
4911 				      struct device *dev)
4912 {
4913 	dmar_remove_one_dev_info(dev);
4914 }
4915 
4916 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4917 					  struct device *dev)
4918 {
4919 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4920 }
4921 
4922 #ifdef CONFIG_INTEL_IOMMU_SVM
4923 /*
4924  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4925  * VT-d granularity. Invalidation is typically included in the unmap operation
4926  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4927  * owns the first level page tables. Invalidations of translation caches in the
4928  * guest are trapped and passed down to the host.
4929  *
4930  * vIOMMU in the guest will only expose first level page tables, therefore
4931  * we do not support IOTLB granularity for request without PASID (second level).
4932  *
4933  * For example, to find the VT-d granularity encoding for IOTLB
4934  * type and page selective granularity within PASID:
4935  * X: indexed by iommu cache type
4936  * Y: indexed by enum iommu_inv_granularity
4937  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4938  */
4939 
4940 static const int
4941 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4942 	/*
4943 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4944 	 * page selective (address granularity)
4945 	 */
4946 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4947 	/* PASID based dev TLBs */
4948 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4949 	/* PASID cache */
4950 	{-EINVAL, -EINVAL, -EINVAL}
4951 };
4952 
4953 static inline int to_vtd_granularity(int type, int granu)
4954 {
4955 	return inv_type_granu_table[type][granu];
4956 }
4957 
4958 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4959 {
4960 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4961 
4962 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4963 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
4964 	 * granu size in contiguous memory.
4965 	 */
4966 	return order_base_2(nr_pages);
4967 }
4968 
4969 static int
4970 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4971 			   struct iommu_cache_invalidate_info *inv_info)
4972 {
4973 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4974 	struct device_domain_info *info;
4975 	struct intel_iommu *iommu;
4976 	unsigned long flags;
4977 	int cache_type;
4978 	u8 bus, devfn;
4979 	u16 did, sid;
4980 	int ret = 0;
4981 	u64 size = 0;
4982 
4983 	if (!inv_info || !dmar_domain)
4984 		return -EINVAL;
4985 
4986 	if (!dev || !dev_is_pci(dev))
4987 		return -ENODEV;
4988 
4989 	iommu = device_to_iommu(dev, &bus, &devfn);
4990 	if (!iommu)
4991 		return -ENODEV;
4992 
4993 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4994 		return -EINVAL;
4995 
4996 	spin_lock_irqsave(&device_domain_lock, flags);
4997 	spin_lock(&iommu->lock);
4998 	info = get_domain_info(dev);
4999 	if (!info) {
5000 		ret = -EINVAL;
5001 		goto out_unlock;
5002 	}
5003 	did = dmar_domain->iommu_did[iommu->seq_id];
5004 	sid = PCI_DEVID(bus, devfn);
5005 
5006 	/* Size is only valid in address selective invalidation */
5007 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5008 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5009 				   inv_info->granu.addr_info.nb_granules);
5010 
5011 	for_each_set_bit(cache_type,
5012 			 (unsigned long *)&inv_info->cache,
5013 			 IOMMU_CACHE_INV_TYPE_NR) {
5014 		int granu = 0;
5015 		u64 pasid = 0;
5016 		u64 addr = 0;
5017 
5018 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5019 		if (granu == -EINVAL) {
5020 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5021 					   cache_type, inv_info->granularity);
5022 			break;
5023 		}
5024 
5025 		/*
5026 		 * PASID is stored in different locations based on the
5027 		 * granularity.
5028 		 */
5029 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5030 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5031 			pasid = inv_info->granu.pasid_info.pasid;
5032 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5033 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5034 			pasid = inv_info->granu.addr_info.pasid;
5035 
5036 		switch (BIT(cache_type)) {
5037 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5038 			/* HW will ignore LSB bits based on address mask */
5039 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5040 			    size &&
5041 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5042 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5043 						   inv_info->granu.addr_info.addr, size);
5044 			}
5045 
5046 			/*
5047 			 * If granu is PASID-selective, address is ignored.
5048 			 * We use npages = -1 to indicate that.
5049 			 */
5050 			qi_flush_piotlb(iommu, did, pasid,
5051 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5052 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5053 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5054 
5055 			if (!info->ats_enabled)
5056 				break;
5057 			/*
5058 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5059 			 * in the guest may assume IOTLB flush is inclusive,
5060 			 * which is more efficient.
5061 			 */
5062 			fallthrough;
5063 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5064 			/*
5065 			 * PASID based device TLB invalidation does not support
5066 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5067 			 * IOMMU_INV_GRANU_ADDR.
5068 			 * The equivalent of that is we set the size to be the
5069 			 * entire range of 64 bit. User only provides PASID info
5070 			 * without address info. So we set addr to 0.
5071 			 */
5072 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5073 				size = 64 - VTD_PAGE_SHIFT;
5074 				addr = 0;
5075 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5076 				addr = inv_info->granu.addr_info.addr;
5077 			}
5078 
5079 			if (info->ats_enabled)
5080 				qi_flush_dev_iotlb_pasid(iommu, sid,
5081 						info->pfsid, pasid,
5082 						info->ats_qdep, addr,
5083 						size);
5084 			else
5085 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5086 			break;
5087 		default:
5088 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5089 					    cache_type);
5090 			ret = -EINVAL;
5091 		}
5092 	}
5093 out_unlock:
5094 	spin_unlock(&iommu->lock);
5095 	spin_unlock_irqrestore(&device_domain_lock, flags);
5096 
5097 	return ret;
5098 }
5099 #endif
5100 
5101 static int intel_iommu_map(struct iommu_domain *domain,
5102 			   unsigned long iova, phys_addr_t hpa,
5103 			   size_t size, int iommu_prot, gfp_t gfp)
5104 {
5105 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5106 	u64 max_addr;
5107 	int prot = 0;
5108 
5109 	if (iommu_prot & IOMMU_READ)
5110 		prot |= DMA_PTE_READ;
5111 	if (iommu_prot & IOMMU_WRITE)
5112 		prot |= DMA_PTE_WRITE;
5113 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5114 		prot |= DMA_PTE_SNP;
5115 
5116 	max_addr = iova + size;
5117 	if (dmar_domain->max_addr < max_addr) {
5118 		u64 end;
5119 
5120 		/* check if minimum agaw is sufficient for mapped address */
5121 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5122 		if (end < max_addr) {
5123 			pr_err("%s: iommu width (%d) is not "
5124 			       "sufficient for the mapped address (%llx)\n",
5125 			       __func__, dmar_domain->gaw, max_addr);
5126 			return -EFAULT;
5127 		}
5128 		dmar_domain->max_addr = max_addr;
5129 	}
5130 	/* Round up size to next multiple of PAGE_SIZE, if it and
5131 	   the low bits of hpa would take us onto the next page */
5132 	size = aligned_nrpages(hpa, size);
5133 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5134 				hpa >> VTD_PAGE_SHIFT, size, prot);
5135 }
5136 
5137 static int intel_iommu_map_pages(struct iommu_domain *domain,
5138 				 unsigned long iova, phys_addr_t paddr,
5139 				 size_t pgsize, size_t pgcount,
5140 				 int prot, gfp_t gfp, size_t *mapped)
5141 {
5142 	unsigned long pgshift = __ffs(pgsize);
5143 	size_t size = pgcount << pgshift;
5144 	int ret;
5145 
5146 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5147 		return -EINVAL;
5148 
5149 	if (!IS_ALIGNED(iova | paddr, pgsize))
5150 		return -EINVAL;
5151 
5152 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5153 	if (!ret && mapped)
5154 		*mapped = size;
5155 
5156 	return ret;
5157 }
5158 
5159 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5160 				unsigned long iova, size_t size,
5161 				struct iommu_iotlb_gather *gather)
5162 {
5163 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5164 	unsigned long start_pfn, last_pfn;
5165 	int level = 0;
5166 
5167 	/* Cope with horrid API which requires us to unmap more than the
5168 	   size argument if it happens to be a large-page mapping. */
5169 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5170 
5171 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5172 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5173 
5174 	start_pfn = iova >> VTD_PAGE_SHIFT;
5175 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5176 
5177 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
5178 
5179 	if (dmar_domain->max_addr == iova + size)
5180 		dmar_domain->max_addr = iova;
5181 
5182 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5183 
5184 	return size;
5185 }
5186 
5187 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5188 				      unsigned long iova,
5189 				      size_t pgsize, size_t pgcount,
5190 				      struct iommu_iotlb_gather *gather)
5191 {
5192 	unsigned long pgshift = __ffs(pgsize);
5193 	size_t size = pgcount << pgshift;
5194 
5195 	return intel_iommu_unmap(domain, iova, size, gather);
5196 }
5197 
5198 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5199 				 struct iommu_iotlb_gather *gather)
5200 {
5201 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5202 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5203 	size_t size = gather->end - gather->start;
5204 	unsigned long start_pfn;
5205 	unsigned long nrpages;
5206 	int iommu_id;
5207 
5208 	nrpages = aligned_nrpages(gather->start, size);
5209 	start_pfn = mm_to_dma_pfn(iova_pfn);
5210 
5211 	for_each_domain_iommu(iommu_id, dmar_domain)
5212 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5213 				      start_pfn, nrpages,
5214 				      list_empty(&gather->freelist), 0);
5215 
5216 	put_pages_list(&gather->freelist);
5217 }
5218 
5219 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5220 					    dma_addr_t iova)
5221 {
5222 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5223 	struct dma_pte *pte;
5224 	int level = 0;
5225 	u64 phys = 0;
5226 
5227 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5228 	if (pte && dma_pte_present(pte))
5229 		phys = dma_pte_addr(pte) +
5230 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5231 						VTD_PAGE_SHIFT) - 1));
5232 
5233 	return phys;
5234 }
5235 
5236 static bool intel_iommu_capable(enum iommu_cap cap)
5237 {
5238 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5239 		return domain_update_iommu_snooping(NULL);
5240 	if (cap == IOMMU_CAP_INTR_REMAP)
5241 		return irq_remapping_enabled == 1;
5242 
5243 	return false;
5244 }
5245 
5246 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5247 {
5248 	struct intel_iommu *iommu;
5249 
5250 	iommu = device_to_iommu(dev, NULL, NULL);
5251 	if (!iommu)
5252 		return ERR_PTR(-ENODEV);
5253 
5254 	if (translation_pre_enabled(iommu))
5255 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5256 
5257 	return &iommu->iommu;
5258 }
5259 
5260 static void intel_iommu_release_device(struct device *dev)
5261 {
5262 	struct intel_iommu *iommu;
5263 
5264 	iommu = device_to_iommu(dev, NULL, NULL);
5265 	if (!iommu)
5266 		return;
5267 
5268 	dmar_remove_one_dev_info(dev);
5269 
5270 	set_dma_ops(dev, NULL);
5271 }
5272 
5273 static void intel_iommu_probe_finalize(struct device *dev)
5274 {
5275 	set_dma_ops(dev, NULL);
5276 	iommu_setup_dma_ops(dev, 0, U64_MAX);
5277 }
5278 
5279 static void intel_iommu_get_resv_regions(struct device *device,
5280 					 struct list_head *head)
5281 {
5282 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5283 	struct iommu_resv_region *reg;
5284 	struct dmar_rmrr_unit *rmrr;
5285 	struct device *i_dev;
5286 	int i;
5287 
5288 	down_read(&dmar_global_lock);
5289 	for_each_rmrr_units(rmrr) {
5290 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5291 					  i, i_dev) {
5292 			struct iommu_resv_region *resv;
5293 			enum iommu_resv_type type;
5294 			size_t length;
5295 
5296 			if (i_dev != device &&
5297 			    !is_downstream_to_pci_bridge(device, i_dev))
5298 				continue;
5299 
5300 			length = rmrr->end_address - rmrr->base_address + 1;
5301 
5302 			type = device_rmrr_is_relaxable(device) ?
5303 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5304 
5305 			resv = iommu_alloc_resv_region(rmrr->base_address,
5306 						       length, prot, type);
5307 			if (!resv)
5308 				break;
5309 
5310 			list_add_tail(&resv->list, head);
5311 		}
5312 	}
5313 	up_read(&dmar_global_lock);
5314 
5315 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5316 	if (dev_is_pci(device)) {
5317 		struct pci_dev *pdev = to_pci_dev(device);
5318 
5319 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5320 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5321 						   IOMMU_RESV_DIRECT_RELAXABLE);
5322 			if (reg)
5323 				list_add_tail(&reg->list, head);
5324 		}
5325 	}
5326 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5327 
5328 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5329 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5330 				      0, IOMMU_RESV_MSI);
5331 	if (!reg)
5332 		return;
5333 	list_add_tail(&reg->list, head);
5334 }
5335 
5336 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5337 {
5338 	struct device_domain_info *info;
5339 	struct context_entry *context;
5340 	struct dmar_domain *domain;
5341 	unsigned long flags;
5342 	u64 ctx_lo;
5343 	int ret;
5344 
5345 	domain = find_domain(dev);
5346 	if (!domain)
5347 		return -EINVAL;
5348 
5349 	spin_lock_irqsave(&device_domain_lock, flags);
5350 	spin_lock(&iommu->lock);
5351 
5352 	ret = -EINVAL;
5353 	info = get_domain_info(dev);
5354 	if (!info || !info->pasid_supported)
5355 		goto out;
5356 
5357 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5358 	if (WARN_ON(!context))
5359 		goto out;
5360 
5361 	ctx_lo = context[0].lo;
5362 
5363 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5364 		ctx_lo |= CONTEXT_PASIDE;
5365 		context[0].lo = ctx_lo;
5366 		wmb();
5367 		iommu->flush.flush_context(iommu,
5368 					   domain->iommu_did[iommu->seq_id],
5369 					   PCI_DEVID(info->bus, info->devfn),
5370 					   DMA_CCMD_MASK_NOBIT,
5371 					   DMA_CCMD_DEVICE_INVL);
5372 	}
5373 
5374 	/* Enable PASID support in the device, if it wasn't already */
5375 	if (!info->pasid_enabled)
5376 		iommu_enable_dev_iotlb(info);
5377 
5378 	ret = 0;
5379 
5380  out:
5381 	spin_unlock(&iommu->lock);
5382 	spin_unlock_irqrestore(&device_domain_lock, flags);
5383 
5384 	return ret;
5385 }
5386 
5387 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5388 {
5389 	if (dev_is_pci(dev))
5390 		return pci_device_group(dev);
5391 	return generic_device_group(dev);
5392 }
5393 
5394 static int intel_iommu_enable_auxd(struct device *dev)
5395 {
5396 	struct device_domain_info *info;
5397 	struct intel_iommu *iommu;
5398 	unsigned long flags;
5399 	int ret;
5400 
5401 	iommu = device_to_iommu(dev, NULL, NULL);
5402 	if (!iommu || dmar_disabled)
5403 		return -EINVAL;
5404 
5405 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5406 		return -EINVAL;
5407 
5408 	ret = intel_iommu_enable_pasid(iommu, dev);
5409 	if (ret)
5410 		return -ENODEV;
5411 
5412 	spin_lock_irqsave(&device_domain_lock, flags);
5413 	info = get_domain_info(dev);
5414 	info->auxd_enabled = 1;
5415 	spin_unlock_irqrestore(&device_domain_lock, flags);
5416 
5417 	return 0;
5418 }
5419 
5420 static int intel_iommu_disable_auxd(struct device *dev)
5421 {
5422 	struct device_domain_info *info;
5423 	unsigned long flags;
5424 
5425 	spin_lock_irqsave(&device_domain_lock, flags);
5426 	info = get_domain_info(dev);
5427 	if (!WARN_ON(!info))
5428 		info->auxd_enabled = 0;
5429 	spin_unlock_irqrestore(&device_domain_lock, flags);
5430 
5431 	return 0;
5432 }
5433 
5434 static int intel_iommu_enable_sva(struct device *dev)
5435 {
5436 	struct device_domain_info *info = get_domain_info(dev);
5437 	struct intel_iommu *iommu;
5438 	int ret;
5439 
5440 	if (!info || dmar_disabled)
5441 		return -EINVAL;
5442 
5443 	iommu = info->iommu;
5444 	if (!iommu)
5445 		return -EINVAL;
5446 
5447 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5448 		return -ENODEV;
5449 
5450 	if (intel_iommu_enable_pasid(iommu, dev))
5451 		return -ENODEV;
5452 
5453 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5454 		return -EINVAL;
5455 
5456 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5457 	if (!ret)
5458 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5459 
5460 	return ret;
5461 }
5462 
5463 static int intel_iommu_disable_sva(struct device *dev)
5464 {
5465 	struct device_domain_info *info = get_domain_info(dev);
5466 	struct intel_iommu *iommu = info->iommu;
5467 	int ret;
5468 
5469 	ret = iommu_unregister_device_fault_handler(dev);
5470 	if (!ret)
5471 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5472 
5473 	return ret;
5474 }
5475 
5476 static int intel_iommu_enable_iopf(struct device *dev)
5477 {
5478 	struct device_domain_info *info = get_domain_info(dev);
5479 
5480 	if (info && info->pri_supported)
5481 		return 0;
5482 
5483 	return -ENODEV;
5484 }
5485 
5486 static int
5487 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5488 {
5489 	switch (feat) {
5490 	case IOMMU_DEV_FEAT_AUX:
5491 		return intel_iommu_enable_auxd(dev);
5492 
5493 	case IOMMU_DEV_FEAT_IOPF:
5494 		return intel_iommu_enable_iopf(dev);
5495 
5496 	case IOMMU_DEV_FEAT_SVA:
5497 		return intel_iommu_enable_sva(dev);
5498 
5499 	default:
5500 		return -ENODEV;
5501 	}
5502 }
5503 
5504 static int
5505 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5506 {
5507 	switch (feat) {
5508 	case IOMMU_DEV_FEAT_AUX:
5509 		return intel_iommu_disable_auxd(dev);
5510 
5511 	case IOMMU_DEV_FEAT_IOPF:
5512 		return 0;
5513 
5514 	case IOMMU_DEV_FEAT_SVA:
5515 		return intel_iommu_disable_sva(dev);
5516 
5517 	default:
5518 		return -ENODEV;
5519 	}
5520 }
5521 
5522 static bool
5523 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5524 {
5525 	struct device_domain_info *info = get_domain_info(dev);
5526 
5527 	if (feat == IOMMU_DEV_FEAT_AUX)
5528 		return scalable_mode_support() && info && info->auxd_enabled;
5529 
5530 	return false;
5531 }
5532 
5533 static int
5534 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5535 {
5536 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5537 
5538 	return dmar_domain->default_pasid > 0 ?
5539 			dmar_domain->default_pasid : -EINVAL;
5540 }
5541 
5542 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5543 					   struct device *dev)
5544 {
5545 	return attach_deferred(dev);
5546 }
5547 
5548 static int
5549 intel_iommu_enable_nesting(struct iommu_domain *domain)
5550 {
5551 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5552 	unsigned long flags;
5553 	int ret = -ENODEV;
5554 
5555 	spin_lock_irqsave(&device_domain_lock, flags);
5556 	if (list_empty(&dmar_domain->devices)) {
5557 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5558 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5559 		ret = 0;
5560 	}
5561 	spin_unlock_irqrestore(&device_domain_lock, flags);
5562 
5563 	return ret;
5564 }
5565 
5566 /*
5567  * Check that the device does not live on an external facing PCI port that is
5568  * marked as untrusted. Such devices should not be able to apply quirks and
5569  * thus not be able to bypass the IOMMU restrictions.
5570  */
5571 static bool risky_device(struct pci_dev *pdev)
5572 {
5573 	if (pdev->untrusted) {
5574 		pci_info(pdev,
5575 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5576 			 pdev->vendor, pdev->device);
5577 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5578 		return true;
5579 	}
5580 	return false;
5581 }
5582 
5583 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5584 				       unsigned long iova, size_t size)
5585 {
5586 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5587 	unsigned long pages = aligned_nrpages(iova, size);
5588 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5589 	struct intel_iommu *iommu;
5590 	int iommu_id;
5591 
5592 	for_each_domain_iommu(iommu_id, dmar_domain) {
5593 		iommu = g_iommus[iommu_id];
5594 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5595 	}
5596 }
5597 
5598 const struct iommu_ops intel_iommu_ops = {
5599 	.capable		= intel_iommu_capable,
5600 	.domain_alloc		= intel_iommu_domain_alloc,
5601 	.domain_free		= intel_iommu_domain_free,
5602 	.enable_nesting		= intel_iommu_enable_nesting,
5603 	.attach_dev		= intel_iommu_attach_device,
5604 	.detach_dev		= intel_iommu_detach_device,
5605 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5606 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5607 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5608 	.map_pages		= intel_iommu_map_pages,
5609 	.unmap_pages		= intel_iommu_unmap_pages,
5610 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5611 	.flush_iotlb_all        = intel_flush_iotlb_all,
5612 	.iotlb_sync		= intel_iommu_tlb_sync,
5613 	.iova_to_phys		= intel_iommu_iova_to_phys,
5614 	.probe_device		= intel_iommu_probe_device,
5615 	.probe_finalize		= intel_iommu_probe_finalize,
5616 	.release_device		= intel_iommu_release_device,
5617 	.get_resv_regions	= intel_iommu_get_resv_regions,
5618 	.put_resv_regions	= generic_iommu_put_resv_regions,
5619 	.device_group		= intel_iommu_device_group,
5620 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5621 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5622 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5623 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5624 	.def_domain_type	= device_def_domain_type,
5625 	.pgsize_bitmap		= SZ_4K,
5626 #ifdef CONFIG_INTEL_IOMMU_SVM
5627 	.cache_invalidate	= intel_iommu_sva_invalidate,
5628 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5629 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5630 	.sva_bind		= intel_svm_bind,
5631 	.sva_unbind		= intel_svm_unbind,
5632 	.sva_get_pasid		= intel_svm_get_pasid,
5633 	.page_response		= intel_svm_page_response,
5634 #endif
5635 };
5636 
5637 static void quirk_iommu_igfx(struct pci_dev *dev)
5638 {
5639 	if (risky_device(dev))
5640 		return;
5641 
5642 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5643 	dmar_map_gfx = 0;
5644 }
5645 
5646 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5654 
5655 /* Broadwell igfx malfunctions with dmar */
5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5680 
5681 static void quirk_iommu_rwbf(struct pci_dev *dev)
5682 {
5683 	if (risky_device(dev))
5684 		return;
5685 
5686 	/*
5687 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5688 	 * but needs it. Same seems to hold for the desktop versions.
5689 	 */
5690 	pci_info(dev, "Forcing write-buffer flush capability\n");
5691 	rwbf_quirk = 1;
5692 }
5693 
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5701 
5702 #define GGC 0x52
5703 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5704 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5705 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5706 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5707 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5708 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5709 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5710 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5711 
5712 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5713 {
5714 	unsigned short ggc;
5715 
5716 	if (risky_device(dev))
5717 		return;
5718 
5719 	if (pci_read_config_word(dev, GGC, &ggc))
5720 		return;
5721 
5722 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5723 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5724 		dmar_map_gfx = 0;
5725 	} else if (dmar_map_gfx) {
5726 		/* we have to ensure the gfx device is idle before we flush */
5727 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5728 		iommu_set_dma_strict();
5729 	}
5730 }
5731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5735 
5736 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5737 {
5738 	unsigned short ver;
5739 
5740 	if (!IS_GFX_DEVICE(dev))
5741 		return;
5742 
5743 	ver = (dev->device >> 8) & 0xff;
5744 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5745 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5746 	    ver != 0x9a)
5747 		return;
5748 
5749 	if (risky_device(dev))
5750 		return;
5751 
5752 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5753 	iommu_skip_te_disable = 1;
5754 }
5755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5756 
5757 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5758    ISOCH DMAR unit for the Azalia sound device, but not give it any
5759    TLB entries, which causes it to deadlock. Check for that.  We do
5760    this in a function called from init_dmars(), instead of in a PCI
5761    quirk, because we don't want to print the obnoxious "BIOS broken"
5762    message if VT-d is actually disabled.
5763 */
5764 static void __init check_tylersburg_isoch(void)
5765 {
5766 	struct pci_dev *pdev;
5767 	uint32_t vtisochctrl;
5768 
5769 	/* If there's no Azalia in the system anyway, forget it. */
5770 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5771 	if (!pdev)
5772 		return;
5773 
5774 	if (risky_device(pdev)) {
5775 		pci_dev_put(pdev);
5776 		return;
5777 	}
5778 
5779 	pci_dev_put(pdev);
5780 
5781 	/* System Management Registers. Might be hidden, in which case
5782 	   we can't do the sanity check. But that's OK, because the
5783 	   known-broken BIOSes _don't_ actually hide it, so far. */
5784 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5785 	if (!pdev)
5786 		return;
5787 
5788 	if (risky_device(pdev)) {
5789 		pci_dev_put(pdev);
5790 		return;
5791 	}
5792 
5793 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5794 		pci_dev_put(pdev);
5795 		return;
5796 	}
5797 
5798 	pci_dev_put(pdev);
5799 
5800 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5801 	if (vtisochctrl & 1)
5802 		return;
5803 
5804 	/* Drop all bits other than the number of TLB entries */
5805 	vtisochctrl &= 0x1c;
5806 
5807 	/* If we have the recommended number of TLB entries (16), fine. */
5808 	if (vtisochctrl == 0x10)
5809 		return;
5810 
5811 	/* Zero TLB entries? You get to ride the short bus to school. */
5812 	if (!vtisochctrl) {
5813 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5814 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5815 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5816 		     dmi_get_system_info(DMI_BIOS_VERSION),
5817 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5818 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5819 		return;
5820 	}
5821 
5822 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5823 	       vtisochctrl);
5824 }
5825