xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 01cc2ec6)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
289 
290 /*
291  * This domain is a statically identity mapping domain.
292  *	1. This domain creats a static 1:1 mapping to all usable memory.
293  * 	2. It maps to each iommu if successful.
294  *	3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322 
323 #define for_each_rmrr_units(rmrr) \
324 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325 
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343 
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349 
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352 
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360 
361 #define IDENTMAP_GFX		2
362 #define IDENTMAP_AZALIA		4
363 
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 	struct device_domain_info *info;
371 
372 	if (!dev)
373 		return NULL;
374 
375 	info = dev_iommu_priv_get(dev);
376 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 		return NULL;
378 
379 	return info;
380 }
381 
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384 
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
386 				to_pci_dev(d)->untrusted)
387 
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 				     void *data), void *data)
394 {
395 	int ret = 0;
396 	unsigned long flags;
397 	struct device_domain_info *info;
398 
399 	spin_lock_irqsave(&device_domain_lock, flags);
400 	list_for_each_entry(info, &device_domain_list, global) {
401 		ret = fn(info, data);
402 		if (ret) {
403 			spin_unlock_irqrestore(&device_domain_lock, flags);
404 			return ret;
405 		}
406 	}
407 	spin_unlock_irqrestore(&device_domain_lock, flags);
408 
409 	return 0;
410 }
411 
412 const struct iommu_ops intel_iommu_ops;
413 
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418 
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423 
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426 	u32 gsts;
427 
428 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 	if (gsts & DMA_GSTS_TES)
430 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432 
433 static int __init intel_iommu_setup(char *str)
434 {
435 	if (!str)
436 		return -EINVAL;
437 	while (*str) {
438 		if (!strncmp(str, "on", 2)) {
439 			dmar_disabled = 0;
440 			pr_info("IOMMU enabled\n");
441 		} else if (!strncmp(str, "off", 3)) {
442 			dmar_disabled = 1;
443 			no_platform_optin = 1;
444 			pr_info("IOMMU disabled\n");
445 		} else if (!strncmp(str, "igfx_off", 8)) {
446 			dmar_map_gfx = 0;
447 			pr_info("Disable GFX device mapping\n");
448 		} else if (!strncmp(str, "forcedac", 8)) {
449 			pr_info("Forcing DAC for PCI devices\n");
450 			dmar_forcedac = 1;
451 		} else if (!strncmp(str, "strict", 6)) {
452 			pr_info("Disable batched IOTLB flush\n");
453 			intel_iommu_strict = 1;
454 		} else if (!strncmp(str, "sp_off", 6)) {
455 			pr_info("Disable supported super page\n");
456 			intel_iommu_superpage = 0;
457 		} else if (!strncmp(str, "sm_on", 5)) {
458 			pr_info("Intel-IOMMU: scalable mode supported\n");
459 			intel_iommu_sm = 1;
460 		} else if (!strncmp(str, "tboot_noforce", 13)) {
461 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 			intel_iommu_tboot_noforce = 1;
463 		} else if (!strncmp(str, "nobounce", 8)) {
464 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 			intel_no_bounce = 1;
466 		}
467 
468 		str += strcspn(str, ",");
469 		while (*str == ',')
470 			str++;
471 	}
472 	return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475 
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478 
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481 	struct dmar_domain **domains;
482 	int idx = did >> 8;
483 
484 	domains = iommu->domains[idx];
485 	if (!domains)
486 		return NULL;
487 
488 	return domains[did & 0xff];
489 }
490 
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 			     struct dmar_domain *domain)
493 {
494 	struct dmar_domain **domains;
495 	int idx = did >> 8;
496 
497 	if (!iommu->domains[idx]) {
498 		size_t size = 256 * sizeof(struct dmar_domain *);
499 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 	}
501 
502 	domains = iommu->domains[idx];
503 	if (WARN_ON(!domains))
504 		return;
505 	else
506 		domains[did & 0xff] = domain;
507 }
508 
509 void *alloc_pgtable_page(int node)
510 {
511 	struct page *page;
512 	void *vaddr = NULL;
513 
514 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 	if (page)
516 		vaddr = page_address(page);
517 	return vaddr;
518 }
519 
520 void free_pgtable_page(void *vaddr)
521 {
522 	free_page((unsigned long)vaddr);
523 }
524 
525 static inline void *alloc_domain_mem(void)
526 {
527 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529 
530 static void free_domain_mem(void *vaddr)
531 {
532 	kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534 
535 static inline void * alloc_devinfo_mem(void)
536 {
537 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539 
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542 	kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544 
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549 
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554 
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 				       unsigned long pfn)
557 {
558 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559 
560 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562 
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565 	unsigned long sagaw;
566 	int agaw = -1;
567 
568 	sagaw = cap_sagaw(iommu->cap);
569 	for (agaw = width_to_agaw(max_gaw);
570 	     agaw >= 0; agaw--) {
571 		if (test_bit(agaw, &sagaw))
572 			break;
573 	}
574 
575 	return agaw;
576 }
577 
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585 
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * get a supported less agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595 
596 /* This functionin only returns single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599 	int iommu_id;
600 
601 	/* si_domain and vm domain should not get here. */
602 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 		return NULL;
604 
605 	for_each_domain_iommu(iommu_id, domain)
606 		break;
607 
608 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 		return NULL;
610 
611 	return g_iommus[iommu_id];
612 }
613 
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616 	return sm_supported(iommu) ?
617 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619 
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622 	struct dmar_drhd_unit *drhd;
623 	struct intel_iommu *iommu;
624 	bool found = false;
625 	int i;
626 
627 	domain->iommu_coherency = 1;
628 
629 	for_each_domain_iommu(i, domain) {
630 		found = true;
631 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 			domain->iommu_coherency = 0;
633 			break;
634 		}
635 	}
636 	if (found)
637 		return;
638 
639 	/* No hardware attached; use lowest common denominator */
640 	rcu_read_lock();
641 	for_each_active_iommu(iommu, drhd) {
642 		if (!iommu_paging_structure_coherency(iommu)) {
643 			domain->iommu_coherency = 0;
644 			break;
645 		}
646 	}
647 	rcu_read_unlock();
648 }
649 
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652 	struct dmar_drhd_unit *drhd;
653 	struct intel_iommu *iommu;
654 	int ret = 1;
655 
656 	rcu_read_lock();
657 	for_each_active_iommu(iommu, drhd) {
658 		if (iommu != skip) {
659 			if (!ecap_sc_support(iommu->ecap)) {
660 				ret = 0;
661 				break;
662 			}
663 		}
664 	}
665 	rcu_read_unlock();
666 
667 	return ret;
668 }
669 
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 					 struct intel_iommu *skip)
672 {
673 	struct dmar_drhd_unit *drhd;
674 	struct intel_iommu *iommu;
675 	int mask = 0x3;
676 
677 	if (!intel_iommu_superpage) {
678 		return 0;
679 	}
680 
681 	/* set iommu_superpage to the smallest common denominator */
682 	rcu_read_lock();
683 	for_each_active_iommu(iommu, drhd) {
684 		if (iommu != skip) {
685 			if (domain && domain_use_first_level(domain)) {
686 				if (!cap_fl1gp_support(iommu->cap))
687 					mask = 0x1;
688 			} else {
689 				mask &= cap_super_page_val(iommu->cap);
690 			}
691 
692 			if (!mask)
693 				break;
694 		}
695 	}
696 	rcu_read_unlock();
697 
698 	return fls(mask);
699 }
700 
701 /* Some capabilities may be different across iommus */
702 static void domain_update_iommu_cap(struct dmar_domain *domain)
703 {
704 	domain_update_iommu_coherency(domain);
705 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
706 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
707 }
708 
709 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
710 					 u8 devfn, int alloc)
711 {
712 	struct root_entry *root = &iommu->root_entry[bus];
713 	struct context_entry *context;
714 	u64 *entry;
715 
716 	entry = &root->lo;
717 	if (sm_supported(iommu)) {
718 		if (devfn >= 0x80) {
719 			devfn -= 0x80;
720 			entry = &root->hi;
721 		}
722 		devfn *= 2;
723 	}
724 	if (*entry & 1)
725 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
726 	else {
727 		unsigned long phy_addr;
728 		if (!alloc)
729 			return NULL;
730 
731 		context = alloc_pgtable_page(iommu->node);
732 		if (!context)
733 			return NULL;
734 
735 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
736 		phy_addr = virt_to_phys((void *)context);
737 		*entry = phy_addr | 1;
738 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
739 	}
740 	return &context[devfn];
741 }
742 
743 static bool attach_deferred(struct device *dev)
744 {
745 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
746 }
747 
748 /**
749  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750  *				 sub-hierarchy of a candidate PCI-PCI bridge
751  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752  * @bridge: the candidate PCI-PCI bridge
753  *
754  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
755  */
756 static bool
757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
758 {
759 	struct pci_dev *pdev, *pbridge;
760 
761 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
762 		return false;
763 
764 	pdev = to_pci_dev(dev);
765 	pbridge = to_pci_dev(bridge);
766 
767 	if (pbridge->subordinate &&
768 	    pbridge->subordinate->number <= pdev->bus->number &&
769 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
770 		return true;
771 
772 	return false;
773 }
774 
775 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
776 {
777 	struct dmar_drhd_unit *drhd;
778 	u32 vtbar;
779 	int rc;
780 
781 	/* We know that this device on this chipset has its own IOMMU.
782 	 * If we find it under a different IOMMU, then the BIOS is lying
783 	 * to us. Hope that the IOMMU for this device is actually
784 	 * disabled, and it needs no translation...
785 	 */
786 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
787 	if (rc) {
788 		/* "can't" happen */
789 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
790 		return false;
791 	}
792 	vtbar &= 0xffff0000;
793 
794 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
795 	drhd = dmar_find_matched_drhd_unit(pdev);
796 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
797 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
798 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
799 		return true;
800 	}
801 
802 	return false;
803 }
804 
805 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
806 {
807 	if (!iommu || iommu->drhd->ignored)
808 		return true;
809 
810 	if (dev_is_pci(dev)) {
811 		struct pci_dev *pdev = to_pci_dev(dev);
812 
813 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
814 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
815 		    quirk_ioat_snb_local_iommu(pdev))
816 			return true;
817 	}
818 
819 	return false;
820 }
821 
822 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
823 {
824 	struct dmar_drhd_unit *drhd = NULL;
825 	struct pci_dev *pdev = NULL;
826 	struct intel_iommu *iommu;
827 	struct device *tmp;
828 	u16 segment = 0;
829 	int i;
830 
831 	if (!dev)
832 		return NULL;
833 
834 	if (dev_is_pci(dev)) {
835 		struct pci_dev *pf_pdev;
836 
837 		pdev = pci_real_dma_dev(to_pci_dev(dev));
838 
839 		/* VFs aren't listed in scope tables; we need to look up
840 		 * the PF instead to find the IOMMU. */
841 		pf_pdev = pci_physfn(pdev);
842 		dev = &pf_pdev->dev;
843 		segment = pci_domain_nr(pdev->bus);
844 	} else if (has_acpi_companion(dev))
845 		dev = &ACPI_COMPANION(dev)->dev;
846 
847 	rcu_read_lock();
848 	for_each_iommu(iommu, drhd) {
849 		if (pdev && segment != drhd->segment)
850 			continue;
851 
852 		for_each_active_dev_scope(drhd->devices,
853 					  drhd->devices_cnt, i, tmp) {
854 			if (tmp == dev) {
855 				/* For a VF use its original BDF# not that of the PF
856 				 * which we used for the IOMMU lookup. Strictly speaking
857 				 * we could do this for all PCI devices; we only need to
858 				 * get the BDF# from the scope table for ACPI matches. */
859 				if (pdev && pdev->is_virtfn)
860 					goto got_pdev;
861 
862 				if (bus && devfn) {
863 					*bus = drhd->devices[i].bus;
864 					*devfn = drhd->devices[i].devfn;
865 				}
866 				goto out;
867 			}
868 
869 			if (is_downstream_to_pci_bridge(dev, tmp))
870 				goto got_pdev;
871 		}
872 
873 		if (pdev && drhd->include_all) {
874 		got_pdev:
875 			if (bus && devfn) {
876 				*bus = pdev->bus->number;
877 				*devfn = pdev->devfn;
878 			}
879 			goto out;
880 		}
881 	}
882 	iommu = NULL;
883  out:
884 	if (iommu_is_dummy(iommu, dev))
885 		iommu = NULL;
886 
887 	rcu_read_unlock();
888 
889 	return iommu;
890 }
891 
892 static void domain_flush_cache(struct dmar_domain *domain,
893 			       void *addr, int size)
894 {
895 	if (!domain->iommu_coherency)
896 		clflush_cache_range(addr, size);
897 }
898 
899 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
900 {
901 	struct context_entry *context;
902 	int ret = 0;
903 	unsigned long flags;
904 
905 	spin_lock_irqsave(&iommu->lock, flags);
906 	context = iommu_context_addr(iommu, bus, devfn, 0);
907 	if (context)
908 		ret = context_present(context);
909 	spin_unlock_irqrestore(&iommu->lock, flags);
910 	return ret;
911 }
912 
913 static void free_context_table(struct intel_iommu *iommu)
914 {
915 	int i;
916 	unsigned long flags;
917 	struct context_entry *context;
918 
919 	spin_lock_irqsave(&iommu->lock, flags);
920 	if (!iommu->root_entry) {
921 		goto out;
922 	}
923 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
924 		context = iommu_context_addr(iommu, i, 0, 0);
925 		if (context)
926 			free_pgtable_page(context);
927 
928 		if (!sm_supported(iommu))
929 			continue;
930 
931 		context = iommu_context_addr(iommu, i, 0x80, 0);
932 		if (context)
933 			free_pgtable_page(context);
934 
935 	}
936 	free_pgtable_page(iommu->root_entry);
937 	iommu->root_entry = NULL;
938 out:
939 	spin_unlock_irqrestore(&iommu->lock, flags);
940 }
941 
942 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
943 				      unsigned long pfn, int *target_level)
944 {
945 	struct dma_pte *parent, *pte;
946 	int level = agaw_to_level(domain->agaw);
947 	int offset;
948 
949 	BUG_ON(!domain->pgd);
950 
951 	if (!domain_pfn_supported(domain, pfn))
952 		/* Address beyond IOMMU's addressing capabilities. */
953 		return NULL;
954 
955 	parent = domain->pgd;
956 
957 	while (1) {
958 		void *tmp_page;
959 
960 		offset = pfn_level_offset(pfn, level);
961 		pte = &parent[offset];
962 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
963 			break;
964 		if (level == *target_level)
965 			break;
966 
967 		if (!dma_pte_present(pte)) {
968 			uint64_t pteval;
969 
970 			tmp_page = alloc_pgtable_page(domain->nid);
971 
972 			if (!tmp_page)
973 				return NULL;
974 
975 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
976 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
977 			if (domain_use_first_level(domain))
978 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
979 			if (cmpxchg64(&pte->val, 0ULL, pteval))
980 				/* Someone else set it while we were thinking; use theirs. */
981 				free_pgtable_page(tmp_page);
982 			else
983 				domain_flush_cache(domain, pte, sizeof(*pte));
984 		}
985 		if (level == 1)
986 			break;
987 
988 		parent = phys_to_virt(dma_pte_addr(pte));
989 		level--;
990 	}
991 
992 	if (!*target_level)
993 		*target_level = level;
994 
995 	return pte;
996 }
997 
998 /* return address's pte at specific level */
999 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1000 					 unsigned long pfn,
1001 					 int level, int *large_page)
1002 {
1003 	struct dma_pte *parent, *pte;
1004 	int total = agaw_to_level(domain->agaw);
1005 	int offset;
1006 
1007 	parent = domain->pgd;
1008 	while (level <= total) {
1009 		offset = pfn_level_offset(pfn, total);
1010 		pte = &parent[offset];
1011 		if (level == total)
1012 			return pte;
1013 
1014 		if (!dma_pte_present(pte)) {
1015 			*large_page = total;
1016 			break;
1017 		}
1018 
1019 		if (dma_pte_superpage(pte)) {
1020 			*large_page = total;
1021 			return pte;
1022 		}
1023 
1024 		parent = phys_to_virt(dma_pte_addr(pte));
1025 		total--;
1026 	}
1027 	return NULL;
1028 }
1029 
1030 /* clear last level pte, a tlb flush should be followed */
1031 static void dma_pte_clear_range(struct dmar_domain *domain,
1032 				unsigned long start_pfn,
1033 				unsigned long last_pfn)
1034 {
1035 	unsigned int large_page;
1036 	struct dma_pte *first_pte, *pte;
1037 
1038 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040 	BUG_ON(start_pfn > last_pfn);
1041 
1042 	/* we don't need lock here; nobody else touches the iova range */
1043 	do {
1044 		large_page = 1;
1045 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1046 		if (!pte) {
1047 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1048 			continue;
1049 		}
1050 		do {
1051 			dma_clear_pte(pte);
1052 			start_pfn += lvl_to_nr_pages(large_page);
1053 			pte++;
1054 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1055 
1056 		domain_flush_cache(domain, first_pte,
1057 				   (void *)pte - (void *)first_pte);
1058 
1059 	} while (start_pfn && start_pfn <= last_pfn);
1060 }
1061 
1062 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063 			       int retain_level, struct dma_pte *pte,
1064 			       unsigned long pfn, unsigned long start_pfn,
1065 			       unsigned long last_pfn)
1066 {
1067 	pfn = max(start_pfn, pfn);
1068 	pte = &pte[pfn_level_offset(pfn, level)];
1069 
1070 	do {
1071 		unsigned long level_pfn;
1072 		struct dma_pte *level_pte;
1073 
1074 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1075 			goto next;
1076 
1077 		level_pfn = pfn & level_mask(level);
1078 		level_pte = phys_to_virt(dma_pte_addr(pte));
1079 
1080 		if (level > 2) {
1081 			dma_pte_free_level(domain, level - 1, retain_level,
1082 					   level_pte, level_pfn, start_pfn,
1083 					   last_pfn);
1084 		}
1085 
1086 		/*
1087 		 * Free the page table if we're below the level we want to
1088 		 * retain and the range covers the entire table.
1089 		 */
1090 		if (level < retain_level && !(start_pfn > level_pfn ||
1091 		      last_pfn < level_pfn + level_size(level) - 1)) {
1092 			dma_clear_pte(pte);
1093 			domain_flush_cache(domain, pte, sizeof(*pte));
1094 			free_pgtable_page(level_pte);
1095 		}
1096 next:
1097 		pfn += level_size(level);
1098 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1099 }
1100 
1101 /*
1102  * clear last level (leaf) ptes and free page table pages below the
1103  * level we wish to keep intact.
1104  */
1105 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106 				   unsigned long start_pfn,
1107 				   unsigned long last_pfn,
1108 				   int retain_level)
1109 {
1110 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112 	BUG_ON(start_pfn > last_pfn);
1113 
1114 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1115 
1116 	/* We don't need lock here; nobody else touches the iova range */
1117 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118 			   domain->pgd, 0, start_pfn, last_pfn);
1119 
1120 	/* free pgd */
1121 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122 		free_pgtable_page(domain->pgd);
1123 		domain->pgd = NULL;
1124 	}
1125 }
1126 
1127 /* When a page at a given level is being unlinked from its parent, we don't
1128    need to *modify* it at all. All we need to do is make a list of all the
1129    pages which can be freed just as soon as we've flushed the IOTLB and we
1130    know the hardware page-walk will no longer touch them.
1131    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1132    be freed. */
1133 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134 					    int level, struct dma_pte *pte,
1135 					    struct page *freelist)
1136 {
1137 	struct page *pg;
1138 
1139 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140 	pg->freelist = freelist;
1141 	freelist = pg;
1142 
1143 	if (level == 1)
1144 		return freelist;
1145 
1146 	pte = page_address(pg);
1147 	do {
1148 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149 			freelist = dma_pte_list_pagetables(domain, level - 1,
1150 							   pte, freelist);
1151 		pte++;
1152 	} while (!first_pte_in_page(pte));
1153 
1154 	return freelist;
1155 }
1156 
1157 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158 					struct dma_pte *pte, unsigned long pfn,
1159 					unsigned long start_pfn,
1160 					unsigned long last_pfn,
1161 					struct page *freelist)
1162 {
1163 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1164 
1165 	pfn = max(start_pfn, pfn);
1166 	pte = &pte[pfn_level_offset(pfn, level)];
1167 
1168 	do {
1169 		unsigned long level_pfn;
1170 
1171 		if (!dma_pte_present(pte))
1172 			goto next;
1173 
1174 		level_pfn = pfn & level_mask(level);
1175 
1176 		/* If range covers entire pagetable, free it */
1177 		if (start_pfn <= level_pfn &&
1178 		    last_pfn >= level_pfn + level_size(level) - 1) {
1179 			/* These suborbinate page tables are going away entirely. Don't
1180 			   bother to clear them; we're just going to *free* them. */
1181 			if (level > 1 && !dma_pte_superpage(pte))
1182 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1183 
1184 			dma_clear_pte(pte);
1185 			if (!first_pte)
1186 				first_pte = pte;
1187 			last_pte = pte;
1188 		} else if (level > 1) {
1189 			/* Recurse down into a level that isn't *entirely* obsolete */
1190 			freelist = dma_pte_clear_level(domain, level - 1,
1191 						       phys_to_virt(dma_pte_addr(pte)),
1192 						       level_pfn, start_pfn, last_pfn,
1193 						       freelist);
1194 		}
1195 next:
1196 		pfn += level_size(level);
1197 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1198 
1199 	if (first_pte)
1200 		domain_flush_cache(domain, first_pte,
1201 				   (void *)++last_pte - (void *)first_pte);
1202 
1203 	return freelist;
1204 }
1205 
1206 /* We can't just free the pages because the IOMMU may still be walking
1207    the page tables, and may have cached the intermediate levels. The
1208    pages can only be freed after the IOTLB flush has been done. */
1209 static struct page *domain_unmap(struct dmar_domain *domain,
1210 				 unsigned long start_pfn,
1211 				 unsigned long last_pfn)
1212 {
1213 	struct page *freelist;
1214 
1215 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217 	BUG_ON(start_pfn > last_pfn);
1218 
1219 	/* we don't need lock here; nobody else touches the iova range */
1220 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1222 
1223 	/* free pgd */
1224 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225 		struct page *pgd_page = virt_to_page(domain->pgd);
1226 		pgd_page->freelist = freelist;
1227 		freelist = pgd_page;
1228 
1229 		domain->pgd = NULL;
1230 	}
1231 
1232 	return freelist;
1233 }
1234 
1235 static void dma_free_pagelist(struct page *freelist)
1236 {
1237 	struct page *pg;
1238 
1239 	while ((pg = freelist)) {
1240 		freelist = pg->freelist;
1241 		free_pgtable_page(page_address(pg));
1242 	}
1243 }
1244 
1245 static void iova_entry_free(unsigned long data)
1246 {
1247 	struct page *freelist = (struct page *)data;
1248 
1249 	dma_free_pagelist(freelist);
1250 }
1251 
1252 /* iommu handling */
1253 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1254 {
1255 	struct root_entry *root;
1256 	unsigned long flags;
1257 
1258 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1259 	if (!root) {
1260 		pr_err("Allocating root entry for %s failed\n",
1261 			iommu->name);
1262 		return -ENOMEM;
1263 	}
1264 
1265 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1266 
1267 	spin_lock_irqsave(&iommu->lock, flags);
1268 	iommu->root_entry = root;
1269 	spin_unlock_irqrestore(&iommu->lock, flags);
1270 
1271 	return 0;
1272 }
1273 
1274 static void iommu_set_root_entry(struct intel_iommu *iommu)
1275 {
1276 	u64 addr;
1277 	u32 sts;
1278 	unsigned long flag;
1279 
1280 	addr = virt_to_phys(iommu->root_entry);
1281 	if (sm_supported(iommu))
1282 		addr |= DMA_RTADDR_SMT;
1283 
1284 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1286 
1287 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1288 
1289 	/* Make sure hardware complete it */
1290 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291 		      readl, (sts & DMA_GSTS_RTPS), sts);
1292 
1293 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294 }
1295 
1296 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1297 {
1298 	u32 val;
1299 	unsigned long flag;
1300 
1301 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302 		return;
1303 
1304 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1306 
1307 	/* Make sure hardware complete it */
1308 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1310 
1311 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312 }
1313 
1314 /* return value determine if we need a write buffer flush */
1315 static void __iommu_flush_context(struct intel_iommu *iommu,
1316 				  u16 did, u16 source_id, u8 function_mask,
1317 				  u64 type)
1318 {
1319 	u64 val = 0;
1320 	unsigned long flag;
1321 
1322 	switch (type) {
1323 	case DMA_CCMD_GLOBAL_INVL:
1324 		val = DMA_CCMD_GLOBAL_INVL;
1325 		break;
1326 	case DMA_CCMD_DOMAIN_INVL:
1327 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1328 		break;
1329 	case DMA_CCMD_DEVICE_INVL:
1330 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1332 		break;
1333 	default:
1334 		BUG();
1335 	}
1336 	val |= DMA_CCMD_ICC;
1337 
1338 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1340 
1341 	/* Make sure hardware complete it */
1342 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1344 
1345 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346 }
1347 
1348 /* return value determine if we need a write buffer flush */
1349 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350 				u64 addr, unsigned int size_order, u64 type)
1351 {
1352 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353 	u64 val = 0, val_iva = 0;
1354 	unsigned long flag;
1355 
1356 	switch (type) {
1357 	case DMA_TLB_GLOBAL_FLUSH:
1358 		/* global flush doesn't need set IVA_REG */
1359 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1360 		break;
1361 	case DMA_TLB_DSI_FLUSH:
1362 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363 		break;
1364 	case DMA_TLB_PSI_FLUSH:
1365 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366 		/* IH bit is passed in as part of address */
1367 		val_iva = size_order | addr;
1368 		break;
1369 	default:
1370 		BUG();
1371 	}
1372 	/* Note: set drain read/write */
1373 #if 0
1374 	/*
1375 	 * This is probably to be super secure.. Looks like we can
1376 	 * ignore it without any impact.
1377 	 */
1378 	if (cap_read_drain(iommu->cap))
1379 		val |= DMA_TLB_READ_DRAIN;
1380 #endif
1381 	if (cap_write_drain(iommu->cap))
1382 		val |= DMA_TLB_WRITE_DRAIN;
1383 
1384 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385 	/* Note: Only uses first TLB reg currently */
1386 	if (val_iva)
1387 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1389 
1390 	/* Make sure hardware complete it */
1391 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1393 
1394 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395 
1396 	/* check IOTLB invalidation granularity */
1397 	if (DMA_TLB_IAIG(val) == 0)
1398 		pr_err("Flush IOTLB failed\n");
1399 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1401 			(unsigned long long)DMA_TLB_IIRG(type),
1402 			(unsigned long long)DMA_TLB_IAIG(val));
1403 }
1404 
1405 static struct device_domain_info *
1406 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1407 			 u8 bus, u8 devfn)
1408 {
1409 	struct device_domain_info *info;
1410 
1411 	assert_spin_locked(&device_domain_lock);
1412 
1413 	if (!iommu->qi)
1414 		return NULL;
1415 
1416 	list_for_each_entry(info, &domain->devices, link)
1417 		if (info->iommu == iommu && info->bus == bus &&
1418 		    info->devfn == devfn) {
1419 			if (info->ats_supported && info->dev)
1420 				return info;
1421 			break;
1422 		}
1423 
1424 	return NULL;
1425 }
1426 
1427 static void domain_update_iotlb(struct dmar_domain *domain)
1428 {
1429 	struct device_domain_info *info;
1430 	bool has_iotlb_device = false;
1431 
1432 	assert_spin_locked(&device_domain_lock);
1433 
1434 	list_for_each_entry(info, &domain->devices, link) {
1435 		struct pci_dev *pdev;
1436 
1437 		if (!info->dev || !dev_is_pci(info->dev))
1438 			continue;
1439 
1440 		pdev = to_pci_dev(info->dev);
1441 		if (pdev->ats_enabled) {
1442 			has_iotlb_device = true;
1443 			break;
1444 		}
1445 	}
1446 
1447 	domain->has_iotlb_device = has_iotlb_device;
1448 }
1449 
1450 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1451 {
1452 	struct pci_dev *pdev;
1453 
1454 	assert_spin_locked(&device_domain_lock);
1455 
1456 	if (!info || !dev_is_pci(info->dev))
1457 		return;
1458 
1459 	pdev = to_pci_dev(info->dev);
1460 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1461 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1462 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1463 	 * reserved, which should be set to 0.
1464 	 */
1465 	if (!ecap_dit(info->iommu->ecap))
1466 		info->pfsid = 0;
1467 	else {
1468 		struct pci_dev *pf_pdev;
1469 
1470 		/* pdev will be returned if device is not a vf */
1471 		pf_pdev = pci_physfn(pdev);
1472 		info->pfsid = pci_dev_id(pf_pdev);
1473 	}
1474 
1475 #ifdef CONFIG_INTEL_IOMMU_SVM
1476 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1477 	   the device if you enable PASID support after ATS support is
1478 	   undefined. So always enable PASID support on devices which
1479 	   have it, even if we can't yet know if we're ever going to
1480 	   use it. */
1481 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482 		info->pasid_enabled = 1;
1483 
1484 	if (info->pri_supported &&
1485 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1486 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487 		info->pri_enabled = 1;
1488 #endif
1489 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491 		info->ats_enabled = 1;
1492 		domain_update_iotlb(info->domain);
1493 		info->ats_qdep = pci_ats_queue_depth(pdev);
1494 	}
1495 }
1496 
1497 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498 {
1499 	struct pci_dev *pdev;
1500 
1501 	assert_spin_locked(&device_domain_lock);
1502 
1503 	if (!dev_is_pci(info->dev))
1504 		return;
1505 
1506 	pdev = to_pci_dev(info->dev);
1507 
1508 	if (info->ats_enabled) {
1509 		pci_disable_ats(pdev);
1510 		info->ats_enabled = 0;
1511 		domain_update_iotlb(info->domain);
1512 	}
1513 #ifdef CONFIG_INTEL_IOMMU_SVM
1514 	if (info->pri_enabled) {
1515 		pci_disable_pri(pdev);
1516 		info->pri_enabled = 0;
1517 	}
1518 	if (info->pasid_enabled) {
1519 		pci_disable_pasid(pdev);
1520 		info->pasid_enabled = 0;
1521 	}
1522 #endif
1523 }
1524 
1525 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526 				  u64 addr, unsigned mask)
1527 {
1528 	u16 sid, qdep;
1529 	unsigned long flags;
1530 	struct device_domain_info *info;
1531 
1532 	if (!domain->has_iotlb_device)
1533 		return;
1534 
1535 	spin_lock_irqsave(&device_domain_lock, flags);
1536 	list_for_each_entry(info, &domain->devices, link) {
1537 		if (!info->ats_enabled)
1538 			continue;
1539 
1540 		sid = info->bus << 8 | info->devfn;
1541 		qdep = info->ats_qdep;
1542 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1543 				qdep, addr, mask);
1544 	}
1545 	spin_unlock_irqrestore(&device_domain_lock, flags);
1546 }
1547 
1548 static void domain_flush_piotlb(struct intel_iommu *iommu,
1549 				struct dmar_domain *domain,
1550 				u64 addr, unsigned long npages, bool ih)
1551 {
1552 	u16 did = domain->iommu_did[iommu->seq_id];
1553 
1554 	if (domain->default_pasid)
1555 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1556 				addr, npages, ih);
1557 
1558 	if (!list_empty(&domain->devices))
1559 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1560 }
1561 
1562 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563 				  struct dmar_domain *domain,
1564 				  unsigned long pfn, unsigned int pages,
1565 				  int ih, int map)
1566 {
1567 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569 	u16 did = domain->iommu_did[iommu->seq_id];
1570 
1571 	BUG_ON(pages == 0);
1572 
1573 	if (ih)
1574 		ih = 1 << 6;
1575 
1576 	if (domain_use_first_level(domain)) {
1577 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1578 	} else {
1579 		/*
1580 		 * Fallback to domain selective flush if no PSI support or
1581 		 * the size is too big. PSI requires page size to be 2 ^ x,
1582 		 * and the base address is naturally aligned to the size.
1583 		 */
1584 		if (!cap_pgsel_inv(iommu->cap) ||
1585 		    mask > cap_max_amask_val(iommu->cap))
1586 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587 							DMA_TLB_DSI_FLUSH);
1588 		else
1589 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1590 							DMA_TLB_PSI_FLUSH);
1591 	}
1592 
1593 	/*
1594 	 * In caching mode, changes of pages from non-present to present require
1595 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1596 	 */
1597 	if (!cap_caching_mode(iommu->cap) || !map)
1598 		iommu_flush_dev_iotlb(domain, addr, mask);
1599 }
1600 
1601 /* Notification for newly created mappings */
1602 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603 					struct dmar_domain *domain,
1604 					unsigned long pfn, unsigned int pages)
1605 {
1606 	/*
1607 	 * It's a non-present to present mapping. Only flush if caching mode
1608 	 * and second level.
1609 	 */
1610 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1612 	else
1613 		iommu_flush_write_buffer(iommu);
1614 }
1615 
1616 static void iommu_flush_iova(struct iova_domain *iovad)
1617 {
1618 	struct dmar_domain *domain;
1619 	int idx;
1620 
1621 	domain = container_of(iovad, struct dmar_domain, iovad);
1622 
1623 	for_each_domain_iommu(idx, domain) {
1624 		struct intel_iommu *iommu = g_iommus[idx];
1625 		u16 did = domain->iommu_did[iommu->seq_id];
1626 
1627 		if (domain_use_first_level(domain))
1628 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1629 		else
1630 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1631 						 DMA_TLB_DSI_FLUSH);
1632 
1633 		if (!cap_caching_mode(iommu->cap))
1634 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635 					      0, MAX_AGAW_PFN_WIDTH);
1636 	}
1637 }
1638 
1639 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1640 {
1641 	u32 pmen;
1642 	unsigned long flags;
1643 
1644 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1645 		return;
1646 
1647 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649 	pmen &= ~DMA_PMEN_EPM;
1650 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1651 
1652 	/* wait for the protected region status bit to clear */
1653 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1655 
1656 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1657 }
1658 
1659 static void iommu_enable_translation(struct intel_iommu *iommu)
1660 {
1661 	u32 sts;
1662 	unsigned long flags;
1663 
1664 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665 	iommu->gcmd |= DMA_GCMD_TE;
1666 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1667 
1668 	/* Make sure hardware complete it */
1669 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670 		      readl, (sts & DMA_GSTS_TES), sts);
1671 
1672 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1673 }
1674 
1675 static void iommu_disable_translation(struct intel_iommu *iommu)
1676 {
1677 	u32 sts;
1678 	unsigned long flag;
1679 
1680 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1682 		return;
1683 
1684 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685 	iommu->gcmd &= ~DMA_GCMD_TE;
1686 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687 
1688 	/* Make sure hardware complete it */
1689 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1691 
1692 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693 }
1694 
1695 static int iommu_init_domains(struct intel_iommu *iommu)
1696 {
1697 	u32 ndomains, nlongs;
1698 	size_t size;
1699 
1700 	ndomains = cap_ndoms(iommu->cap);
1701 	pr_debug("%s: Number of Domains supported <%d>\n",
1702 		 iommu->name, ndomains);
1703 	nlongs = BITS_TO_LONGS(ndomains);
1704 
1705 	spin_lock_init(&iommu->lock);
1706 
1707 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708 	if (!iommu->domain_ids) {
1709 		pr_err("%s: Allocating domain id array failed\n",
1710 		       iommu->name);
1711 		return -ENOMEM;
1712 	}
1713 
1714 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715 	iommu->domains = kzalloc(size, GFP_KERNEL);
1716 
1717 	if (iommu->domains) {
1718 		size = 256 * sizeof(struct dmar_domain *);
1719 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1720 	}
1721 
1722 	if (!iommu->domains || !iommu->domains[0]) {
1723 		pr_err("%s: Allocating domain array failed\n",
1724 		       iommu->name);
1725 		kfree(iommu->domain_ids);
1726 		kfree(iommu->domains);
1727 		iommu->domain_ids = NULL;
1728 		iommu->domains    = NULL;
1729 		return -ENOMEM;
1730 	}
1731 
1732 	/*
1733 	 * If Caching mode is set, then invalid translations are tagged
1734 	 * with domain-id 0, hence we need to pre-allocate it. We also
1735 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1736 	 * make sure it is not used for a real domain.
1737 	 */
1738 	set_bit(0, iommu->domain_ids);
1739 
1740 	/*
1741 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1742 	 * entry for first-level or pass-through translation modes should
1743 	 * be programmed with a domain id different from those used for
1744 	 * second-level or nested translation. We reserve a domain id for
1745 	 * this purpose.
1746 	 */
1747 	if (sm_supported(iommu))
1748 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1749 
1750 	return 0;
1751 }
1752 
1753 static void disable_dmar_iommu(struct intel_iommu *iommu)
1754 {
1755 	struct device_domain_info *info, *tmp;
1756 	unsigned long flags;
1757 
1758 	if (!iommu->domains || !iommu->domain_ids)
1759 		return;
1760 
1761 	spin_lock_irqsave(&device_domain_lock, flags);
1762 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763 		if (info->iommu != iommu)
1764 			continue;
1765 
1766 		if (!info->dev || !info->domain)
1767 			continue;
1768 
1769 		__dmar_remove_one_dev_info(info);
1770 	}
1771 	spin_unlock_irqrestore(&device_domain_lock, flags);
1772 
1773 	if (iommu->gcmd & DMA_GCMD_TE)
1774 		iommu_disable_translation(iommu);
1775 }
1776 
1777 static void free_dmar_iommu(struct intel_iommu *iommu)
1778 {
1779 	if ((iommu->domains) && (iommu->domain_ids)) {
1780 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1781 		int i;
1782 
1783 		for (i = 0; i < elems; i++)
1784 			kfree(iommu->domains[i]);
1785 		kfree(iommu->domains);
1786 		kfree(iommu->domain_ids);
1787 		iommu->domains = NULL;
1788 		iommu->domain_ids = NULL;
1789 	}
1790 
1791 	g_iommus[iommu->seq_id] = NULL;
1792 
1793 	/* free context mapping */
1794 	free_context_table(iommu);
1795 
1796 #ifdef CONFIG_INTEL_IOMMU_SVM
1797 	if (pasid_supported(iommu)) {
1798 		if (ecap_prs(iommu->ecap))
1799 			intel_svm_finish_prq(iommu);
1800 	}
1801 	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1803 
1804 #endif
1805 }
1806 
1807 /*
1808  * Check and return whether first level is used by default for
1809  * DMA translation.
1810  */
1811 static bool first_level_by_default(void)
1812 {
1813 	struct dmar_drhd_unit *drhd;
1814 	struct intel_iommu *iommu;
1815 	static int first_level_support = -1;
1816 
1817 	if (likely(first_level_support != -1))
1818 		return first_level_support;
1819 
1820 	first_level_support = 1;
1821 
1822 	rcu_read_lock();
1823 	for_each_active_iommu(iommu, drhd) {
1824 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825 			first_level_support = 0;
1826 			break;
1827 		}
1828 	}
1829 	rcu_read_unlock();
1830 
1831 	return first_level_support;
1832 }
1833 
1834 static struct dmar_domain *alloc_domain(int flags)
1835 {
1836 	struct dmar_domain *domain;
1837 
1838 	domain = alloc_domain_mem();
1839 	if (!domain)
1840 		return NULL;
1841 
1842 	memset(domain, 0, sizeof(*domain));
1843 	domain->nid = NUMA_NO_NODE;
1844 	domain->flags = flags;
1845 	if (first_level_by_default())
1846 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847 	domain->has_iotlb_device = false;
1848 	INIT_LIST_HEAD(&domain->devices);
1849 
1850 	return domain;
1851 }
1852 
1853 /* Must be called with iommu->lock */
1854 static int domain_attach_iommu(struct dmar_domain *domain,
1855 			       struct intel_iommu *iommu)
1856 {
1857 	unsigned long ndomains;
1858 	int num;
1859 
1860 	assert_spin_locked(&device_domain_lock);
1861 	assert_spin_locked(&iommu->lock);
1862 
1863 	domain->iommu_refcnt[iommu->seq_id] += 1;
1864 	domain->iommu_count += 1;
1865 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866 		ndomains = cap_ndoms(iommu->cap);
1867 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1868 
1869 		if (num >= ndomains) {
1870 			pr_err("%s: No free domain ids\n", iommu->name);
1871 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 			domain->iommu_count -= 1;
1873 			return -ENOSPC;
1874 		}
1875 
1876 		set_bit(num, iommu->domain_ids);
1877 		set_iommu_domain(iommu, num, domain);
1878 
1879 		domain->iommu_did[iommu->seq_id] = num;
1880 		domain->nid			 = iommu->node;
1881 
1882 		domain_update_iommu_cap(domain);
1883 	}
1884 
1885 	return 0;
1886 }
1887 
1888 static int domain_detach_iommu(struct dmar_domain *domain,
1889 			       struct intel_iommu *iommu)
1890 {
1891 	int num, count;
1892 
1893 	assert_spin_locked(&device_domain_lock);
1894 	assert_spin_locked(&iommu->lock);
1895 
1896 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1897 	count = --domain->iommu_count;
1898 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899 		num = domain->iommu_did[iommu->seq_id];
1900 		clear_bit(num, iommu->domain_ids);
1901 		set_iommu_domain(iommu, num, NULL);
1902 
1903 		domain_update_iommu_cap(domain);
1904 		domain->iommu_did[iommu->seq_id] = 0;
1905 	}
1906 
1907 	return count;
1908 }
1909 
1910 static struct iova_domain reserved_iova_list;
1911 static struct lock_class_key reserved_rbtree_key;
1912 
1913 static int dmar_init_reserved_ranges(void)
1914 {
1915 	struct pci_dev *pdev = NULL;
1916 	struct iova *iova;
1917 	int i;
1918 
1919 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1920 
1921 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922 		&reserved_rbtree_key);
1923 
1924 	/* IOAPIC ranges shouldn't be accessed by DMA */
1925 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926 		IOVA_PFN(IOAPIC_RANGE_END));
1927 	if (!iova) {
1928 		pr_err("Reserve IOAPIC range failed\n");
1929 		return -ENODEV;
1930 	}
1931 
1932 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1933 	for_each_pci_dev(pdev) {
1934 		struct resource *r;
1935 
1936 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937 			r = &pdev->resource[i];
1938 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1939 				continue;
1940 			iova = reserve_iova(&reserved_iova_list,
1941 					    IOVA_PFN(r->start),
1942 					    IOVA_PFN(r->end));
1943 			if (!iova) {
1944 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1945 				return -ENODEV;
1946 			}
1947 		}
1948 	}
1949 	return 0;
1950 }
1951 
1952 static inline int guestwidth_to_adjustwidth(int gaw)
1953 {
1954 	int agaw;
1955 	int r = (gaw - 12) % 9;
1956 
1957 	if (r == 0)
1958 		agaw = gaw;
1959 	else
1960 		agaw = gaw + 9 - r;
1961 	if (agaw > 64)
1962 		agaw = 64;
1963 	return agaw;
1964 }
1965 
1966 static void domain_exit(struct dmar_domain *domain)
1967 {
1968 
1969 	/* Remove associated devices and clear attached or cached domains */
1970 	domain_remove_dev_info(domain);
1971 
1972 	/* destroy iovas */
1973 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974 		put_iova_domain(&domain->iovad);
1975 
1976 	if (domain->pgd) {
1977 		struct page *freelist;
1978 
1979 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980 		dma_free_pagelist(freelist);
1981 	}
1982 
1983 	free_domain_mem(domain);
1984 }
1985 
1986 /*
1987  * Get the PASID directory size for scalable mode context entry.
1988  * Value of X in the PDTS field of a scalable mode context entry
1989  * indicates PASID directory with 2^(X + 7) entries.
1990  */
1991 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1992 {
1993 	int pds, max_pde;
1994 
1995 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1997 	if (pds < 7)
1998 		return 0;
1999 
2000 	return pds - 7;
2001 }
2002 
2003 /*
2004  * Set the RID_PASID field of a scalable mode context entry. The
2005  * IOMMU hardware will use the PASID value set in this field for
2006  * DMA translations of DMA requests without PASID.
2007  */
2008 static inline void
2009 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2010 {
2011 	context->hi |= pasid & ((1 << 20) - 1);
2012 }
2013 
2014 /*
2015  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2016  * entry.
2017  */
2018 static inline void context_set_sm_dte(struct context_entry *context)
2019 {
2020 	context->lo |= (1 << 2);
2021 }
2022 
2023 /*
2024  * Set the PRE(Page Request Enable) field of a scalable mode context
2025  * entry.
2026  */
2027 static inline void context_set_sm_pre(struct context_entry *context)
2028 {
2029 	context->lo |= (1 << 4);
2030 }
2031 
2032 /* Convert value to context PASID directory size field coding. */
2033 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2034 
2035 static int domain_context_mapping_one(struct dmar_domain *domain,
2036 				      struct intel_iommu *iommu,
2037 				      struct pasid_table *table,
2038 				      u8 bus, u8 devfn)
2039 {
2040 	u16 did = domain->iommu_did[iommu->seq_id];
2041 	int translation = CONTEXT_TT_MULTI_LEVEL;
2042 	struct device_domain_info *info = NULL;
2043 	struct context_entry *context;
2044 	unsigned long flags;
2045 	int ret;
2046 
2047 	WARN_ON(did == 0);
2048 
2049 	if (hw_pass_through && domain_type_is_si(domain))
2050 		translation = CONTEXT_TT_PASS_THROUGH;
2051 
2052 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2053 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2054 
2055 	BUG_ON(!domain->pgd);
2056 
2057 	spin_lock_irqsave(&device_domain_lock, flags);
2058 	spin_lock(&iommu->lock);
2059 
2060 	ret = -ENOMEM;
2061 	context = iommu_context_addr(iommu, bus, devfn, 1);
2062 	if (!context)
2063 		goto out_unlock;
2064 
2065 	ret = 0;
2066 	if (context_present(context))
2067 		goto out_unlock;
2068 
2069 	/*
2070 	 * For kdump cases, old valid entries may be cached due to the
2071 	 * in-flight DMA and copied pgtable, but there is no unmapping
2072 	 * behaviour for them, thus we need an explicit cache flush for
2073 	 * the newly-mapped device. For kdump, at this point, the device
2074 	 * is supposed to finish reset at its driver probe stage, so no
2075 	 * in-flight DMA will exist, and we don't need to worry anymore
2076 	 * hereafter.
2077 	 */
2078 	if (context_copied(context)) {
2079 		u16 did_old = context_domain_id(context);
2080 
2081 		if (did_old < cap_ndoms(iommu->cap)) {
2082 			iommu->flush.flush_context(iommu, did_old,
2083 						   (((u16)bus) << 8) | devfn,
2084 						   DMA_CCMD_MASK_NOBIT,
2085 						   DMA_CCMD_DEVICE_INVL);
2086 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2087 						 DMA_TLB_DSI_FLUSH);
2088 		}
2089 	}
2090 
2091 	context_clear_entry(context);
2092 
2093 	if (sm_supported(iommu)) {
2094 		unsigned long pds;
2095 
2096 		WARN_ON(!table);
2097 
2098 		/* Setup the PASID DIR pointer: */
2099 		pds = context_get_sm_pds(table);
2100 		context->lo = (u64)virt_to_phys(table->table) |
2101 				context_pdts(pds);
2102 
2103 		/* Setup the RID_PASID field: */
2104 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2105 
2106 		/*
2107 		 * Setup the Device-TLB enable bit and Page request
2108 		 * Enable bit:
2109 		 */
2110 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111 		if (info && info->ats_supported)
2112 			context_set_sm_dte(context);
2113 		if (info && info->pri_supported)
2114 			context_set_sm_pre(context);
2115 	} else {
2116 		struct dma_pte *pgd = domain->pgd;
2117 		int agaw;
2118 
2119 		context_set_domain_id(context, did);
2120 
2121 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2122 			/*
2123 			 * Skip top levels of page tables for iommu which has
2124 			 * less agaw than default. Unnecessary for PT mode.
2125 			 */
2126 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2127 				ret = -ENOMEM;
2128 				pgd = phys_to_virt(dma_pte_addr(pgd));
2129 				if (!dma_pte_present(pgd))
2130 					goto out_unlock;
2131 			}
2132 
2133 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134 			if (info && info->ats_supported)
2135 				translation = CONTEXT_TT_DEV_IOTLB;
2136 			else
2137 				translation = CONTEXT_TT_MULTI_LEVEL;
2138 
2139 			context_set_address_root(context, virt_to_phys(pgd));
2140 			context_set_address_width(context, agaw);
2141 		} else {
2142 			/*
2143 			 * In pass through mode, AW must be programmed to
2144 			 * indicate the largest AGAW value supported by
2145 			 * hardware. And ASR is ignored by hardware.
2146 			 */
2147 			context_set_address_width(context, iommu->msagaw);
2148 		}
2149 
2150 		context_set_translation_type(context, translation);
2151 	}
2152 
2153 	context_set_fault_enable(context);
2154 	context_set_present(context);
2155 	if (!ecap_coherent(iommu->ecap))
2156 		clflush_cache_range(context, sizeof(*context));
2157 
2158 	/*
2159 	 * It's a non-present to present mapping. If hardware doesn't cache
2160 	 * non-present entry we only need to flush the write-buffer. If the
2161 	 * _does_ cache non-present entries, then it does so in the special
2162 	 * domain #0, which we have to flush:
2163 	 */
2164 	if (cap_caching_mode(iommu->cap)) {
2165 		iommu->flush.flush_context(iommu, 0,
2166 					   (((u16)bus) << 8) | devfn,
2167 					   DMA_CCMD_MASK_NOBIT,
2168 					   DMA_CCMD_DEVICE_INVL);
2169 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2170 	} else {
2171 		iommu_flush_write_buffer(iommu);
2172 	}
2173 	iommu_enable_dev_iotlb(info);
2174 
2175 	ret = 0;
2176 
2177 out_unlock:
2178 	spin_unlock(&iommu->lock);
2179 	spin_unlock_irqrestore(&device_domain_lock, flags);
2180 
2181 	return ret;
2182 }
2183 
2184 struct domain_context_mapping_data {
2185 	struct dmar_domain *domain;
2186 	struct intel_iommu *iommu;
2187 	struct pasid_table *table;
2188 };
2189 
2190 static int domain_context_mapping_cb(struct pci_dev *pdev,
2191 				     u16 alias, void *opaque)
2192 {
2193 	struct domain_context_mapping_data *data = opaque;
2194 
2195 	return domain_context_mapping_one(data->domain, data->iommu,
2196 					  data->table, PCI_BUS_NUM(alias),
2197 					  alias & 0xff);
2198 }
2199 
2200 static int
2201 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2202 {
2203 	struct domain_context_mapping_data data;
2204 	struct pasid_table *table;
2205 	struct intel_iommu *iommu;
2206 	u8 bus, devfn;
2207 
2208 	iommu = device_to_iommu(dev, &bus, &devfn);
2209 	if (!iommu)
2210 		return -ENODEV;
2211 
2212 	table = intel_pasid_get_table(dev);
2213 
2214 	if (!dev_is_pci(dev))
2215 		return domain_context_mapping_one(domain, iommu, table,
2216 						  bus, devfn);
2217 
2218 	data.domain = domain;
2219 	data.iommu = iommu;
2220 	data.table = table;
2221 
2222 	return pci_for_each_dma_alias(to_pci_dev(dev),
2223 				      &domain_context_mapping_cb, &data);
2224 }
2225 
2226 static int domain_context_mapped_cb(struct pci_dev *pdev,
2227 				    u16 alias, void *opaque)
2228 {
2229 	struct intel_iommu *iommu = opaque;
2230 
2231 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2232 }
2233 
2234 static int domain_context_mapped(struct device *dev)
2235 {
2236 	struct intel_iommu *iommu;
2237 	u8 bus, devfn;
2238 
2239 	iommu = device_to_iommu(dev, &bus, &devfn);
2240 	if (!iommu)
2241 		return -ENODEV;
2242 
2243 	if (!dev_is_pci(dev))
2244 		return device_context_mapped(iommu, bus, devfn);
2245 
2246 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2247 				       domain_context_mapped_cb, iommu);
2248 }
2249 
2250 /* Returns a number of VTD pages, but aligned to MM page size */
2251 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2252 					    size_t size)
2253 {
2254 	host_addr &= ~PAGE_MASK;
2255 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2256 }
2257 
2258 /* Return largest possible superpage level for a given mapping */
2259 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260 					  unsigned long iov_pfn,
2261 					  unsigned long phy_pfn,
2262 					  unsigned long pages)
2263 {
2264 	int support, level = 1;
2265 	unsigned long pfnmerge;
2266 
2267 	support = domain->iommu_superpage;
2268 
2269 	/* To use a large page, the virtual *and* physical addresses
2270 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271 	   of them will mean we have to use smaller pages. So just
2272 	   merge them and check both at once. */
2273 	pfnmerge = iov_pfn | phy_pfn;
2274 
2275 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276 		pages >>= VTD_STRIDE_SHIFT;
2277 		if (!pages)
2278 			break;
2279 		pfnmerge >>= VTD_STRIDE_SHIFT;
2280 		level++;
2281 		support--;
2282 	}
2283 	return level;
2284 }
2285 
2286 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287 			    struct scatterlist *sg, unsigned long phys_pfn,
2288 			    unsigned long nr_pages, int prot)
2289 {
2290 	struct dma_pte *first_pte = NULL, *pte = NULL;
2291 	phys_addr_t pteval;
2292 	unsigned long sg_res = 0;
2293 	unsigned int largepage_lvl = 0;
2294 	unsigned long lvl_pages = 0;
2295 	u64 attr;
2296 
2297 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2298 
2299 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2300 		return -EINVAL;
2301 
2302 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303 	if (domain_use_first_level(domain))
2304 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2305 
2306 	if (!sg) {
2307 		sg_res = nr_pages;
2308 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2309 	}
2310 
2311 	while (nr_pages > 0) {
2312 		uint64_t tmp;
2313 
2314 		if (!sg_res) {
2315 			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2316 
2317 			sg_res = aligned_nrpages(sg->offset, sg->length);
2318 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319 			sg->dma_length = sg->length;
2320 			pteval = (sg_phys(sg) - pgoff) | attr;
2321 			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2322 		}
2323 
2324 		if (!pte) {
2325 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2326 
2327 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328 			if (!pte)
2329 				return -ENOMEM;
2330 			/* It is large page*/
2331 			if (largepage_lvl > 1) {
2332 				unsigned long nr_superpages, end_pfn;
2333 
2334 				pteval |= DMA_PTE_LARGE_PAGE;
2335 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336 
2337 				nr_superpages = sg_res / lvl_pages;
2338 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339 
2340 				/*
2341 				 * Ensure that old small page tables are
2342 				 * removed to make room for superpage(s).
2343 				 * We're adding new large pages, so make sure
2344 				 * we don't remove their parent tables.
2345 				 */
2346 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347 						       largepage_lvl + 1);
2348 			} else {
2349 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350 			}
2351 
2352 		}
2353 		/* We don't need lock here, nobody else
2354 		 * touches the iova range
2355 		 */
2356 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357 		if (tmp) {
2358 			static int dumps = 5;
2359 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360 				iov_pfn, tmp, (unsigned long long)pteval);
2361 			if (dumps) {
2362 				dumps--;
2363 				debug_dma_dump_mappings(NULL);
2364 			}
2365 			WARN_ON(1);
2366 		}
2367 
2368 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369 
2370 		BUG_ON(nr_pages < lvl_pages);
2371 		BUG_ON(sg_res < lvl_pages);
2372 
2373 		nr_pages -= lvl_pages;
2374 		iov_pfn += lvl_pages;
2375 		phys_pfn += lvl_pages;
2376 		pteval += lvl_pages * VTD_PAGE_SIZE;
2377 		sg_res -= lvl_pages;
2378 
2379 		/* If the next PTE would be the first in a new page, then we
2380 		   need to flush the cache on the entries we've just written.
2381 		   And then we'll need to recalculate 'pte', so clear it and
2382 		   let it get set again in the if (!pte) block above.
2383 
2384 		   If we're done (!nr_pages) we need to flush the cache too.
2385 
2386 		   Also if we've been setting superpages, we may need to
2387 		   recalculate 'pte' and switch back to smaller pages for the
2388 		   end of the mapping, if the trailing size is not enough to
2389 		   use another superpage (i.e. sg_res < lvl_pages). */
2390 		pte++;
2391 		if (!nr_pages || first_pte_in_page(pte) ||
2392 		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393 			domain_flush_cache(domain, first_pte,
2394 					   (void *)pte - (void *)first_pte);
2395 			pte = NULL;
2396 		}
2397 
2398 		if (!sg_res && nr_pages)
2399 			sg = sg_next(sg);
2400 	}
2401 	return 0;
2402 }
2403 
2404 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405 			  struct scatterlist *sg, unsigned long phys_pfn,
2406 			  unsigned long nr_pages, int prot)
2407 {
2408 	int iommu_id, ret;
2409 	struct intel_iommu *iommu;
2410 
2411 	/* Do the real mapping first */
2412 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2413 	if (ret)
2414 		return ret;
2415 
2416 	for_each_domain_iommu(iommu_id, domain) {
2417 		iommu = g_iommus[iommu_id];
2418 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419 	}
2420 
2421 	return 0;
2422 }
2423 
2424 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425 				    struct scatterlist *sg, unsigned long nr_pages,
2426 				    int prot)
2427 {
2428 	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2429 }
2430 
2431 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432 				     unsigned long phys_pfn, unsigned long nr_pages,
2433 				     int prot)
2434 {
2435 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2436 }
2437 
2438 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2439 {
2440 	unsigned long flags;
2441 	struct context_entry *context;
2442 	u16 did_old;
2443 
2444 	if (!iommu)
2445 		return;
2446 
2447 	spin_lock_irqsave(&iommu->lock, flags);
2448 	context = iommu_context_addr(iommu, bus, devfn, 0);
2449 	if (!context) {
2450 		spin_unlock_irqrestore(&iommu->lock, flags);
2451 		return;
2452 	}
2453 	did_old = context_domain_id(context);
2454 	context_clear_entry(context);
2455 	__iommu_flush_cache(iommu, context, sizeof(*context));
2456 	spin_unlock_irqrestore(&iommu->lock, flags);
2457 	iommu->flush.flush_context(iommu,
2458 				   did_old,
2459 				   (((u16)bus) << 8) | devfn,
2460 				   DMA_CCMD_MASK_NOBIT,
2461 				   DMA_CCMD_DEVICE_INVL);
2462 	iommu->flush.flush_iotlb(iommu,
2463 				 did_old,
2464 				 0,
2465 				 0,
2466 				 DMA_TLB_DSI_FLUSH);
2467 }
2468 
2469 static inline void unlink_domain_info(struct device_domain_info *info)
2470 {
2471 	assert_spin_locked(&device_domain_lock);
2472 	list_del(&info->link);
2473 	list_del(&info->global);
2474 	if (info->dev)
2475 		dev_iommu_priv_set(info->dev, NULL);
2476 }
2477 
2478 static void domain_remove_dev_info(struct dmar_domain *domain)
2479 {
2480 	struct device_domain_info *info, *tmp;
2481 	unsigned long flags;
2482 
2483 	spin_lock_irqsave(&device_domain_lock, flags);
2484 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485 		__dmar_remove_one_dev_info(info);
2486 	spin_unlock_irqrestore(&device_domain_lock, flags);
2487 }
2488 
2489 struct dmar_domain *find_domain(struct device *dev)
2490 {
2491 	struct device_domain_info *info;
2492 
2493 	if (unlikely(attach_deferred(dev)))
2494 		return NULL;
2495 
2496 	/* No lock here, assumes no domain exit in normal case */
2497 	info = get_domain_info(dev);
2498 	if (likely(info))
2499 		return info->domain;
2500 
2501 	return NULL;
2502 }
2503 
2504 static void do_deferred_attach(struct device *dev)
2505 {
2506 	struct iommu_domain *domain;
2507 
2508 	dev_iommu_priv_set(dev, NULL);
2509 	domain = iommu_get_domain_for_dev(dev);
2510 	if (domain)
2511 		intel_iommu_attach_device(domain, dev);
2512 }
2513 
2514 static inline struct device_domain_info *
2515 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2516 {
2517 	struct device_domain_info *info;
2518 
2519 	list_for_each_entry(info, &device_domain_list, global)
2520 		if (info->segment == segment && info->bus == bus &&
2521 		    info->devfn == devfn)
2522 			return info;
2523 
2524 	return NULL;
2525 }
2526 
2527 static int domain_setup_first_level(struct intel_iommu *iommu,
2528 				    struct dmar_domain *domain,
2529 				    struct device *dev,
2530 				    int pasid)
2531 {
2532 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2533 	struct dma_pte *pgd = domain->pgd;
2534 	int agaw, level;
2535 
2536 	/*
2537 	 * Skip top levels of page tables for iommu which has
2538 	 * less agaw than default. Unnecessary for PT mode.
2539 	 */
2540 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541 		pgd = phys_to_virt(dma_pte_addr(pgd));
2542 		if (!dma_pte_present(pgd))
2543 			return -ENOMEM;
2544 	}
2545 
2546 	level = agaw_to_level(agaw);
2547 	if (level != 4 && level != 5)
2548 		return -EINVAL;
2549 
2550 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2551 
2552 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553 					     domain->iommu_did[iommu->seq_id],
2554 					     flags);
2555 }
2556 
2557 static bool dev_is_real_dma_subdevice(struct device *dev)
2558 {
2559 	return dev && dev_is_pci(dev) &&
2560 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2561 }
2562 
2563 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2564 						    int bus, int devfn,
2565 						    struct device *dev,
2566 						    struct dmar_domain *domain)
2567 {
2568 	struct dmar_domain *found = NULL;
2569 	struct device_domain_info *info;
2570 	unsigned long flags;
2571 	int ret;
2572 
2573 	info = alloc_devinfo_mem();
2574 	if (!info)
2575 		return NULL;
2576 
2577 	if (!dev_is_real_dma_subdevice(dev)) {
2578 		info->bus = bus;
2579 		info->devfn = devfn;
2580 		info->segment = iommu->segment;
2581 	} else {
2582 		struct pci_dev *pdev = to_pci_dev(dev);
2583 
2584 		info->bus = pdev->bus->number;
2585 		info->devfn = pdev->devfn;
2586 		info->segment = pci_domain_nr(pdev->bus);
2587 	}
2588 
2589 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2591 	info->ats_qdep = 0;
2592 	info->dev = dev;
2593 	info->domain = domain;
2594 	info->iommu = iommu;
2595 	info->pasid_table = NULL;
2596 	info->auxd_enabled = 0;
2597 	INIT_LIST_HEAD(&info->auxiliary_domains);
2598 
2599 	if (dev && dev_is_pci(dev)) {
2600 		struct pci_dev *pdev = to_pci_dev(info->dev);
2601 
2602 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2603 		    pci_ats_supported(pdev) &&
2604 		    dmar_find_matched_atsr_unit(pdev))
2605 			info->ats_supported = 1;
2606 
2607 		if (sm_supported(iommu)) {
2608 			if (pasid_supported(iommu)) {
2609 				int features = pci_pasid_features(pdev);
2610 				if (features >= 0)
2611 					info->pasid_supported = features | 1;
2612 			}
2613 
2614 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615 			    pci_pri_supported(pdev))
2616 				info->pri_supported = 1;
2617 		}
2618 	}
2619 
2620 	spin_lock_irqsave(&device_domain_lock, flags);
2621 	if (dev)
2622 		found = find_domain(dev);
2623 
2624 	if (!found) {
2625 		struct device_domain_info *info2;
2626 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2627 						       info->devfn);
2628 		if (info2) {
2629 			found      = info2->domain;
2630 			info2->dev = dev;
2631 		}
2632 	}
2633 
2634 	if (found) {
2635 		spin_unlock_irqrestore(&device_domain_lock, flags);
2636 		free_devinfo_mem(info);
2637 		/* Caller must free the original domain */
2638 		return found;
2639 	}
2640 
2641 	spin_lock(&iommu->lock);
2642 	ret = domain_attach_iommu(domain, iommu);
2643 	spin_unlock(&iommu->lock);
2644 
2645 	if (ret) {
2646 		spin_unlock_irqrestore(&device_domain_lock, flags);
2647 		free_devinfo_mem(info);
2648 		return NULL;
2649 	}
2650 
2651 	list_add(&info->link, &domain->devices);
2652 	list_add(&info->global, &device_domain_list);
2653 	if (dev)
2654 		dev_iommu_priv_set(dev, info);
2655 	spin_unlock_irqrestore(&device_domain_lock, flags);
2656 
2657 	/* PASID table is mandatory for a PCI device in scalable mode. */
2658 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659 		ret = intel_pasid_alloc_table(dev);
2660 		if (ret) {
2661 			dev_err(dev, "PASID table allocation failed\n");
2662 			dmar_remove_one_dev_info(dev);
2663 			return NULL;
2664 		}
2665 
2666 		/* Setup the PASID entry for requests without PASID: */
2667 		spin_lock(&iommu->lock);
2668 		if (hw_pass_through && domain_type_is_si(domain))
2669 			ret = intel_pasid_setup_pass_through(iommu, domain,
2670 					dev, PASID_RID2PASID);
2671 		else if (domain_use_first_level(domain))
2672 			ret = domain_setup_first_level(iommu, domain, dev,
2673 					PASID_RID2PASID);
2674 		else
2675 			ret = intel_pasid_setup_second_level(iommu, domain,
2676 					dev, PASID_RID2PASID);
2677 		spin_unlock(&iommu->lock);
2678 		if (ret) {
2679 			dev_err(dev, "Setup RID2PASID failed\n");
2680 			dmar_remove_one_dev_info(dev);
2681 			return NULL;
2682 		}
2683 	}
2684 
2685 	if (dev && domain_context_mapping(domain, dev)) {
2686 		dev_err(dev, "Domain context map failed\n");
2687 		dmar_remove_one_dev_info(dev);
2688 		return NULL;
2689 	}
2690 
2691 	return domain;
2692 }
2693 
2694 static int iommu_domain_identity_map(struct dmar_domain *domain,
2695 				     unsigned long first_vpfn,
2696 				     unsigned long last_vpfn)
2697 {
2698 	/*
2699 	 * RMRR range might have overlap with physical memory range,
2700 	 * clear it first
2701 	 */
2702 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2703 
2704 	return __domain_mapping(domain, first_vpfn, NULL,
2705 				first_vpfn, last_vpfn - first_vpfn + 1,
2706 				DMA_PTE_READ|DMA_PTE_WRITE);
2707 }
2708 
2709 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2710 
2711 static int __init si_domain_init(int hw)
2712 {
2713 	struct dmar_rmrr_unit *rmrr;
2714 	struct device *dev;
2715 	int i, nid, ret;
2716 
2717 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2718 	if (!si_domain)
2719 		return -EFAULT;
2720 
2721 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722 		domain_exit(si_domain);
2723 		return -EFAULT;
2724 	}
2725 
2726 	if (hw)
2727 		return 0;
2728 
2729 	for_each_online_node(nid) {
2730 		unsigned long start_pfn, end_pfn;
2731 		int i;
2732 
2733 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734 			ret = iommu_domain_identity_map(si_domain,
2735 					mm_to_dma_pfn(start_pfn),
2736 					mm_to_dma_pfn(end_pfn));
2737 			if (ret)
2738 				return ret;
2739 		}
2740 	}
2741 
2742 	/*
2743 	 * Identity map the RMRRs so that devices with RMRRs could also use
2744 	 * the si_domain.
2745 	 */
2746 	for_each_rmrr_units(rmrr) {
2747 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2748 					  i, dev) {
2749 			unsigned long long start = rmrr->base_address;
2750 			unsigned long long end = rmrr->end_address;
2751 
2752 			if (WARN_ON(end < start ||
2753 				    end >> agaw_to_width(si_domain->agaw)))
2754 				continue;
2755 
2756 			ret = iommu_domain_identity_map(si_domain,
2757 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2758 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2759 			if (ret)
2760 				return ret;
2761 		}
2762 	}
2763 
2764 	return 0;
2765 }
2766 
2767 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2768 {
2769 	struct dmar_domain *ndomain;
2770 	struct intel_iommu *iommu;
2771 	u8 bus, devfn;
2772 
2773 	iommu = device_to_iommu(dev, &bus, &devfn);
2774 	if (!iommu)
2775 		return -ENODEV;
2776 
2777 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778 	if (ndomain != domain)
2779 		return -EBUSY;
2780 
2781 	return 0;
2782 }
2783 
2784 static bool device_has_rmrr(struct device *dev)
2785 {
2786 	struct dmar_rmrr_unit *rmrr;
2787 	struct device *tmp;
2788 	int i;
2789 
2790 	rcu_read_lock();
2791 	for_each_rmrr_units(rmrr) {
2792 		/*
2793 		 * Return TRUE if this RMRR contains the device that
2794 		 * is passed in.
2795 		 */
2796 		for_each_active_dev_scope(rmrr->devices,
2797 					  rmrr->devices_cnt, i, tmp)
2798 			if (tmp == dev ||
2799 			    is_downstream_to_pci_bridge(dev, tmp)) {
2800 				rcu_read_unlock();
2801 				return true;
2802 			}
2803 	}
2804 	rcu_read_unlock();
2805 	return false;
2806 }
2807 
2808 /**
2809  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810  * is relaxable (ie. is allowed to be not enforced under some conditions)
2811  * @dev: device handle
2812  *
2813  * We assume that PCI USB devices with RMRRs have them largely
2814  * for historical reasons and that the RMRR space is not actively used post
2815  * boot.  This exclusion may change if vendors begin to abuse it.
2816  *
2817  * The same exception is made for graphics devices, with the requirement that
2818  * any use of the RMRR regions will be torn down before assigning the device
2819  * to a guest.
2820  *
2821  * Return: true if the RMRR is relaxable, false otherwise
2822  */
2823 static bool device_rmrr_is_relaxable(struct device *dev)
2824 {
2825 	struct pci_dev *pdev;
2826 
2827 	if (!dev_is_pci(dev))
2828 		return false;
2829 
2830 	pdev = to_pci_dev(dev);
2831 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832 		return true;
2833 	else
2834 		return false;
2835 }
2836 
2837 /*
2838  * There are a couple cases where we need to restrict the functionality of
2839  * devices associated with RMRRs.  The first is when evaluating a device for
2840  * identity mapping because problems exist when devices are moved in and out
2841  * of domains and their respective RMRR information is lost.  This means that
2842  * a device with associated RMRRs will never be in a "passthrough" domain.
2843  * The second is use of the device through the IOMMU API.  This interface
2844  * expects to have full control of the IOVA space for the device.  We cannot
2845  * satisfy both the requirement that RMRR access is maintained and have an
2846  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2847  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848  * We therefore prevent devices associated with an RMRR from participating in
2849  * the IOMMU API, which eliminates them from device assignment.
2850  *
2851  * In both cases, devices which have relaxable RMRRs are not concerned by this
2852  * restriction. See device_rmrr_is_relaxable comment.
2853  */
2854 static bool device_is_rmrr_locked(struct device *dev)
2855 {
2856 	if (!device_has_rmrr(dev))
2857 		return false;
2858 
2859 	if (device_rmrr_is_relaxable(dev))
2860 		return false;
2861 
2862 	return true;
2863 }
2864 
2865 /*
2866  * Return the required default domain type for a specific device.
2867  *
2868  * @dev: the device in query
2869  * @startup: true if this is during early boot
2870  *
2871  * Returns:
2872  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2874  *  - 0: both identity and dynamic domains work for this device
2875  */
2876 static int device_def_domain_type(struct device *dev)
2877 {
2878 	if (dev_is_pci(dev)) {
2879 		struct pci_dev *pdev = to_pci_dev(dev);
2880 
2881 		/*
2882 		 * Prevent any device marked as untrusted from getting
2883 		 * placed into the statically identity mapping domain.
2884 		 */
2885 		if (pdev->untrusted)
2886 			return IOMMU_DOMAIN_DMA;
2887 
2888 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889 			return IOMMU_DOMAIN_IDENTITY;
2890 
2891 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892 			return IOMMU_DOMAIN_IDENTITY;
2893 	}
2894 
2895 	return 0;
2896 }
2897 
2898 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899 {
2900 	/*
2901 	 * Start from the sane iommu hardware state.
2902 	 * If the queued invalidation is already initialized by us
2903 	 * (for example, while enabling interrupt-remapping) then
2904 	 * we got the things already rolling from a sane state.
2905 	 */
2906 	if (!iommu->qi) {
2907 		/*
2908 		 * Clear any previous faults.
2909 		 */
2910 		dmar_fault(-1, iommu);
2911 		/*
2912 		 * Disable queued invalidation if supported and already enabled
2913 		 * before OS handover.
2914 		 */
2915 		dmar_disable_qi(iommu);
2916 	}
2917 
2918 	if (dmar_enable_qi(iommu)) {
2919 		/*
2920 		 * Queued Invalidate not enabled, use Register Based Invalidate
2921 		 */
2922 		iommu->flush.flush_context = __iommu_flush_context;
2923 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924 		pr_info("%s: Using Register based invalidation\n",
2925 			iommu->name);
2926 	} else {
2927 		iommu->flush.flush_context = qi_flush_context;
2928 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2929 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2930 	}
2931 }
2932 
2933 static int copy_context_table(struct intel_iommu *iommu,
2934 			      struct root_entry *old_re,
2935 			      struct context_entry **tbl,
2936 			      int bus, bool ext)
2937 {
2938 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939 	struct context_entry *new_ce = NULL, ce;
2940 	struct context_entry *old_ce = NULL;
2941 	struct root_entry re;
2942 	phys_addr_t old_ce_phys;
2943 
2944 	tbl_idx = ext ? bus * 2 : bus;
2945 	memcpy(&re, old_re, sizeof(re));
2946 
2947 	for (devfn = 0; devfn < 256; devfn++) {
2948 		/* First calculate the correct index */
2949 		idx = (ext ? devfn * 2 : devfn) % 256;
2950 
2951 		if (idx == 0) {
2952 			/* First save what we may have and clean up */
2953 			if (new_ce) {
2954 				tbl[tbl_idx] = new_ce;
2955 				__iommu_flush_cache(iommu, new_ce,
2956 						    VTD_PAGE_SIZE);
2957 				pos = 1;
2958 			}
2959 
2960 			if (old_ce)
2961 				memunmap(old_ce);
2962 
2963 			ret = 0;
2964 			if (devfn < 0x80)
2965 				old_ce_phys = root_entry_lctp(&re);
2966 			else
2967 				old_ce_phys = root_entry_uctp(&re);
2968 
2969 			if (!old_ce_phys) {
2970 				if (ext && devfn == 0) {
2971 					/* No LCTP, try UCTP */
2972 					devfn = 0x7f;
2973 					continue;
2974 				} else {
2975 					goto out;
2976 				}
2977 			}
2978 
2979 			ret = -ENOMEM;
2980 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981 					MEMREMAP_WB);
2982 			if (!old_ce)
2983 				goto out;
2984 
2985 			new_ce = alloc_pgtable_page(iommu->node);
2986 			if (!new_ce)
2987 				goto out_unmap;
2988 
2989 			ret = 0;
2990 		}
2991 
2992 		/* Now copy the context entry */
2993 		memcpy(&ce, old_ce + idx, sizeof(ce));
2994 
2995 		if (!__context_present(&ce))
2996 			continue;
2997 
2998 		did = context_domain_id(&ce);
2999 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3000 			set_bit(did, iommu->domain_ids);
3001 
3002 		/*
3003 		 * We need a marker for copied context entries. This
3004 		 * marker needs to work for the old format as well as
3005 		 * for extended context entries.
3006 		 *
3007 		 * Bit 67 of the context entry is used. In the old
3008 		 * format this bit is available to software, in the
3009 		 * extended format it is the PGE bit, but PGE is ignored
3010 		 * by HW if PASIDs are disabled (and thus still
3011 		 * available).
3012 		 *
3013 		 * So disable PASIDs first and then mark the entry
3014 		 * copied. This means that we don't copy PASID
3015 		 * translations from the old kernel, but this is fine as
3016 		 * faults there are not fatal.
3017 		 */
3018 		context_clear_pasid_enable(&ce);
3019 		context_set_copied(&ce);
3020 
3021 		new_ce[idx] = ce;
3022 	}
3023 
3024 	tbl[tbl_idx + pos] = new_ce;
3025 
3026 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027 
3028 out_unmap:
3029 	memunmap(old_ce);
3030 
3031 out:
3032 	return ret;
3033 }
3034 
3035 static int copy_translation_tables(struct intel_iommu *iommu)
3036 {
3037 	struct context_entry **ctxt_tbls;
3038 	struct root_entry *old_rt;
3039 	phys_addr_t old_rt_phys;
3040 	int ctxt_table_entries;
3041 	unsigned long flags;
3042 	u64 rtaddr_reg;
3043 	int bus, ret;
3044 	bool new_ext, ext;
3045 
3046 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048 	new_ext    = !!ecap_ecs(iommu->ecap);
3049 
3050 	/*
3051 	 * The RTT bit can only be changed when translation is disabled,
3052 	 * but disabling translation means to open a window for data
3053 	 * corruption. So bail out and don't copy anything if we would
3054 	 * have to change the bit.
3055 	 */
3056 	if (new_ext != ext)
3057 		return -EINVAL;
3058 
3059 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060 	if (!old_rt_phys)
3061 		return -EINVAL;
3062 
3063 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064 	if (!old_rt)
3065 		return -ENOMEM;
3066 
3067 	/* This is too big for the stack - allocate it from slab */
3068 	ctxt_table_entries = ext ? 512 : 256;
3069 	ret = -ENOMEM;
3070 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3071 	if (!ctxt_tbls)
3072 		goto out_unmap;
3073 
3074 	for (bus = 0; bus < 256; bus++) {
3075 		ret = copy_context_table(iommu, &old_rt[bus],
3076 					 ctxt_tbls, bus, ext);
3077 		if (ret) {
3078 			pr_err("%s: Failed to copy context table for bus %d\n",
3079 				iommu->name, bus);
3080 			continue;
3081 		}
3082 	}
3083 
3084 	spin_lock_irqsave(&iommu->lock, flags);
3085 
3086 	/* Context tables are copied, now write them to the root_entry table */
3087 	for (bus = 0; bus < 256; bus++) {
3088 		int idx = ext ? bus * 2 : bus;
3089 		u64 val;
3090 
3091 		if (ctxt_tbls[idx]) {
3092 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093 			iommu->root_entry[bus].lo = val;
3094 		}
3095 
3096 		if (!ext || !ctxt_tbls[idx + 1])
3097 			continue;
3098 
3099 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100 		iommu->root_entry[bus].hi = val;
3101 	}
3102 
3103 	spin_unlock_irqrestore(&iommu->lock, flags);
3104 
3105 	kfree(ctxt_tbls);
3106 
3107 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108 
3109 	ret = 0;
3110 
3111 out_unmap:
3112 	memunmap(old_rt);
3113 
3114 	return ret;
3115 }
3116 
3117 #ifdef CONFIG_INTEL_IOMMU_SVM
3118 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3119 {
3120 	struct intel_iommu *iommu = data;
3121 	ioasid_t ioasid;
3122 
3123 	if (!iommu)
3124 		return INVALID_IOASID;
3125 	/*
3126 	 * VT-d virtual command interface always uses the full 20 bit
3127 	 * PASID range. Host can partition guest PASID range based on
3128 	 * policies but it is out of guest's control.
3129 	 */
3130 	if (min < PASID_MIN || max > intel_pasid_max_id)
3131 		return INVALID_IOASID;
3132 
3133 	if (vcmd_alloc_pasid(iommu, &ioasid))
3134 		return INVALID_IOASID;
3135 
3136 	return ioasid;
3137 }
3138 
3139 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3140 {
3141 	struct intel_iommu *iommu = data;
3142 
3143 	if (!iommu)
3144 		return;
3145 	/*
3146 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3147 	 * We can only free the PASID when all the devices are unbound.
3148 	 */
3149 	if (ioasid_find(NULL, ioasid, NULL)) {
3150 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3151 		return;
3152 	}
3153 	vcmd_free_pasid(iommu, ioasid);
3154 }
3155 
3156 static void register_pasid_allocator(struct intel_iommu *iommu)
3157 {
3158 	/*
3159 	 * If we are running in the host, no need for custom allocator
3160 	 * in that PASIDs are allocated from the host system-wide.
3161 	 */
3162 	if (!cap_caching_mode(iommu->cap))
3163 		return;
3164 
3165 	if (!sm_supported(iommu)) {
3166 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3167 		return;
3168 	}
3169 
3170 	/*
3171 	 * Register a custom PASID allocator if we are running in a guest,
3172 	 * guest PASID must be obtained via virtual command interface.
3173 	 * There can be multiple vIOMMUs in each guest but only one allocator
3174 	 * is active. All vIOMMU allocators will eventually be calling the same
3175 	 * host allocator.
3176 	 */
3177 	if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3178 		return;
3179 
3180 	pr_info("Register custom PASID allocator\n");
3181 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183 	iommu->pasid_allocator.pdata = (void *)iommu;
3184 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3186 		/*
3187 		 * Disable scalable mode on this IOMMU if there
3188 		 * is no custom allocator. Mixing SM capable vIOMMU
3189 		 * and non-SM vIOMMU are not supported.
3190 		 */
3191 		intel_iommu_sm = 0;
3192 	}
3193 }
3194 #endif
3195 
3196 static int __init init_dmars(void)
3197 {
3198 	struct dmar_drhd_unit *drhd;
3199 	struct intel_iommu *iommu;
3200 	int ret;
3201 
3202 	/*
3203 	 * for each drhd
3204 	 *    allocate root
3205 	 *    initialize and program root entry to not present
3206 	 * endfor
3207 	 */
3208 	for_each_drhd_unit(drhd) {
3209 		/*
3210 		 * lock not needed as this is only incremented in the single
3211 		 * threaded kernel __init code path all other access are read
3212 		 * only
3213 		 */
3214 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3215 			g_num_of_iommus++;
3216 			continue;
3217 		}
3218 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3219 	}
3220 
3221 	/* Preallocate enough resources for IOMMU hot-addition */
3222 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224 
3225 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3226 			GFP_KERNEL);
3227 	if (!g_iommus) {
3228 		pr_err("Allocating global iommu array failed\n");
3229 		ret = -ENOMEM;
3230 		goto error;
3231 	}
3232 
3233 	for_each_iommu(iommu, drhd) {
3234 		if (drhd->ignored) {
3235 			iommu_disable_translation(iommu);
3236 			continue;
3237 		}
3238 
3239 		/*
3240 		 * Find the max pasid size of all IOMMU's in the system.
3241 		 * We need to ensure the system pasid table is no bigger
3242 		 * than the smallest supported.
3243 		 */
3244 		if (pasid_supported(iommu)) {
3245 			u32 temp = 2 << ecap_pss(iommu->ecap);
3246 
3247 			intel_pasid_max_id = min_t(u32, temp,
3248 						   intel_pasid_max_id);
3249 		}
3250 
3251 		g_iommus[iommu->seq_id] = iommu;
3252 
3253 		intel_iommu_init_qi(iommu);
3254 
3255 		ret = iommu_init_domains(iommu);
3256 		if (ret)
3257 			goto free_iommu;
3258 
3259 		init_translation_status(iommu);
3260 
3261 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262 			iommu_disable_translation(iommu);
3263 			clear_translation_pre_enabled(iommu);
3264 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265 				iommu->name);
3266 		}
3267 
3268 		/*
3269 		 * TBD:
3270 		 * we could share the same root & context tables
3271 		 * among all IOMMU's. Need to Split it later.
3272 		 */
3273 		ret = iommu_alloc_root_entry(iommu);
3274 		if (ret)
3275 			goto free_iommu;
3276 
3277 		if (translation_pre_enabled(iommu)) {
3278 			pr_info("Translation already enabled - trying to copy translation structures\n");
3279 
3280 			ret = copy_translation_tables(iommu);
3281 			if (ret) {
3282 				/*
3283 				 * We found the IOMMU with translation
3284 				 * enabled - but failed to copy over the
3285 				 * old root-entry table. Try to proceed
3286 				 * by disabling translation now and
3287 				 * allocating a clean root-entry table.
3288 				 * This might cause DMAR faults, but
3289 				 * probably the dump will still succeed.
3290 				 */
3291 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292 				       iommu->name);
3293 				iommu_disable_translation(iommu);
3294 				clear_translation_pre_enabled(iommu);
3295 			} else {
3296 				pr_info("Copied translation tables from previous kernel for %s\n",
3297 					iommu->name);
3298 			}
3299 		}
3300 
3301 		if (!ecap_pass_through(iommu->ecap))
3302 			hw_pass_through = 0;
3303 		intel_svm_check(iommu);
3304 	}
3305 
3306 	/*
3307 	 * Now that qi is enabled on all iommus, set the root entry and flush
3308 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3309 	 * flush_context function will loop forever and the boot hangs.
3310 	 */
3311 	for_each_active_iommu(iommu, drhd) {
3312 		iommu_flush_write_buffer(iommu);
3313 #ifdef CONFIG_INTEL_IOMMU_SVM
3314 		register_pasid_allocator(iommu);
3315 #endif
3316 		iommu_set_root_entry(iommu);
3317 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3319 	}
3320 
3321 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322 	dmar_map_gfx = 0;
3323 #endif
3324 
3325 	if (!dmar_map_gfx)
3326 		iommu_identity_mapping |= IDENTMAP_GFX;
3327 
3328 	check_tylersburg_isoch();
3329 
3330 	ret = si_domain_init(hw_pass_through);
3331 	if (ret)
3332 		goto free_iommu;
3333 
3334 	/*
3335 	 * for each drhd
3336 	 *   enable fault log
3337 	 *   global invalidate context cache
3338 	 *   global invalidate iotlb
3339 	 *   enable translation
3340 	 */
3341 	for_each_iommu(iommu, drhd) {
3342 		if (drhd->ignored) {
3343 			/*
3344 			 * we always have to disable PMRs or DMA may fail on
3345 			 * this device
3346 			 */
3347 			if (force_on)
3348 				iommu_disable_protect_mem_regions(iommu);
3349 			continue;
3350 		}
3351 
3352 		iommu_flush_write_buffer(iommu);
3353 
3354 #ifdef CONFIG_INTEL_IOMMU_SVM
3355 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3356 			/*
3357 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3358 			 * could cause possible lock race condition.
3359 			 */
3360 			up_write(&dmar_global_lock);
3361 			ret = intel_svm_enable_prq(iommu);
3362 			down_write(&dmar_global_lock);
3363 			if (ret)
3364 				goto free_iommu;
3365 		}
3366 #endif
3367 		ret = dmar_set_interrupt(iommu);
3368 		if (ret)
3369 			goto free_iommu;
3370 	}
3371 
3372 	return 0;
3373 
3374 free_iommu:
3375 	for_each_active_iommu(iommu, drhd) {
3376 		disable_dmar_iommu(iommu);
3377 		free_dmar_iommu(iommu);
3378 	}
3379 
3380 	kfree(g_iommus);
3381 
3382 error:
3383 	return ret;
3384 }
3385 
3386 /* This takes a number of _MM_ pages, not VTD pages */
3387 static unsigned long intel_alloc_iova(struct device *dev,
3388 				     struct dmar_domain *domain,
3389 				     unsigned long nrpages, uint64_t dma_mask)
3390 {
3391 	unsigned long iova_pfn;
3392 
3393 	/*
3394 	 * Restrict dma_mask to the width that the iommu can handle.
3395 	 * First-level translation restricts the input-address to a
3396 	 * canonical address (i.e., address bits 63:N have the same
3397 	 * value as address bit [N-1], where N is 48-bits with 4-level
3398 	 * paging and 57-bits with 5-level paging). Hence, skip bit
3399 	 * [N-1].
3400 	 */
3401 	if (domain_use_first_level(domain))
3402 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3403 				 dma_mask);
3404 	else
3405 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3406 				 dma_mask);
3407 
3408 	/* Ensure we reserve the whole size-aligned region */
3409 	nrpages = __roundup_pow_of_two(nrpages);
3410 
3411 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3412 		/*
3413 		 * First try to allocate an io virtual address in
3414 		 * DMA_BIT_MASK(32) and if that fails then try allocating
3415 		 * from higher range
3416 		 */
3417 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3419 		if (iova_pfn)
3420 			return iova_pfn;
3421 	}
3422 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423 				   IOVA_PFN(dma_mask), true);
3424 	if (unlikely(!iova_pfn)) {
3425 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3426 			     nrpages);
3427 		return 0;
3428 	}
3429 
3430 	return iova_pfn;
3431 }
3432 
3433 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434 				     size_t size, int dir, u64 dma_mask)
3435 {
3436 	struct dmar_domain *domain;
3437 	phys_addr_t start_paddr;
3438 	unsigned long iova_pfn;
3439 	int prot = 0;
3440 	int ret;
3441 	struct intel_iommu *iommu;
3442 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3443 
3444 	BUG_ON(dir == DMA_NONE);
3445 
3446 	if (unlikely(attach_deferred(dev)))
3447 		do_deferred_attach(dev);
3448 
3449 	domain = find_domain(dev);
3450 	if (!domain)
3451 		return DMA_MAPPING_ERROR;
3452 
3453 	iommu = domain_get_iommu(domain);
3454 	size = aligned_nrpages(paddr, size);
3455 
3456 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3457 	if (!iova_pfn)
3458 		goto error;
3459 
3460 	/*
3461 	 * Check if DMAR supports zero-length reads on write only
3462 	 * mappings..
3463 	 */
3464 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3465 			!cap_zlr(iommu->cap))
3466 		prot |= DMA_PTE_READ;
3467 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468 		prot |= DMA_PTE_WRITE;
3469 	/*
3470 	 * paddr - (paddr + size) might be partial page, we should map the whole
3471 	 * page.  Note: if two part of one page are separately mapped, we
3472 	 * might have two guest_addr mapping to the same host paddr, but this
3473 	 * is not a big problem
3474 	 */
3475 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3477 	if (ret)
3478 		goto error;
3479 
3480 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481 	start_paddr += paddr & ~PAGE_MASK;
3482 
3483 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3484 
3485 	return start_paddr;
3486 
3487 error:
3488 	if (iova_pfn)
3489 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491 		size, (unsigned long long)paddr, dir);
3492 	return DMA_MAPPING_ERROR;
3493 }
3494 
3495 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496 				 unsigned long offset, size_t size,
3497 				 enum dma_data_direction dir,
3498 				 unsigned long attrs)
3499 {
3500 	return __intel_map_single(dev, page_to_phys(page) + offset,
3501 				  size, dir, *dev->dma_mask);
3502 }
3503 
3504 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505 				     size_t size, enum dma_data_direction dir,
3506 				     unsigned long attrs)
3507 {
3508 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3509 }
3510 
3511 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3512 {
3513 	struct dmar_domain *domain;
3514 	unsigned long start_pfn, last_pfn;
3515 	unsigned long nrpages;
3516 	unsigned long iova_pfn;
3517 	struct intel_iommu *iommu;
3518 	struct page *freelist;
3519 	struct pci_dev *pdev = NULL;
3520 
3521 	domain = find_domain(dev);
3522 	BUG_ON(!domain);
3523 
3524 	iommu = domain_get_iommu(domain);
3525 
3526 	iova_pfn = IOVA_PFN(dev_addr);
3527 
3528 	nrpages = aligned_nrpages(dev_addr, size);
3529 	start_pfn = mm_to_dma_pfn(iova_pfn);
3530 	last_pfn = start_pfn + nrpages - 1;
3531 
3532 	if (dev_is_pci(dev))
3533 		pdev = to_pci_dev(dev);
3534 
3535 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3536 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537 			!has_iova_flush_queue(&domain->iovad)) {
3538 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539 				      nrpages, !freelist, 0);
3540 		/* free iova */
3541 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542 		dma_free_pagelist(freelist);
3543 	} else {
3544 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3545 			   (unsigned long)freelist);
3546 		/*
3547 		 * queue up the release of the unmap to save the 1/6th of the
3548 		 * cpu used up by the iotlb flush operation...
3549 		 */
3550 	}
3551 
3552 	trace_unmap_single(dev, dev_addr, size);
3553 }
3554 
3555 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556 			     size_t size, enum dma_data_direction dir,
3557 			     unsigned long attrs)
3558 {
3559 	intel_unmap(dev, dev_addr, size);
3560 }
3561 
3562 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3564 {
3565 	intel_unmap(dev, dev_addr, size);
3566 }
3567 
3568 static void *intel_alloc_coherent(struct device *dev, size_t size,
3569 				  dma_addr_t *dma_handle, gfp_t flags,
3570 				  unsigned long attrs)
3571 {
3572 	struct page *page = NULL;
3573 	int order;
3574 
3575 	if (unlikely(attach_deferred(dev)))
3576 		do_deferred_attach(dev);
3577 
3578 	size = PAGE_ALIGN(size);
3579 	order = get_order(size);
3580 
3581 	if (gfpflags_allow_blocking(flags)) {
3582 		unsigned int count = size >> PAGE_SHIFT;
3583 
3584 		page = dma_alloc_from_contiguous(dev, count, order,
3585 						 flags & __GFP_NOWARN);
3586 	}
3587 
3588 	if (!page)
3589 		page = alloc_pages(flags, order);
3590 	if (!page)
3591 		return NULL;
3592 	memset(page_address(page), 0, size);
3593 
3594 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595 					 DMA_BIDIRECTIONAL,
3596 					 dev->coherent_dma_mask);
3597 	if (*dma_handle != DMA_MAPPING_ERROR)
3598 		return page_address(page);
3599 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600 		__free_pages(page, order);
3601 
3602 	return NULL;
3603 }
3604 
3605 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606 				dma_addr_t dma_handle, unsigned long attrs)
3607 {
3608 	int order;
3609 	struct page *page = virt_to_page(vaddr);
3610 
3611 	size = PAGE_ALIGN(size);
3612 	order = get_order(size);
3613 
3614 	intel_unmap(dev, dma_handle, size);
3615 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616 		__free_pages(page, order);
3617 }
3618 
3619 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620 			   int nelems, enum dma_data_direction dir,
3621 			   unsigned long attrs)
3622 {
3623 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624 	unsigned long nrpages = 0;
3625 	struct scatterlist *sg;
3626 	int i;
3627 
3628 	for_each_sg(sglist, sg, nelems, i) {
3629 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3630 	}
3631 
3632 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3633 
3634 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3635 }
3636 
3637 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638 			enum dma_data_direction dir, unsigned long attrs)
3639 {
3640 	int i;
3641 	struct dmar_domain *domain;
3642 	size_t size = 0;
3643 	int prot = 0;
3644 	unsigned long iova_pfn;
3645 	int ret;
3646 	struct scatterlist *sg;
3647 	unsigned long start_vpfn;
3648 	struct intel_iommu *iommu;
3649 
3650 	BUG_ON(dir == DMA_NONE);
3651 
3652 	if (unlikely(attach_deferred(dev)))
3653 		do_deferred_attach(dev);
3654 
3655 	domain = find_domain(dev);
3656 	if (!domain)
3657 		return 0;
3658 
3659 	iommu = domain_get_iommu(domain);
3660 
3661 	for_each_sg(sglist, sg, nelems, i)
3662 		size += aligned_nrpages(sg->offset, sg->length);
3663 
3664 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3665 				*dev->dma_mask);
3666 	if (!iova_pfn) {
3667 		sglist->dma_length = 0;
3668 		return 0;
3669 	}
3670 
3671 	/*
3672 	 * Check if DMAR supports zero-length reads on write only
3673 	 * mappings..
3674 	 */
3675 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3676 			!cap_zlr(iommu->cap))
3677 		prot |= DMA_PTE_READ;
3678 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679 		prot |= DMA_PTE_WRITE;
3680 
3681 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3682 
3683 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684 	if (unlikely(ret)) {
3685 		dma_pte_free_pagetable(domain, start_vpfn,
3686 				       start_vpfn + size - 1,
3687 				       agaw_to_level(domain->agaw) + 1);
3688 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3689 		return 0;
3690 	}
3691 
3692 	for_each_sg(sglist, sg, nelems, i)
3693 		trace_map_sg(dev, i + 1, nelems, sg);
3694 
3695 	return nelems;
3696 }
3697 
3698 static u64 intel_get_required_mask(struct device *dev)
3699 {
3700 	return DMA_BIT_MASK(32);
3701 }
3702 
3703 static const struct dma_map_ops intel_dma_ops = {
3704 	.alloc = intel_alloc_coherent,
3705 	.free = intel_free_coherent,
3706 	.map_sg = intel_map_sg,
3707 	.unmap_sg = intel_unmap_sg,
3708 	.map_page = intel_map_page,
3709 	.unmap_page = intel_unmap_page,
3710 	.map_resource = intel_map_resource,
3711 	.unmap_resource = intel_unmap_resource,
3712 	.dma_supported = dma_direct_supported,
3713 	.mmap = dma_common_mmap,
3714 	.get_sgtable = dma_common_get_sgtable,
3715 	.get_required_mask = intel_get_required_mask,
3716 };
3717 
3718 static void
3719 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720 		   enum dma_data_direction dir, enum dma_sync_target target)
3721 {
3722 	struct dmar_domain *domain;
3723 	phys_addr_t tlb_addr;
3724 
3725 	domain = find_domain(dev);
3726 	if (WARN_ON(!domain))
3727 		return;
3728 
3729 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730 	if (is_swiotlb_buffer(tlb_addr))
3731 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3732 }
3733 
3734 static dma_addr_t
3735 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736 		  enum dma_data_direction dir, unsigned long attrs,
3737 		  u64 dma_mask)
3738 {
3739 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740 	struct dmar_domain *domain;
3741 	struct intel_iommu *iommu;
3742 	unsigned long iova_pfn;
3743 	unsigned long nrpages;
3744 	phys_addr_t tlb_addr;
3745 	int prot = 0;
3746 	int ret;
3747 
3748 	if (unlikely(attach_deferred(dev)))
3749 		do_deferred_attach(dev);
3750 
3751 	domain = find_domain(dev);
3752 
3753 	if (WARN_ON(dir == DMA_NONE || !domain))
3754 		return DMA_MAPPING_ERROR;
3755 
3756 	iommu = domain_get_iommu(domain);
3757 	if (WARN_ON(!iommu))
3758 		return DMA_MAPPING_ERROR;
3759 
3760 	nrpages = aligned_nrpages(0, size);
3761 	iova_pfn = intel_alloc_iova(dev, domain,
3762 				    dma_to_mm_pfn(nrpages), dma_mask);
3763 	if (!iova_pfn)
3764 		return DMA_MAPPING_ERROR;
3765 
3766 	/*
3767 	 * Check if DMAR supports zero-length reads on write only
3768 	 * mappings..
3769 	 */
3770 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771 			!cap_zlr(iommu->cap))
3772 		prot |= DMA_PTE_READ;
3773 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774 		prot |= DMA_PTE_WRITE;
3775 
3776 	/*
3777 	 * If both the physical buffer start address and size are
3778 	 * page aligned, we don't need to use a bounce page.
3779 	 */
3780 	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781 		tlb_addr = swiotlb_tbl_map_single(dev,
3782 				__phys_to_dma(dev, io_tlb_start),
3783 				paddr, size, aligned_size, dir, attrs);
3784 		if (tlb_addr == DMA_MAPPING_ERROR) {
3785 			goto swiotlb_error;
3786 		} else {
3787 			/* Cleanup the padding area. */
3788 			void *padding_start = phys_to_virt(tlb_addr);
3789 			size_t padding_size = aligned_size;
3790 
3791 			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792 			    (dir == DMA_TO_DEVICE ||
3793 			     dir == DMA_BIDIRECTIONAL)) {
3794 				padding_start += size;
3795 				padding_size -= size;
3796 			}
3797 
3798 			memset(padding_start, 0, padding_size);
3799 		}
3800 	} else {
3801 		tlb_addr = paddr;
3802 	}
3803 
3804 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805 				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3806 	if (ret)
3807 		goto mapping_error;
3808 
3809 	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3810 
3811 	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3812 
3813 mapping_error:
3814 	if (is_swiotlb_buffer(tlb_addr))
3815 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816 					 aligned_size, dir, attrs);
3817 swiotlb_error:
3818 	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819 	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820 		size, (unsigned long long)paddr, dir);
3821 
3822 	return DMA_MAPPING_ERROR;
3823 }
3824 
3825 static void
3826 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827 		    enum dma_data_direction dir, unsigned long attrs)
3828 {
3829 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830 	struct dmar_domain *domain;
3831 	phys_addr_t tlb_addr;
3832 
3833 	domain = find_domain(dev);
3834 	if (WARN_ON(!domain))
3835 		return;
3836 
3837 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838 	if (WARN_ON(!tlb_addr))
3839 		return;
3840 
3841 	intel_unmap(dev, dev_addr, size);
3842 	if (is_swiotlb_buffer(tlb_addr))
3843 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844 					 aligned_size, dir, attrs);
3845 
3846 	trace_bounce_unmap_single(dev, dev_addr, size);
3847 }
3848 
3849 static dma_addr_t
3850 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3852 {
3853 	return bounce_map_single(dev, page_to_phys(page) + offset,
3854 				 size, dir, attrs, *dev->dma_mask);
3855 }
3856 
3857 static dma_addr_t
3858 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859 		    enum dma_data_direction dir, unsigned long attrs)
3860 {
3861 	return bounce_map_single(dev, phys_addr, size,
3862 				 dir, attrs, *dev->dma_mask);
3863 }
3864 
3865 static void
3866 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867 		  enum dma_data_direction dir, unsigned long attrs)
3868 {
3869 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3870 }
3871 
3872 static void
3873 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874 		      enum dma_data_direction dir, unsigned long attrs)
3875 {
3876 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3877 }
3878 
3879 static void
3880 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881 		enum dma_data_direction dir, unsigned long attrs)
3882 {
3883 	struct scatterlist *sg;
3884 	int i;
3885 
3886 	for_each_sg(sglist, sg, nelems, i)
3887 		bounce_unmap_page(dev, sg->dma_address,
3888 				  sg_dma_len(sg), dir, attrs);
3889 }
3890 
3891 static int
3892 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893 	      enum dma_data_direction dir, unsigned long attrs)
3894 {
3895 	int i;
3896 	struct scatterlist *sg;
3897 
3898 	for_each_sg(sglist, sg, nelems, i) {
3899 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900 						  sg->offset, sg->length,
3901 						  dir, attrs);
3902 		if (sg->dma_address == DMA_MAPPING_ERROR)
3903 			goto out_unmap;
3904 		sg_dma_len(sg) = sg->length;
3905 	}
3906 
3907 	for_each_sg(sglist, sg, nelems, i)
3908 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3909 
3910 	return nelems;
3911 
3912 out_unmap:
3913 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3914 	return 0;
3915 }
3916 
3917 static void
3918 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919 			   size_t size, enum dma_data_direction dir)
3920 {
3921 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3922 }
3923 
3924 static void
3925 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926 			      size_t size, enum dma_data_direction dir)
3927 {
3928 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3929 }
3930 
3931 static void
3932 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933 		       int nelems, enum dma_data_direction dir)
3934 {
3935 	struct scatterlist *sg;
3936 	int i;
3937 
3938 	for_each_sg(sglist, sg, nelems, i)
3939 		bounce_sync_single(dev, sg_dma_address(sg),
3940 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3941 }
3942 
3943 static void
3944 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945 			  int nelems, enum dma_data_direction dir)
3946 {
3947 	struct scatterlist *sg;
3948 	int i;
3949 
3950 	for_each_sg(sglist, sg, nelems, i)
3951 		bounce_sync_single(dev, sg_dma_address(sg),
3952 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3953 }
3954 
3955 static const struct dma_map_ops bounce_dma_ops = {
3956 	.alloc			= intel_alloc_coherent,
3957 	.free			= intel_free_coherent,
3958 	.map_sg			= bounce_map_sg,
3959 	.unmap_sg		= bounce_unmap_sg,
3960 	.map_page		= bounce_map_page,
3961 	.unmap_page		= bounce_unmap_page,
3962 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
3963 	.sync_single_for_device	= bounce_sync_single_for_device,
3964 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
3965 	.sync_sg_for_device	= bounce_sync_sg_for_device,
3966 	.map_resource		= bounce_map_resource,
3967 	.unmap_resource		= bounce_unmap_resource,
3968 	.dma_supported		= dma_direct_supported,
3969 };
3970 
3971 static inline int iommu_domain_cache_init(void)
3972 {
3973 	int ret = 0;
3974 
3975 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3976 					 sizeof(struct dmar_domain),
3977 					 0,
3978 					 SLAB_HWCACHE_ALIGN,
3979 
3980 					 NULL);
3981 	if (!iommu_domain_cache) {
3982 		pr_err("Couldn't create iommu_domain cache\n");
3983 		ret = -ENOMEM;
3984 	}
3985 
3986 	return ret;
3987 }
3988 
3989 static inline int iommu_devinfo_cache_init(void)
3990 {
3991 	int ret = 0;
3992 
3993 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994 					 sizeof(struct device_domain_info),
3995 					 0,
3996 					 SLAB_HWCACHE_ALIGN,
3997 					 NULL);
3998 	if (!iommu_devinfo_cache) {
3999 		pr_err("Couldn't create devinfo cache\n");
4000 		ret = -ENOMEM;
4001 	}
4002 
4003 	return ret;
4004 }
4005 
4006 static int __init iommu_init_mempool(void)
4007 {
4008 	int ret;
4009 	ret = iova_cache_get();
4010 	if (ret)
4011 		return ret;
4012 
4013 	ret = iommu_domain_cache_init();
4014 	if (ret)
4015 		goto domain_error;
4016 
4017 	ret = iommu_devinfo_cache_init();
4018 	if (!ret)
4019 		return ret;
4020 
4021 	kmem_cache_destroy(iommu_domain_cache);
4022 domain_error:
4023 	iova_cache_put();
4024 
4025 	return -ENOMEM;
4026 }
4027 
4028 static void __init iommu_exit_mempool(void)
4029 {
4030 	kmem_cache_destroy(iommu_devinfo_cache);
4031 	kmem_cache_destroy(iommu_domain_cache);
4032 	iova_cache_put();
4033 }
4034 
4035 static void __init init_no_remapping_devices(void)
4036 {
4037 	struct dmar_drhd_unit *drhd;
4038 	struct device *dev;
4039 	int i;
4040 
4041 	for_each_drhd_unit(drhd) {
4042 		if (!drhd->include_all) {
4043 			for_each_active_dev_scope(drhd->devices,
4044 						  drhd->devices_cnt, i, dev)
4045 				break;
4046 			/* ignore DMAR unit if no devices exist */
4047 			if (i == drhd->devices_cnt)
4048 				drhd->ignored = 1;
4049 		}
4050 	}
4051 
4052 	for_each_active_drhd_unit(drhd) {
4053 		if (drhd->include_all)
4054 			continue;
4055 
4056 		for_each_active_dev_scope(drhd->devices,
4057 					  drhd->devices_cnt, i, dev)
4058 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4059 				break;
4060 		if (i < drhd->devices_cnt)
4061 			continue;
4062 
4063 		/* This IOMMU has *only* gfx devices. Either bypass it or
4064 		   set the gfx_mapped flag, as appropriate */
4065 		drhd->gfx_dedicated = 1;
4066 		if (!dmar_map_gfx)
4067 			drhd->ignored = 1;
4068 	}
4069 }
4070 
4071 #ifdef CONFIG_SUSPEND
4072 static int init_iommu_hw(void)
4073 {
4074 	struct dmar_drhd_unit *drhd;
4075 	struct intel_iommu *iommu = NULL;
4076 
4077 	for_each_active_iommu(iommu, drhd)
4078 		if (iommu->qi)
4079 			dmar_reenable_qi(iommu);
4080 
4081 	for_each_iommu(iommu, drhd) {
4082 		if (drhd->ignored) {
4083 			/*
4084 			 * we always have to disable PMRs or DMA may fail on
4085 			 * this device
4086 			 */
4087 			if (force_on)
4088 				iommu_disable_protect_mem_regions(iommu);
4089 			continue;
4090 		}
4091 
4092 		iommu_flush_write_buffer(iommu);
4093 
4094 		iommu_set_root_entry(iommu);
4095 
4096 		iommu->flush.flush_context(iommu, 0, 0, 0,
4097 					   DMA_CCMD_GLOBAL_INVL);
4098 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099 		iommu_enable_translation(iommu);
4100 		iommu_disable_protect_mem_regions(iommu);
4101 	}
4102 
4103 	return 0;
4104 }
4105 
4106 static void iommu_flush_all(void)
4107 {
4108 	struct dmar_drhd_unit *drhd;
4109 	struct intel_iommu *iommu;
4110 
4111 	for_each_active_iommu(iommu, drhd) {
4112 		iommu->flush.flush_context(iommu, 0, 0, 0,
4113 					   DMA_CCMD_GLOBAL_INVL);
4114 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115 					 DMA_TLB_GLOBAL_FLUSH);
4116 	}
4117 }
4118 
4119 static int iommu_suspend(void)
4120 {
4121 	struct dmar_drhd_unit *drhd;
4122 	struct intel_iommu *iommu = NULL;
4123 	unsigned long flag;
4124 
4125 	for_each_active_iommu(iommu, drhd) {
4126 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4127 						 GFP_ATOMIC);
4128 		if (!iommu->iommu_state)
4129 			goto nomem;
4130 	}
4131 
4132 	iommu_flush_all();
4133 
4134 	for_each_active_iommu(iommu, drhd) {
4135 		iommu_disable_translation(iommu);
4136 
4137 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4138 
4139 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140 			readl(iommu->reg + DMAR_FECTL_REG);
4141 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142 			readl(iommu->reg + DMAR_FEDATA_REG);
4143 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144 			readl(iommu->reg + DMAR_FEADDR_REG);
4145 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146 			readl(iommu->reg + DMAR_FEUADDR_REG);
4147 
4148 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4149 	}
4150 	return 0;
4151 
4152 nomem:
4153 	for_each_active_iommu(iommu, drhd)
4154 		kfree(iommu->iommu_state);
4155 
4156 	return -ENOMEM;
4157 }
4158 
4159 static void iommu_resume(void)
4160 {
4161 	struct dmar_drhd_unit *drhd;
4162 	struct intel_iommu *iommu = NULL;
4163 	unsigned long flag;
4164 
4165 	if (init_iommu_hw()) {
4166 		if (force_on)
4167 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4168 		else
4169 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4170 		return;
4171 	}
4172 
4173 	for_each_active_iommu(iommu, drhd) {
4174 
4175 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4176 
4177 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178 			iommu->reg + DMAR_FECTL_REG);
4179 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180 			iommu->reg + DMAR_FEDATA_REG);
4181 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182 			iommu->reg + DMAR_FEADDR_REG);
4183 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184 			iommu->reg + DMAR_FEUADDR_REG);
4185 
4186 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4187 	}
4188 
4189 	for_each_active_iommu(iommu, drhd)
4190 		kfree(iommu->iommu_state);
4191 }
4192 
4193 static struct syscore_ops iommu_syscore_ops = {
4194 	.resume		= iommu_resume,
4195 	.suspend	= iommu_suspend,
4196 };
4197 
4198 static void __init init_iommu_pm_ops(void)
4199 {
4200 	register_syscore_ops(&iommu_syscore_ops);
4201 }
4202 
4203 #else
4204 static inline void init_iommu_pm_ops(void) {}
4205 #endif	/* CONFIG_PM */
4206 
4207 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4208 {
4209 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211 	    rmrr->end_address <= rmrr->base_address ||
4212 	    arch_rmrr_sanity_check(rmrr))
4213 		return -EINVAL;
4214 
4215 	return 0;
4216 }
4217 
4218 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4219 {
4220 	struct acpi_dmar_reserved_memory *rmrr;
4221 	struct dmar_rmrr_unit *rmrru;
4222 
4223 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4224 	if (rmrr_sanity_check(rmrr)) {
4225 		pr_warn(FW_BUG
4226 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228 			   rmrr->base_address, rmrr->end_address,
4229 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4230 			   dmi_get_system_info(DMI_BIOS_VERSION),
4231 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4232 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4233 	}
4234 
4235 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4236 	if (!rmrru)
4237 		goto out;
4238 
4239 	rmrru->hdr = header;
4240 
4241 	rmrru->base_address = rmrr->base_address;
4242 	rmrru->end_address = rmrr->end_address;
4243 
4244 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245 				((void *)rmrr) + rmrr->header.length,
4246 				&rmrru->devices_cnt);
4247 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4248 		goto free_rmrru;
4249 
4250 	list_add(&rmrru->list, &dmar_rmrr_units);
4251 
4252 	return 0;
4253 free_rmrru:
4254 	kfree(rmrru);
4255 out:
4256 	return -ENOMEM;
4257 }
4258 
4259 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4260 {
4261 	struct dmar_atsr_unit *atsru;
4262 	struct acpi_dmar_atsr *tmp;
4263 
4264 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4265 				dmar_rcu_check()) {
4266 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267 		if (atsr->segment != tmp->segment)
4268 			continue;
4269 		if (atsr->header.length != tmp->header.length)
4270 			continue;
4271 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4272 			return atsru;
4273 	}
4274 
4275 	return NULL;
4276 }
4277 
4278 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279 {
4280 	struct acpi_dmar_atsr *atsr;
4281 	struct dmar_atsr_unit *atsru;
4282 
4283 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4284 		return 0;
4285 
4286 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287 	atsru = dmar_find_atsr(atsr);
4288 	if (atsru)
4289 		return 0;
4290 
4291 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4292 	if (!atsru)
4293 		return -ENOMEM;
4294 
4295 	/*
4296 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4297 	 * copy the memory content because the memory buffer will be freed
4298 	 * on return.
4299 	 */
4300 	atsru->hdr = (void *)(atsru + 1);
4301 	memcpy(atsru->hdr, hdr, hdr->length);
4302 	atsru->include_all = atsr->flags & 0x1;
4303 	if (!atsru->include_all) {
4304 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305 				(void *)atsr + atsr->header.length,
4306 				&atsru->devices_cnt);
4307 		if (atsru->devices_cnt && atsru->devices == NULL) {
4308 			kfree(atsru);
4309 			return -ENOMEM;
4310 		}
4311 	}
4312 
4313 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4314 
4315 	return 0;
4316 }
4317 
4318 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4319 {
4320 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4321 	kfree(atsru);
4322 }
4323 
4324 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325 {
4326 	struct acpi_dmar_atsr *atsr;
4327 	struct dmar_atsr_unit *atsru;
4328 
4329 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330 	atsru = dmar_find_atsr(atsr);
4331 	if (atsru) {
4332 		list_del_rcu(&atsru->list);
4333 		synchronize_rcu();
4334 		intel_iommu_free_atsr(atsru);
4335 	}
4336 
4337 	return 0;
4338 }
4339 
4340 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4341 {
4342 	int i;
4343 	struct device *dev;
4344 	struct acpi_dmar_atsr *atsr;
4345 	struct dmar_atsr_unit *atsru;
4346 
4347 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348 	atsru = dmar_find_atsr(atsr);
4349 	if (!atsru)
4350 		return 0;
4351 
4352 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4354 					  i, dev)
4355 			return -EBUSY;
4356 	}
4357 
4358 	return 0;
4359 }
4360 
4361 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4362 {
4363 	int sp, ret;
4364 	struct intel_iommu *iommu = dmaru->iommu;
4365 
4366 	if (g_iommus[iommu->seq_id])
4367 		return 0;
4368 
4369 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370 		pr_warn("%s: Doesn't support hardware pass through.\n",
4371 			iommu->name);
4372 		return -ENXIO;
4373 	}
4374 	if (!ecap_sc_support(iommu->ecap) &&
4375 	    domain_update_iommu_snooping(iommu)) {
4376 		pr_warn("%s: Doesn't support snooping.\n",
4377 			iommu->name);
4378 		return -ENXIO;
4379 	}
4380 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382 		pr_warn("%s: Doesn't support large page.\n",
4383 			iommu->name);
4384 		return -ENXIO;
4385 	}
4386 
4387 	/*
4388 	 * Disable translation if already enabled prior to OS handover.
4389 	 */
4390 	if (iommu->gcmd & DMA_GCMD_TE)
4391 		iommu_disable_translation(iommu);
4392 
4393 	g_iommus[iommu->seq_id] = iommu;
4394 	ret = iommu_init_domains(iommu);
4395 	if (ret == 0)
4396 		ret = iommu_alloc_root_entry(iommu);
4397 	if (ret)
4398 		goto out;
4399 
4400 	intel_svm_check(iommu);
4401 
4402 	if (dmaru->ignored) {
4403 		/*
4404 		 * we always have to disable PMRs or DMA may fail on this device
4405 		 */
4406 		if (force_on)
4407 			iommu_disable_protect_mem_regions(iommu);
4408 		return 0;
4409 	}
4410 
4411 	intel_iommu_init_qi(iommu);
4412 	iommu_flush_write_buffer(iommu);
4413 
4414 #ifdef CONFIG_INTEL_IOMMU_SVM
4415 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416 		ret = intel_svm_enable_prq(iommu);
4417 		if (ret)
4418 			goto disable_iommu;
4419 	}
4420 #endif
4421 	ret = dmar_set_interrupt(iommu);
4422 	if (ret)
4423 		goto disable_iommu;
4424 
4425 	iommu_set_root_entry(iommu);
4426 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428 	iommu_enable_translation(iommu);
4429 
4430 	iommu_disable_protect_mem_regions(iommu);
4431 	return 0;
4432 
4433 disable_iommu:
4434 	disable_dmar_iommu(iommu);
4435 out:
4436 	free_dmar_iommu(iommu);
4437 	return ret;
4438 }
4439 
4440 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4441 {
4442 	int ret = 0;
4443 	struct intel_iommu *iommu = dmaru->iommu;
4444 
4445 	if (!intel_iommu_enabled)
4446 		return 0;
4447 	if (iommu == NULL)
4448 		return -EINVAL;
4449 
4450 	if (insert) {
4451 		ret = intel_iommu_add(dmaru);
4452 	} else {
4453 		disable_dmar_iommu(iommu);
4454 		free_dmar_iommu(iommu);
4455 	}
4456 
4457 	return ret;
4458 }
4459 
4460 static void intel_iommu_free_dmars(void)
4461 {
4462 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463 	struct dmar_atsr_unit *atsru, *atsr_n;
4464 
4465 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466 		list_del(&rmrru->list);
4467 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4468 		kfree(rmrru);
4469 	}
4470 
4471 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472 		list_del(&atsru->list);
4473 		intel_iommu_free_atsr(atsru);
4474 	}
4475 }
4476 
4477 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4478 {
4479 	int i, ret = 1;
4480 	struct pci_bus *bus;
4481 	struct pci_dev *bridge = NULL;
4482 	struct device *tmp;
4483 	struct acpi_dmar_atsr *atsr;
4484 	struct dmar_atsr_unit *atsru;
4485 
4486 	dev = pci_physfn(dev);
4487 	for (bus = dev->bus; bus; bus = bus->parent) {
4488 		bridge = bus->self;
4489 		/* If it's an integrated device, allow ATS */
4490 		if (!bridge)
4491 			return 1;
4492 		/* Connected via non-PCIe: no ATS */
4493 		if (!pci_is_pcie(bridge) ||
4494 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4495 			return 0;
4496 		/* If we found the root port, look it up in the ATSR */
4497 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4498 			break;
4499 	}
4500 
4501 	rcu_read_lock();
4502 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504 		if (atsr->segment != pci_domain_nr(dev->bus))
4505 			continue;
4506 
4507 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508 			if (tmp == &bridge->dev)
4509 				goto out;
4510 
4511 		if (atsru->include_all)
4512 			goto out;
4513 	}
4514 	ret = 0;
4515 out:
4516 	rcu_read_unlock();
4517 
4518 	return ret;
4519 }
4520 
4521 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4522 {
4523 	int ret;
4524 	struct dmar_rmrr_unit *rmrru;
4525 	struct dmar_atsr_unit *atsru;
4526 	struct acpi_dmar_atsr *atsr;
4527 	struct acpi_dmar_reserved_memory *rmrr;
4528 
4529 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4530 		return 0;
4531 
4532 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533 		rmrr = container_of(rmrru->hdr,
4534 				    struct acpi_dmar_reserved_memory, header);
4535 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537 				((void *)rmrr) + rmrr->header.length,
4538 				rmrr->segment, rmrru->devices,
4539 				rmrru->devices_cnt);
4540 			if (ret < 0)
4541 				return ret;
4542 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543 			dmar_remove_dev_scope(info, rmrr->segment,
4544 				rmrru->devices, rmrru->devices_cnt);
4545 		}
4546 	}
4547 
4548 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549 		if (atsru->include_all)
4550 			continue;
4551 
4552 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555 					(void *)atsr + atsr->header.length,
4556 					atsr->segment, atsru->devices,
4557 					atsru->devices_cnt);
4558 			if (ret > 0)
4559 				break;
4560 			else if (ret < 0)
4561 				return ret;
4562 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563 			if (dmar_remove_dev_scope(info, atsr->segment,
4564 					atsru->devices, atsru->devices_cnt))
4565 				break;
4566 		}
4567 	}
4568 
4569 	return 0;
4570 }
4571 
4572 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573 				       unsigned long val, void *v)
4574 {
4575 	struct memory_notify *mhp = v;
4576 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4578 			mhp->nr_pages - 1);
4579 
4580 	switch (val) {
4581 	case MEM_GOING_ONLINE:
4582 		if (iommu_domain_identity_map(si_domain,
4583 					      start_vpfn, last_vpfn)) {
4584 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585 				start_vpfn, last_vpfn);
4586 			return NOTIFY_BAD;
4587 		}
4588 		break;
4589 
4590 	case MEM_OFFLINE:
4591 	case MEM_CANCEL_ONLINE:
4592 		{
4593 			struct dmar_drhd_unit *drhd;
4594 			struct intel_iommu *iommu;
4595 			struct page *freelist;
4596 
4597 			freelist = domain_unmap(si_domain,
4598 						start_vpfn, last_vpfn);
4599 
4600 			rcu_read_lock();
4601 			for_each_active_iommu(iommu, drhd)
4602 				iommu_flush_iotlb_psi(iommu, si_domain,
4603 					start_vpfn, mhp->nr_pages,
4604 					!freelist, 0);
4605 			rcu_read_unlock();
4606 			dma_free_pagelist(freelist);
4607 		}
4608 		break;
4609 	}
4610 
4611 	return NOTIFY_OK;
4612 }
4613 
4614 static struct notifier_block intel_iommu_memory_nb = {
4615 	.notifier_call = intel_iommu_memory_notifier,
4616 	.priority = 0
4617 };
4618 
4619 static void free_all_cpu_cached_iovas(unsigned int cpu)
4620 {
4621 	int i;
4622 
4623 	for (i = 0; i < g_num_of_iommus; i++) {
4624 		struct intel_iommu *iommu = g_iommus[i];
4625 		struct dmar_domain *domain;
4626 		int did;
4627 
4628 		if (!iommu)
4629 			continue;
4630 
4631 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632 			domain = get_iommu_domain(iommu, (u16)did);
4633 
4634 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4635 				continue;
4636 
4637 			free_cpu_cached_iovas(cpu, &domain->iovad);
4638 		}
4639 	}
4640 }
4641 
4642 static int intel_iommu_cpu_dead(unsigned int cpu)
4643 {
4644 	free_all_cpu_cached_iovas(cpu);
4645 	return 0;
4646 }
4647 
4648 static void intel_disable_iommus(void)
4649 {
4650 	struct intel_iommu *iommu = NULL;
4651 	struct dmar_drhd_unit *drhd;
4652 
4653 	for_each_iommu(iommu, drhd)
4654 		iommu_disable_translation(iommu);
4655 }
4656 
4657 void intel_iommu_shutdown(void)
4658 {
4659 	struct dmar_drhd_unit *drhd;
4660 	struct intel_iommu *iommu = NULL;
4661 
4662 	if (no_iommu || dmar_disabled)
4663 		return;
4664 
4665 	down_write(&dmar_global_lock);
4666 
4667 	/* Disable PMRs explicitly here. */
4668 	for_each_iommu(iommu, drhd)
4669 		iommu_disable_protect_mem_regions(iommu);
4670 
4671 	/* Make sure the IOMMUs are switched off */
4672 	intel_disable_iommus();
4673 
4674 	up_write(&dmar_global_lock);
4675 }
4676 
4677 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4678 {
4679 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4680 
4681 	return container_of(iommu_dev, struct intel_iommu, iommu);
4682 }
4683 
4684 static ssize_t intel_iommu_show_version(struct device *dev,
4685 					struct device_attribute *attr,
4686 					char *buf)
4687 {
4688 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690 	return sprintf(buf, "%d:%d\n",
4691 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4692 }
4693 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4694 
4695 static ssize_t intel_iommu_show_address(struct device *dev,
4696 					struct device_attribute *attr,
4697 					char *buf)
4698 {
4699 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4701 }
4702 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4703 
4704 static ssize_t intel_iommu_show_cap(struct device *dev,
4705 				    struct device_attribute *attr,
4706 				    char *buf)
4707 {
4708 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709 	return sprintf(buf, "%llx\n", iommu->cap);
4710 }
4711 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4712 
4713 static ssize_t intel_iommu_show_ecap(struct device *dev,
4714 				    struct device_attribute *attr,
4715 				    char *buf)
4716 {
4717 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 	return sprintf(buf, "%llx\n", iommu->ecap);
4719 }
4720 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4721 
4722 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723 				      struct device_attribute *attr,
4724 				      char *buf)
4725 {
4726 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4728 }
4729 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4730 
4731 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732 					   struct device_attribute *attr,
4733 					   char *buf)
4734 {
4735 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737 						  cap_ndoms(iommu->cap)));
4738 }
4739 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4740 
4741 static struct attribute *intel_iommu_attrs[] = {
4742 	&dev_attr_version.attr,
4743 	&dev_attr_address.attr,
4744 	&dev_attr_cap.attr,
4745 	&dev_attr_ecap.attr,
4746 	&dev_attr_domains_supported.attr,
4747 	&dev_attr_domains_used.attr,
4748 	NULL,
4749 };
4750 
4751 static struct attribute_group intel_iommu_group = {
4752 	.name = "intel-iommu",
4753 	.attrs = intel_iommu_attrs,
4754 };
4755 
4756 const struct attribute_group *intel_iommu_groups[] = {
4757 	&intel_iommu_group,
4758 	NULL,
4759 };
4760 
4761 static inline bool has_external_pci(void)
4762 {
4763 	struct pci_dev *pdev = NULL;
4764 
4765 	for_each_pci_dev(pdev)
4766 		if (pdev->external_facing)
4767 			return true;
4768 
4769 	return false;
4770 }
4771 
4772 static int __init platform_optin_force_iommu(void)
4773 {
4774 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4775 		return 0;
4776 
4777 	if (no_iommu || dmar_disabled)
4778 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4779 
4780 	/*
4781 	 * If Intel-IOMMU is disabled by default, we will apply identity
4782 	 * map for all devices except those marked as being untrusted.
4783 	 */
4784 	if (dmar_disabled)
4785 		iommu_set_default_passthrough(false);
4786 
4787 	dmar_disabled = 0;
4788 	no_iommu = 0;
4789 
4790 	return 1;
4791 }
4792 
4793 static int __init probe_acpi_namespace_devices(void)
4794 {
4795 	struct dmar_drhd_unit *drhd;
4796 	/* To avoid a -Wunused-but-set-variable warning. */
4797 	struct intel_iommu *iommu __maybe_unused;
4798 	struct device *dev;
4799 	int i, ret = 0;
4800 
4801 	for_each_active_iommu(iommu, drhd) {
4802 		for_each_active_dev_scope(drhd->devices,
4803 					  drhd->devices_cnt, i, dev) {
4804 			struct acpi_device_physical_node *pn;
4805 			struct iommu_group *group;
4806 			struct acpi_device *adev;
4807 
4808 			if (dev->bus != &acpi_bus_type)
4809 				continue;
4810 
4811 			adev = to_acpi_device(dev);
4812 			mutex_lock(&adev->physical_node_lock);
4813 			list_for_each_entry(pn,
4814 					    &adev->physical_node_list, node) {
4815 				group = iommu_group_get(pn->dev);
4816 				if (group) {
4817 					iommu_group_put(group);
4818 					continue;
4819 				}
4820 
4821 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822 				ret = iommu_probe_device(pn->dev);
4823 				if (ret)
4824 					break;
4825 			}
4826 			mutex_unlock(&adev->physical_node_lock);
4827 
4828 			if (ret)
4829 				return ret;
4830 		}
4831 	}
4832 
4833 	return 0;
4834 }
4835 
4836 int __init intel_iommu_init(void)
4837 {
4838 	int ret = -ENODEV;
4839 	struct dmar_drhd_unit *drhd;
4840 	struct intel_iommu *iommu;
4841 
4842 	/*
4843 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4844 	 * opt in, so enforce that.
4845 	 */
4846 	force_on = tboot_force_iommu() || platform_optin_force_iommu();
4847 
4848 	if (iommu_init_mempool()) {
4849 		if (force_on)
4850 			panic("tboot: Failed to initialize iommu memory\n");
4851 		return -ENOMEM;
4852 	}
4853 
4854 	down_write(&dmar_global_lock);
4855 	if (dmar_table_init()) {
4856 		if (force_on)
4857 			panic("tboot: Failed to initialize DMAR table\n");
4858 		goto out_free_dmar;
4859 	}
4860 
4861 	if (dmar_dev_scope_init() < 0) {
4862 		if (force_on)
4863 			panic("tboot: Failed to initialize DMAR device scope\n");
4864 		goto out_free_dmar;
4865 	}
4866 
4867 	up_write(&dmar_global_lock);
4868 
4869 	/*
4870 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4871 	 * complain later when we register it under the lock.
4872 	 */
4873 	dmar_register_bus_notifier();
4874 
4875 	down_write(&dmar_global_lock);
4876 
4877 	if (!no_iommu)
4878 		intel_iommu_debugfs_init();
4879 
4880 	if (no_iommu || dmar_disabled) {
4881 		/*
4882 		 * We exit the function here to ensure IOMMU's remapping and
4883 		 * mempool aren't setup, which means that the IOMMU's PMRs
4884 		 * won't be disabled via the call to init_dmars(). So disable
4885 		 * it explicitly here. The PMRs were setup by tboot prior to
4886 		 * calling SENTER, but the kernel is expected to reset/tear
4887 		 * down the PMRs.
4888 		 */
4889 		if (intel_iommu_tboot_noforce) {
4890 			for_each_iommu(iommu, drhd)
4891 				iommu_disable_protect_mem_regions(iommu);
4892 		}
4893 
4894 		/*
4895 		 * Make sure the IOMMUs are switched off, even when we
4896 		 * boot into a kexec kernel and the previous kernel left
4897 		 * them enabled
4898 		 */
4899 		intel_disable_iommus();
4900 		goto out_free_dmar;
4901 	}
4902 
4903 	if (list_empty(&dmar_rmrr_units))
4904 		pr_info("No RMRR found\n");
4905 
4906 	if (list_empty(&dmar_atsr_units))
4907 		pr_info("No ATSR found\n");
4908 
4909 	if (dmar_init_reserved_ranges()) {
4910 		if (force_on)
4911 			panic("tboot: Failed to reserve iommu ranges\n");
4912 		goto out_free_reserved_range;
4913 	}
4914 
4915 	if (dmar_map_gfx)
4916 		intel_iommu_gfx_mapped = 1;
4917 
4918 	init_no_remapping_devices();
4919 
4920 	ret = init_dmars();
4921 	if (ret) {
4922 		if (force_on)
4923 			panic("tboot: Failed to initialize DMARs\n");
4924 		pr_err("Initialization failed\n");
4925 		goto out_free_reserved_range;
4926 	}
4927 	up_write(&dmar_global_lock);
4928 
4929 	init_iommu_pm_ops();
4930 
4931 	down_read(&dmar_global_lock);
4932 	for_each_active_iommu(iommu, drhd) {
4933 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4934 				       intel_iommu_groups,
4935 				       "%s", iommu->name);
4936 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937 		iommu_device_register(&iommu->iommu);
4938 	}
4939 	up_read(&dmar_global_lock);
4940 
4941 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942 	if (si_domain && !hw_pass_through)
4943 		register_memory_notifier(&intel_iommu_memory_nb);
4944 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945 			  intel_iommu_cpu_dead);
4946 
4947 	down_read(&dmar_global_lock);
4948 	if (probe_acpi_namespace_devices())
4949 		pr_warn("ACPI name space devices didn't probe correctly\n");
4950 
4951 	/* Finally, we enable the DMA remapping hardware. */
4952 	for_each_iommu(iommu, drhd) {
4953 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4954 			iommu_enable_translation(iommu);
4955 
4956 		iommu_disable_protect_mem_regions(iommu);
4957 	}
4958 	up_read(&dmar_global_lock);
4959 
4960 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4961 
4962 	intel_iommu_enabled = 1;
4963 
4964 	return 0;
4965 
4966 out_free_reserved_range:
4967 	put_iova_domain(&reserved_iova_list);
4968 out_free_dmar:
4969 	intel_iommu_free_dmars();
4970 	up_write(&dmar_global_lock);
4971 	iommu_exit_mempool();
4972 	return ret;
4973 }
4974 
4975 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4976 {
4977 	struct intel_iommu *iommu = opaque;
4978 
4979 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4980 	return 0;
4981 }
4982 
4983 /*
4984  * NB - intel-iommu lacks any sort of reference counting for the users of
4985  * dependent devices.  If multiple endpoints have intersecting dependent
4986  * devices, unbinding the driver from any one of them will possibly leave
4987  * the others unable to operate.
4988  */
4989 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4990 {
4991 	if (!iommu || !dev || !dev_is_pci(dev))
4992 		return;
4993 
4994 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4995 }
4996 
4997 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4998 {
4999 	struct dmar_domain *domain;
5000 	struct intel_iommu *iommu;
5001 	unsigned long flags;
5002 
5003 	assert_spin_locked(&device_domain_lock);
5004 
5005 	if (WARN_ON(!info))
5006 		return;
5007 
5008 	iommu = info->iommu;
5009 	domain = info->domain;
5010 
5011 	if (info->dev) {
5012 		if (dev_is_pci(info->dev) && sm_supported(iommu))
5013 			intel_pasid_tear_down_entry(iommu, info->dev,
5014 					PASID_RID2PASID, false);
5015 
5016 		iommu_disable_dev_iotlb(info);
5017 		if (!dev_is_real_dma_subdevice(info->dev))
5018 			domain_context_clear(iommu, info->dev);
5019 		intel_pasid_free_table(info->dev);
5020 	}
5021 
5022 	unlink_domain_info(info);
5023 
5024 	spin_lock_irqsave(&iommu->lock, flags);
5025 	domain_detach_iommu(domain, iommu);
5026 	spin_unlock_irqrestore(&iommu->lock, flags);
5027 
5028 	free_devinfo_mem(info);
5029 }
5030 
5031 static void dmar_remove_one_dev_info(struct device *dev)
5032 {
5033 	struct device_domain_info *info;
5034 	unsigned long flags;
5035 
5036 	spin_lock_irqsave(&device_domain_lock, flags);
5037 	info = get_domain_info(dev);
5038 	if (info)
5039 		__dmar_remove_one_dev_info(info);
5040 	spin_unlock_irqrestore(&device_domain_lock, flags);
5041 }
5042 
5043 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5044 {
5045 	int adjust_width;
5046 
5047 	/* calculate AGAW */
5048 	domain->gaw = guest_width;
5049 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5050 	domain->agaw = width_to_agaw(adjust_width);
5051 
5052 	domain->iommu_coherency = 0;
5053 	domain->iommu_snooping = 0;
5054 	domain->iommu_superpage = 0;
5055 	domain->max_addr = 0;
5056 
5057 	/* always allocate the top pgd */
5058 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5059 	if (!domain->pgd)
5060 		return -ENOMEM;
5061 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5062 	return 0;
5063 }
5064 
5065 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5066 {
5067 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5069 
5070 	if (!intel_iommu_strict &&
5071 	    init_iova_flush_queue(&dmar_domain->iovad,
5072 				  iommu_flush_iova, iova_entry_free))
5073 		pr_info("iova flush queue initialization failed\n");
5074 }
5075 
5076 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5077 {
5078 	struct dmar_domain *dmar_domain;
5079 	struct iommu_domain *domain;
5080 
5081 	switch (type) {
5082 	case IOMMU_DOMAIN_DMA:
5083 	case IOMMU_DOMAIN_UNMANAGED:
5084 		dmar_domain = alloc_domain(0);
5085 		if (!dmar_domain) {
5086 			pr_err("Can't allocate dmar_domain\n");
5087 			return NULL;
5088 		}
5089 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090 			pr_err("Domain initialization failed\n");
5091 			domain_exit(dmar_domain);
5092 			return NULL;
5093 		}
5094 
5095 		if (type == IOMMU_DOMAIN_DMA)
5096 			intel_init_iova_domain(dmar_domain);
5097 
5098 		domain_update_iommu_cap(dmar_domain);
5099 
5100 		domain = &dmar_domain->domain;
5101 		domain->geometry.aperture_start = 0;
5102 		domain->geometry.aperture_end   =
5103 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104 		domain->geometry.force_aperture = true;
5105 
5106 		return domain;
5107 	case IOMMU_DOMAIN_IDENTITY:
5108 		return &si_domain->domain;
5109 	default:
5110 		return NULL;
5111 	}
5112 
5113 	return NULL;
5114 }
5115 
5116 static void intel_iommu_domain_free(struct iommu_domain *domain)
5117 {
5118 	if (domain != &si_domain->domain)
5119 		domain_exit(to_dmar_domain(domain));
5120 }
5121 
5122 /*
5123  * Check whether a @domain could be attached to the @dev through the
5124  * aux-domain attach/detach APIs.
5125  */
5126 static inline bool
5127 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5128 {
5129 	struct device_domain_info *info = get_domain_info(dev);
5130 
5131 	return info && info->auxd_enabled &&
5132 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5133 }
5134 
5135 static void auxiliary_link_device(struct dmar_domain *domain,
5136 				  struct device *dev)
5137 {
5138 	struct device_domain_info *info = get_domain_info(dev);
5139 
5140 	assert_spin_locked(&device_domain_lock);
5141 	if (WARN_ON(!info))
5142 		return;
5143 
5144 	domain->auxd_refcnt++;
5145 	list_add(&domain->auxd, &info->auxiliary_domains);
5146 }
5147 
5148 static void auxiliary_unlink_device(struct dmar_domain *domain,
5149 				    struct device *dev)
5150 {
5151 	struct device_domain_info *info = get_domain_info(dev);
5152 
5153 	assert_spin_locked(&device_domain_lock);
5154 	if (WARN_ON(!info))
5155 		return;
5156 
5157 	list_del(&domain->auxd);
5158 	domain->auxd_refcnt--;
5159 
5160 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161 		ioasid_free(domain->default_pasid);
5162 }
5163 
5164 static int aux_domain_add_dev(struct dmar_domain *domain,
5165 			      struct device *dev)
5166 {
5167 	int ret;
5168 	unsigned long flags;
5169 	struct intel_iommu *iommu;
5170 
5171 	iommu = device_to_iommu(dev, NULL, NULL);
5172 	if (!iommu)
5173 		return -ENODEV;
5174 
5175 	if (domain->default_pasid <= 0) {
5176 		int pasid;
5177 
5178 		/* No private data needed for the default pasid */
5179 		pasid = ioasid_alloc(NULL, PASID_MIN,
5180 				     pci_max_pasids(to_pci_dev(dev)) - 1,
5181 				     NULL);
5182 		if (pasid == INVALID_IOASID) {
5183 			pr_err("Can't allocate default pasid\n");
5184 			return -ENODEV;
5185 		}
5186 		domain->default_pasid = pasid;
5187 	}
5188 
5189 	spin_lock_irqsave(&device_domain_lock, flags);
5190 	/*
5191 	 * iommu->lock must be held to attach domain to iommu and setup the
5192 	 * pasid entry for second level translation.
5193 	 */
5194 	spin_lock(&iommu->lock);
5195 	ret = domain_attach_iommu(domain, iommu);
5196 	if (ret)
5197 		goto attach_failed;
5198 
5199 	/* Setup the PASID entry for mediated devices: */
5200 	if (domain_use_first_level(domain))
5201 		ret = domain_setup_first_level(iommu, domain, dev,
5202 					       domain->default_pasid);
5203 	else
5204 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205 						     domain->default_pasid);
5206 	if (ret)
5207 		goto table_failed;
5208 	spin_unlock(&iommu->lock);
5209 
5210 	auxiliary_link_device(domain, dev);
5211 
5212 	spin_unlock_irqrestore(&device_domain_lock, flags);
5213 
5214 	return 0;
5215 
5216 table_failed:
5217 	domain_detach_iommu(domain, iommu);
5218 attach_failed:
5219 	spin_unlock(&iommu->lock);
5220 	spin_unlock_irqrestore(&device_domain_lock, flags);
5221 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222 		ioasid_free(domain->default_pasid);
5223 
5224 	return ret;
5225 }
5226 
5227 static void aux_domain_remove_dev(struct dmar_domain *domain,
5228 				  struct device *dev)
5229 {
5230 	struct device_domain_info *info;
5231 	struct intel_iommu *iommu;
5232 	unsigned long flags;
5233 
5234 	if (!is_aux_domain(dev, &domain->domain))
5235 		return;
5236 
5237 	spin_lock_irqsave(&device_domain_lock, flags);
5238 	info = get_domain_info(dev);
5239 	iommu = info->iommu;
5240 
5241 	auxiliary_unlink_device(domain, dev);
5242 
5243 	spin_lock(&iommu->lock);
5244 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245 	domain_detach_iommu(domain, iommu);
5246 	spin_unlock(&iommu->lock);
5247 
5248 	spin_unlock_irqrestore(&device_domain_lock, flags);
5249 }
5250 
5251 static int prepare_domain_attach_device(struct iommu_domain *domain,
5252 					struct device *dev)
5253 {
5254 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255 	struct intel_iommu *iommu;
5256 	int addr_width;
5257 
5258 	iommu = device_to_iommu(dev, NULL, NULL);
5259 	if (!iommu)
5260 		return -ENODEV;
5261 
5262 	/* check if this iommu agaw is sufficient for max mapped address */
5263 	addr_width = agaw_to_width(iommu->agaw);
5264 	if (addr_width > cap_mgaw(iommu->cap))
5265 		addr_width = cap_mgaw(iommu->cap);
5266 
5267 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5268 		dev_err(dev, "%s: iommu width (%d) is not "
5269 		        "sufficient for the mapped address (%llx)\n",
5270 		        __func__, addr_width, dmar_domain->max_addr);
5271 		return -EFAULT;
5272 	}
5273 	dmar_domain->gaw = addr_width;
5274 
5275 	/*
5276 	 * Knock out extra levels of page tables if necessary
5277 	 */
5278 	while (iommu->agaw < dmar_domain->agaw) {
5279 		struct dma_pte *pte;
5280 
5281 		pte = dmar_domain->pgd;
5282 		if (dma_pte_present(pte)) {
5283 			dmar_domain->pgd = (struct dma_pte *)
5284 				phys_to_virt(dma_pte_addr(pte));
5285 			free_pgtable_page(pte);
5286 		}
5287 		dmar_domain->agaw--;
5288 	}
5289 
5290 	return 0;
5291 }
5292 
5293 static int intel_iommu_attach_device(struct iommu_domain *domain,
5294 				     struct device *dev)
5295 {
5296 	int ret;
5297 
5298 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299 	    device_is_rmrr_locked(dev)) {
5300 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5301 		return -EPERM;
5302 	}
5303 
5304 	if (is_aux_domain(dev, domain))
5305 		return -EPERM;
5306 
5307 	/* normally dev is not mapped */
5308 	if (unlikely(domain_context_mapped(dev))) {
5309 		struct dmar_domain *old_domain;
5310 
5311 		old_domain = find_domain(dev);
5312 		if (old_domain)
5313 			dmar_remove_one_dev_info(dev);
5314 	}
5315 
5316 	ret = prepare_domain_attach_device(domain, dev);
5317 	if (ret)
5318 		return ret;
5319 
5320 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5321 }
5322 
5323 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5324 					 struct device *dev)
5325 {
5326 	int ret;
5327 
5328 	if (!is_aux_domain(dev, domain))
5329 		return -EPERM;
5330 
5331 	ret = prepare_domain_attach_device(domain, dev);
5332 	if (ret)
5333 		return ret;
5334 
5335 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5336 }
5337 
5338 static void intel_iommu_detach_device(struct iommu_domain *domain,
5339 				      struct device *dev)
5340 {
5341 	dmar_remove_one_dev_info(dev);
5342 }
5343 
5344 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5345 					  struct device *dev)
5346 {
5347 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5348 }
5349 
5350 /*
5351  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352  * VT-d granularity. Invalidation is typically included in the unmap operation
5353  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5354  * owns the first level page tables. Invalidations of translation caches in the
5355  * guest are trapped and passed down to the host.
5356  *
5357  * vIOMMU in the guest will only expose first level page tables, therefore
5358  * we do not support IOTLB granularity for request without PASID (second level).
5359  *
5360  * For example, to find the VT-d granularity encoding for IOTLB
5361  * type and page selective granularity within PASID:
5362  * X: indexed by iommu cache type
5363  * Y: indexed by enum iommu_inv_granularity
5364  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5365  */
5366 
5367 static const int
5368 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5369 	/*
5370 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5371 	 * page selective (address granularity)
5372 	 */
5373 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374 	/* PASID based dev TLBs */
5375 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5376 	/* PASID cache */
5377 	{-EINVAL, -EINVAL, -EINVAL}
5378 };
5379 
5380 static inline int to_vtd_granularity(int type, int granu)
5381 {
5382 	return inv_type_granu_table[type][granu];
5383 }
5384 
5385 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5386 {
5387 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5388 
5389 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5390 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
5391 	 * granu size in contiguous memory.
5392 	 */
5393 	return order_base_2(nr_pages);
5394 }
5395 
5396 #ifdef CONFIG_INTEL_IOMMU_SVM
5397 static int
5398 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399 			   struct iommu_cache_invalidate_info *inv_info)
5400 {
5401 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402 	struct device_domain_info *info;
5403 	struct intel_iommu *iommu;
5404 	unsigned long flags;
5405 	int cache_type;
5406 	u8 bus, devfn;
5407 	u16 did, sid;
5408 	int ret = 0;
5409 	u64 size = 0;
5410 
5411 	if (!inv_info || !dmar_domain ||
5412 	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5413 		return -EINVAL;
5414 
5415 	if (!dev || !dev_is_pci(dev))
5416 		return -ENODEV;
5417 
5418 	iommu = device_to_iommu(dev, &bus, &devfn);
5419 	if (!iommu)
5420 		return -ENODEV;
5421 
5422 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5423 		return -EINVAL;
5424 
5425 	spin_lock_irqsave(&device_domain_lock, flags);
5426 	spin_lock(&iommu->lock);
5427 	info = get_domain_info(dev);
5428 	if (!info) {
5429 		ret = -EINVAL;
5430 		goto out_unlock;
5431 	}
5432 	did = dmar_domain->iommu_did[iommu->seq_id];
5433 	sid = PCI_DEVID(bus, devfn);
5434 
5435 	/* Size is only valid in address selective invalidation */
5436 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437 		size = to_vtd_size(inv_info->addr_info.granule_size,
5438 				   inv_info->addr_info.nb_granules);
5439 
5440 	for_each_set_bit(cache_type,
5441 			 (unsigned long *)&inv_info->cache,
5442 			 IOMMU_CACHE_INV_TYPE_NR) {
5443 		int granu = 0;
5444 		u64 pasid = 0;
5445 		u64 addr = 0;
5446 
5447 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448 		if (granu == -EINVAL) {
5449 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450 					   cache_type, inv_info->granularity);
5451 			break;
5452 		}
5453 
5454 		/*
5455 		 * PASID is stored in different locations based on the
5456 		 * granularity.
5457 		 */
5458 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459 		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460 			pasid = inv_info->pasid_info.pasid;
5461 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462 			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463 			pasid = inv_info->addr_info.pasid;
5464 
5465 		switch (BIT(cache_type)) {
5466 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5467 			/* HW will ignore LSB bits based on address mask */
5468 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5469 			    size &&
5470 			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472 						   inv_info->addr_info.addr, size);
5473 			}
5474 
5475 			/*
5476 			 * If granu is PASID-selective, address is ignored.
5477 			 * We use npages = -1 to indicate that.
5478 			 */
5479 			qi_flush_piotlb(iommu, did, pasid,
5480 					mm_to_dma_pfn(inv_info->addr_info.addr),
5481 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482 					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5483 
5484 			if (!info->ats_enabled)
5485 				break;
5486 			/*
5487 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5488 			 * in the guest may assume IOTLB flush is inclusive,
5489 			 * which is more efficient.
5490 			 */
5491 			fallthrough;
5492 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5493 			/*
5494 			 * PASID based device TLB invalidation does not support
5495 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5496 			 * IOMMU_INV_GRANU_ADDR.
5497 			 * The equivalent of that is we set the size to be the
5498 			 * entire range of 64 bit. User only provides PASID info
5499 			 * without address info. So we set addr to 0.
5500 			 */
5501 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502 				size = 64 - VTD_PAGE_SHIFT;
5503 				addr = 0;
5504 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505 				addr = inv_info->addr_info.addr;
5506 			}
5507 
5508 			if (info->ats_enabled)
5509 				qi_flush_dev_iotlb_pasid(iommu, sid,
5510 						info->pfsid, pasid,
5511 						info->ats_qdep, addr,
5512 						size);
5513 			else
5514 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5515 			break;
5516 		default:
5517 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5518 					    cache_type);
5519 			ret = -EINVAL;
5520 		}
5521 	}
5522 out_unlock:
5523 	spin_unlock(&iommu->lock);
5524 	spin_unlock_irqrestore(&device_domain_lock, flags);
5525 
5526 	return ret;
5527 }
5528 #endif
5529 
5530 static int intel_iommu_map(struct iommu_domain *domain,
5531 			   unsigned long iova, phys_addr_t hpa,
5532 			   size_t size, int iommu_prot, gfp_t gfp)
5533 {
5534 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535 	u64 max_addr;
5536 	int prot = 0;
5537 	int ret;
5538 
5539 	if (iommu_prot & IOMMU_READ)
5540 		prot |= DMA_PTE_READ;
5541 	if (iommu_prot & IOMMU_WRITE)
5542 		prot |= DMA_PTE_WRITE;
5543 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544 		prot |= DMA_PTE_SNP;
5545 
5546 	max_addr = iova + size;
5547 	if (dmar_domain->max_addr < max_addr) {
5548 		u64 end;
5549 
5550 		/* check if minimum agaw is sufficient for mapped address */
5551 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552 		if (end < max_addr) {
5553 			pr_err("%s: iommu width (%d) is not "
5554 			       "sufficient for the mapped address (%llx)\n",
5555 			       __func__, dmar_domain->gaw, max_addr);
5556 			return -EFAULT;
5557 		}
5558 		dmar_domain->max_addr = max_addr;
5559 	}
5560 	/* Round up size to next multiple of PAGE_SIZE, if it and
5561 	   the low bits of hpa would take us onto the next page */
5562 	size = aligned_nrpages(hpa, size);
5563 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5565 	return ret;
5566 }
5567 
5568 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569 				unsigned long iova, size_t size,
5570 				struct iommu_iotlb_gather *gather)
5571 {
5572 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573 	struct page *freelist = NULL;
5574 	unsigned long start_pfn, last_pfn;
5575 	unsigned int npages;
5576 	int iommu_id, level = 0;
5577 
5578 	/* Cope with horrid API which requires us to unmap more than the
5579 	   size argument if it happens to be a large-page mapping. */
5580 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5581 
5582 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5584 
5585 	start_pfn = iova >> VTD_PAGE_SHIFT;
5586 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5587 
5588 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5589 
5590 	npages = last_pfn - start_pfn + 1;
5591 
5592 	for_each_domain_iommu(iommu_id, dmar_domain)
5593 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594 				      start_pfn, npages, !freelist, 0);
5595 
5596 	dma_free_pagelist(freelist);
5597 
5598 	if (dmar_domain->max_addr == iova + size)
5599 		dmar_domain->max_addr = iova;
5600 
5601 	return size;
5602 }
5603 
5604 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5605 					    dma_addr_t iova)
5606 {
5607 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608 	struct dma_pte *pte;
5609 	int level = 0;
5610 	u64 phys = 0;
5611 
5612 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613 	if (pte && dma_pte_present(pte))
5614 		phys = dma_pte_addr(pte) +
5615 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5616 						VTD_PAGE_SHIFT) - 1));
5617 
5618 	return phys;
5619 }
5620 
5621 static inline bool scalable_mode_support(void)
5622 {
5623 	struct dmar_drhd_unit *drhd;
5624 	struct intel_iommu *iommu;
5625 	bool ret = true;
5626 
5627 	rcu_read_lock();
5628 	for_each_active_iommu(iommu, drhd) {
5629 		if (!sm_supported(iommu)) {
5630 			ret = false;
5631 			break;
5632 		}
5633 	}
5634 	rcu_read_unlock();
5635 
5636 	return ret;
5637 }
5638 
5639 static inline bool iommu_pasid_support(void)
5640 {
5641 	struct dmar_drhd_unit *drhd;
5642 	struct intel_iommu *iommu;
5643 	bool ret = true;
5644 
5645 	rcu_read_lock();
5646 	for_each_active_iommu(iommu, drhd) {
5647 		if (!pasid_supported(iommu)) {
5648 			ret = false;
5649 			break;
5650 		}
5651 	}
5652 	rcu_read_unlock();
5653 
5654 	return ret;
5655 }
5656 
5657 static inline bool nested_mode_support(void)
5658 {
5659 	struct dmar_drhd_unit *drhd;
5660 	struct intel_iommu *iommu;
5661 	bool ret = true;
5662 
5663 	rcu_read_lock();
5664 	for_each_active_iommu(iommu, drhd) {
5665 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5666 			ret = false;
5667 			break;
5668 		}
5669 	}
5670 	rcu_read_unlock();
5671 
5672 	return ret;
5673 }
5674 
5675 static bool intel_iommu_capable(enum iommu_cap cap)
5676 {
5677 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678 		return domain_update_iommu_snooping(NULL) == 1;
5679 	if (cap == IOMMU_CAP_INTR_REMAP)
5680 		return irq_remapping_enabled == 1;
5681 
5682 	return false;
5683 }
5684 
5685 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5686 {
5687 	struct intel_iommu *iommu;
5688 
5689 	iommu = device_to_iommu(dev, NULL, NULL);
5690 	if (!iommu)
5691 		return ERR_PTR(-ENODEV);
5692 
5693 	if (translation_pre_enabled(iommu))
5694 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5695 
5696 	return &iommu->iommu;
5697 }
5698 
5699 static void intel_iommu_release_device(struct device *dev)
5700 {
5701 	struct intel_iommu *iommu;
5702 
5703 	iommu = device_to_iommu(dev, NULL, NULL);
5704 	if (!iommu)
5705 		return;
5706 
5707 	dmar_remove_one_dev_info(dev);
5708 
5709 	set_dma_ops(dev, NULL);
5710 }
5711 
5712 static void intel_iommu_probe_finalize(struct device *dev)
5713 {
5714 	struct iommu_domain *domain;
5715 
5716 	domain = iommu_get_domain_for_dev(dev);
5717 	if (device_needs_bounce(dev))
5718 		set_dma_ops(dev, &bounce_dma_ops);
5719 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720 		set_dma_ops(dev, &intel_dma_ops);
5721 	else
5722 		set_dma_ops(dev, NULL);
5723 }
5724 
5725 static void intel_iommu_get_resv_regions(struct device *device,
5726 					 struct list_head *head)
5727 {
5728 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729 	struct iommu_resv_region *reg;
5730 	struct dmar_rmrr_unit *rmrr;
5731 	struct device *i_dev;
5732 	int i;
5733 
5734 	down_read(&dmar_global_lock);
5735 	for_each_rmrr_units(rmrr) {
5736 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5737 					  i, i_dev) {
5738 			struct iommu_resv_region *resv;
5739 			enum iommu_resv_type type;
5740 			size_t length;
5741 
5742 			if (i_dev != device &&
5743 			    !is_downstream_to_pci_bridge(device, i_dev))
5744 				continue;
5745 
5746 			length = rmrr->end_address - rmrr->base_address + 1;
5747 
5748 			type = device_rmrr_is_relaxable(device) ?
5749 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5750 
5751 			resv = iommu_alloc_resv_region(rmrr->base_address,
5752 						       length, prot, type);
5753 			if (!resv)
5754 				break;
5755 
5756 			list_add_tail(&resv->list, head);
5757 		}
5758 	}
5759 	up_read(&dmar_global_lock);
5760 
5761 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762 	if (dev_is_pci(device)) {
5763 		struct pci_dev *pdev = to_pci_dev(device);
5764 
5765 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767 						   IOMMU_RESV_DIRECT_RELAXABLE);
5768 			if (reg)
5769 				list_add_tail(&reg->list, head);
5770 		}
5771 	}
5772 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5773 
5774 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5776 				      0, IOMMU_RESV_MSI);
5777 	if (!reg)
5778 		return;
5779 	list_add_tail(&reg->list, head);
5780 }
5781 
5782 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5783 {
5784 	struct device_domain_info *info;
5785 	struct context_entry *context;
5786 	struct dmar_domain *domain;
5787 	unsigned long flags;
5788 	u64 ctx_lo;
5789 	int ret;
5790 
5791 	domain = find_domain(dev);
5792 	if (!domain)
5793 		return -EINVAL;
5794 
5795 	spin_lock_irqsave(&device_domain_lock, flags);
5796 	spin_lock(&iommu->lock);
5797 
5798 	ret = -EINVAL;
5799 	info = get_domain_info(dev);
5800 	if (!info || !info->pasid_supported)
5801 		goto out;
5802 
5803 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804 	if (WARN_ON(!context))
5805 		goto out;
5806 
5807 	ctx_lo = context[0].lo;
5808 
5809 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5810 		ctx_lo |= CONTEXT_PASIDE;
5811 		context[0].lo = ctx_lo;
5812 		wmb();
5813 		iommu->flush.flush_context(iommu,
5814 					   domain->iommu_did[iommu->seq_id],
5815 					   PCI_DEVID(info->bus, info->devfn),
5816 					   DMA_CCMD_MASK_NOBIT,
5817 					   DMA_CCMD_DEVICE_INVL);
5818 	}
5819 
5820 	/* Enable PASID support in the device, if it wasn't already */
5821 	if (!info->pasid_enabled)
5822 		iommu_enable_dev_iotlb(info);
5823 
5824 	ret = 0;
5825 
5826  out:
5827 	spin_unlock(&iommu->lock);
5828 	spin_unlock_irqrestore(&device_domain_lock, flags);
5829 
5830 	return ret;
5831 }
5832 
5833 static void intel_iommu_apply_resv_region(struct device *dev,
5834 					  struct iommu_domain *domain,
5835 					  struct iommu_resv_region *region)
5836 {
5837 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838 	unsigned long start, end;
5839 
5840 	start = IOVA_PFN(region->start);
5841 	end   = IOVA_PFN(region->start + region->length - 1);
5842 
5843 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5844 }
5845 
5846 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5847 {
5848 	if (dev_is_pci(dev))
5849 		return pci_device_group(dev);
5850 	return generic_device_group(dev);
5851 }
5852 
5853 static int intel_iommu_enable_auxd(struct device *dev)
5854 {
5855 	struct device_domain_info *info;
5856 	struct intel_iommu *iommu;
5857 	unsigned long flags;
5858 	int ret;
5859 
5860 	iommu = device_to_iommu(dev, NULL, NULL);
5861 	if (!iommu || dmar_disabled)
5862 		return -EINVAL;
5863 
5864 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5865 		return -EINVAL;
5866 
5867 	ret = intel_iommu_enable_pasid(iommu, dev);
5868 	if (ret)
5869 		return -ENODEV;
5870 
5871 	spin_lock_irqsave(&device_domain_lock, flags);
5872 	info = get_domain_info(dev);
5873 	info->auxd_enabled = 1;
5874 	spin_unlock_irqrestore(&device_domain_lock, flags);
5875 
5876 	return 0;
5877 }
5878 
5879 static int intel_iommu_disable_auxd(struct device *dev)
5880 {
5881 	struct device_domain_info *info;
5882 	unsigned long flags;
5883 
5884 	spin_lock_irqsave(&device_domain_lock, flags);
5885 	info = get_domain_info(dev);
5886 	if (!WARN_ON(!info))
5887 		info->auxd_enabled = 0;
5888 	spin_unlock_irqrestore(&device_domain_lock, flags);
5889 
5890 	return 0;
5891 }
5892 
5893 /*
5894  * A PCI express designated vendor specific extended capability is defined
5895  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5896  * for system software and tools to detect endpoint devices supporting the
5897  * Intel scalable IO virtualization without host driver dependency.
5898  *
5899  * Returns the address of the matching extended capability structure within
5900  * the device's PCI configuration space or 0 if the device does not support
5901  * it.
5902  */
5903 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5904 {
5905 	int pos;
5906 	u16 vendor, id;
5907 
5908 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5909 	while (pos) {
5910 		pci_read_config_word(pdev, pos + 4, &vendor);
5911 		pci_read_config_word(pdev, pos + 8, &id);
5912 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913 			return pos;
5914 
5915 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916 	}
5917 
5918 	return 0;
5919 }
5920 
5921 static bool
5922 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5923 {
5924 	if (feat == IOMMU_DEV_FEAT_AUX) {
5925 		int ret;
5926 
5927 		if (!dev_is_pci(dev) || dmar_disabled ||
5928 		    !scalable_mode_support() || !iommu_pasid_support())
5929 			return false;
5930 
5931 		ret = pci_pasid_features(to_pci_dev(dev));
5932 		if (ret < 0)
5933 			return false;
5934 
5935 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936 	}
5937 
5938 	if (feat == IOMMU_DEV_FEAT_SVA) {
5939 		struct device_domain_info *info = get_domain_info(dev);
5940 
5941 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942 			info->pasid_supported && info->pri_supported &&
5943 			info->ats_supported;
5944 	}
5945 
5946 	return false;
5947 }
5948 
5949 static int
5950 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5951 {
5952 	if (feat == IOMMU_DEV_FEAT_AUX)
5953 		return intel_iommu_enable_auxd(dev);
5954 
5955 	if (feat == IOMMU_DEV_FEAT_SVA) {
5956 		struct device_domain_info *info = get_domain_info(dev);
5957 
5958 		if (!info)
5959 			return -EINVAL;
5960 
5961 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5962 			return 0;
5963 	}
5964 
5965 	return -ENODEV;
5966 }
5967 
5968 static int
5969 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5970 {
5971 	if (feat == IOMMU_DEV_FEAT_AUX)
5972 		return intel_iommu_disable_auxd(dev);
5973 
5974 	return -ENODEV;
5975 }
5976 
5977 static bool
5978 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5979 {
5980 	struct device_domain_info *info = get_domain_info(dev);
5981 
5982 	if (feat == IOMMU_DEV_FEAT_AUX)
5983 		return scalable_mode_support() && info && info->auxd_enabled;
5984 
5985 	return false;
5986 }
5987 
5988 static int
5989 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5990 {
5991 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5992 
5993 	return dmar_domain->default_pasid > 0 ?
5994 			dmar_domain->default_pasid : -EINVAL;
5995 }
5996 
5997 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998 					   struct device *dev)
5999 {
6000 	return attach_deferred(dev);
6001 }
6002 
6003 static int
6004 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005 			    enum iommu_attr attr, void *data)
6006 {
6007 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008 	unsigned long flags;
6009 	int ret = 0;
6010 
6011 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6012 		return -EINVAL;
6013 
6014 	switch (attr) {
6015 	case DOMAIN_ATTR_NESTING:
6016 		spin_lock_irqsave(&device_domain_lock, flags);
6017 		if (nested_mode_support() &&
6018 		    list_empty(&dmar_domain->devices)) {
6019 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6020 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6021 		} else {
6022 			ret = -ENODEV;
6023 		}
6024 		spin_unlock_irqrestore(&device_domain_lock, flags);
6025 		break;
6026 	default:
6027 		ret = -EINVAL;
6028 		break;
6029 	}
6030 
6031 	return ret;
6032 }
6033 
6034 /*
6035  * Check that the device does not live on an external facing PCI port that is
6036  * marked as untrusted. Such devices should not be able to apply quirks and
6037  * thus not be able to bypass the IOMMU restrictions.
6038  */
6039 static bool risky_device(struct pci_dev *pdev)
6040 {
6041 	if (pdev->untrusted) {
6042 		pci_info(pdev,
6043 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6044 			 pdev->vendor, pdev->device);
6045 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6046 		return true;
6047 	}
6048 	return false;
6049 }
6050 
6051 const struct iommu_ops intel_iommu_ops = {
6052 	.capable		= intel_iommu_capable,
6053 	.domain_alloc		= intel_iommu_domain_alloc,
6054 	.domain_free		= intel_iommu_domain_free,
6055 	.domain_set_attr	= intel_iommu_domain_set_attr,
6056 	.attach_dev		= intel_iommu_attach_device,
6057 	.detach_dev		= intel_iommu_detach_device,
6058 	.aux_attach_dev		= intel_iommu_aux_attach_device,
6059 	.aux_detach_dev		= intel_iommu_aux_detach_device,
6060 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6061 	.map			= intel_iommu_map,
6062 	.unmap			= intel_iommu_unmap,
6063 	.iova_to_phys		= intel_iommu_iova_to_phys,
6064 	.probe_device		= intel_iommu_probe_device,
6065 	.probe_finalize		= intel_iommu_probe_finalize,
6066 	.release_device		= intel_iommu_release_device,
6067 	.get_resv_regions	= intel_iommu_get_resv_regions,
6068 	.put_resv_regions	= generic_iommu_put_resv_regions,
6069 	.apply_resv_region	= intel_iommu_apply_resv_region,
6070 	.device_group		= intel_iommu_device_group,
6071 	.dev_has_feat		= intel_iommu_dev_has_feat,
6072 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6073 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6074 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6075 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6076 	.def_domain_type	= device_def_domain_type,
6077 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6078 #ifdef CONFIG_INTEL_IOMMU_SVM
6079 	.cache_invalidate	= intel_iommu_sva_invalidate,
6080 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6081 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6082 	.sva_bind		= intel_svm_bind,
6083 	.sva_unbind		= intel_svm_unbind,
6084 	.sva_get_pasid		= intel_svm_get_pasid,
6085 	.page_response		= intel_svm_page_response,
6086 #endif
6087 };
6088 
6089 static void quirk_iommu_igfx(struct pci_dev *dev)
6090 {
6091 	if (risky_device(dev))
6092 		return;
6093 
6094 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6095 	dmar_map_gfx = 0;
6096 }
6097 
6098 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6106 
6107 /* Broadwell igfx malfunctions with dmar */
6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6129 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6130 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6131 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6132 
6133 static void quirk_iommu_rwbf(struct pci_dev *dev)
6134 {
6135 	if (risky_device(dev))
6136 		return;
6137 
6138 	/*
6139 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6140 	 * but needs it. Same seems to hold for the desktop versions.
6141 	 */
6142 	pci_info(dev, "Forcing write-buffer flush capability\n");
6143 	rwbf_quirk = 1;
6144 }
6145 
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6153 
6154 #define GGC 0x52
6155 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6156 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6157 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6158 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6159 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6160 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6161 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6162 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6163 
6164 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6165 {
6166 	unsigned short ggc;
6167 
6168 	if (risky_device(dev))
6169 		return;
6170 
6171 	if (pci_read_config_word(dev, GGC, &ggc))
6172 		return;
6173 
6174 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6175 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6176 		dmar_map_gfx = 0;
6177 	} else if (dmar_map_gfx) {
6178 		/* we have to ensure the gfx device is idle before we flush */
6179 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6180 		intel_iommu_strict = 1;
6181        }
6182 }
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6187 
6188 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6189 {
6190 	unsigned short ver;
6191 
6192 	if (!IS_GFX_DEVICE(dev))
6193 		return;
6194 
6195 	ver = (dev->device >> 8) & 0xff;
6196 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6197 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6198 	    ver != 0x9a)
6199 		return;
6200 
6201 	if (risky_device(dev))
6202 		return;
6203 
6204 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6205 	iommu_skip_te_disable = 1;
6206 }
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6208 
6209 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6210    ISOCH DMAR unit for the Azalia sound device, but not give it any
6211    TLB entries, which causes it to deadlock. Check for that.  We do
6212    this in a function called from init_dmars(), instead of in a PCI
6213    quirk, because we don't want to print the obnoxious "BIOS broken"
6214    message if VT-d is actually disabled.
6215 */
6216 static void __init check_tylersburg_isoch(void)
6217 {
6218 	struct pci_dev *pdev;
6219 	uint32_t vtisochctrl;
6220 
6221 	/* If there's no Azalia in the system anyway, forget it. */
6222 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6223 	if (!pdev)
6224 		return;
6225 
6226 	if (risky_device(pdev)) {
6227 		pci_dev_put(pdev);
6228 		return;
6229 	}
6230 
6231 	pci_dev_put(pdev);
6232 
6233 	/* System Management Registers. Might be hidden, in which case
6234 	   we can't do the sanity check. But that's OK, because the
6235 	   known-broken BIOSes _don't_ actually hide it, so far. */
6236 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6237 	if (!pdev)
6238 		return;
6239 
6240 	if (risky_device(pdev)) {
6241 		pci_dev_put(pdev);
6242 		return;
6243 	}
6244 
6245 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6246 		pci_dev_put(pdev);
6247 		return;
6248 	}
6249 
6250 	pci_dev_put(pdev);
6251 
6252 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6253 	if (vtisochctrl & 1)
6254 		return;
6255 
6256 	/* Drop all bits other than the number of TLB entries */
6257 	vtisochctrl &= 0x1c;
6258 
6259 	/* If we have the recommended number of TLB entries (16), fine. */
6260 	if (vtisochctrl == 0x10)
6261 		return;
6262 
6263 	/* Zero TLB entries? You get to ride the short bus to school. */
6264 	if (!vtisochctrl) {
6265 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6266 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6267 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6268 		     dmi_get_system_info(DMI_BIOS_VERSION),
6269 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6270 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6271 		return;
6272 	}
6273 
6274 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6275 	       vtisochctrl);
6276 }
6277