xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 3cea11cd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-map-ops.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
289 
290 /*
291  * This domain is a statically identity mapping domain.
292  *	1. This domain creats a static 1:1 mapping to all usable memory.
293  * 	2. It maps to each iommu if successful.
294  *	3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322 
323 #define for_each_rmrr_units(rmrr) \
324 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325 
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343 
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349 
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352 
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360 
361 #define IDENTMAP_GFX		2
362 #define IDENTMAP_AZALIA		4
363 
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 	struct device_domain_info *info;
371 
372 	if (!dev)
373 		return NULL;
374 
375 	info = dev_iommu_priv_get(dev);
376 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 		return NULL;
378 
379 	return info;
380 }
381 
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384 
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
386 				to_pci_dev(d)->untrusted)
387 
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 				     void *data), void *data)
394 {
395 	int ret = 0;
396 	unsigned long flags;
397 	struct device_domain_info *info;
398 
399 	spin_lock_irqsave(&device_domain_lock, flags);
400 	list_for_each_entry(info, &device_domain_list, global) {
401 		ret = fn(info, data);
402 		if (ret) {
403 			spin_unlock_irqrestore(&device_domain_lock, flags);
404 			return ret;
405 		}
406 	}
407 	spin_unlock_irqrestore(&device_domain_lock, flags);
408 
409 	return 0;
410 }
411 
412 const struct iommu_ops intel_iommu_ops;
413 
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418 
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423 
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426 	u32 gsts;
427 
428 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 	if (gsts & DMA_GSTS_TES)
430 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432 
433 static int __init intel_iommu_setup(char *str)
434 {
435 	if (!str)
436 		return -EINVAL;
437 	while (*str) {
438 		if (!strncmp(str, "on", 2)) {
439 			dmar_disabled = 0;
440 			pr_info("IOMMU enabled\n");
441 		} else if (!strncmp(str, "off", 3)) {
442 			dmar_disabled = 1;
443 			no_platform_optin = 1;
444 			pr_info("IOMMU disabled\n");
445 		} else if (!strncmp(str, "igfx_off", 8)) {
446 			dmar_map_gfx = 0;
447 			pr_info("Disable GFX device mapping\n");
448 		} else if (!strncmp(str, "forcedac", 8)) {
449 			pr_info("Forcing DAC for PCI devices\n");
450 			dmar_forcedac = 1;
451 		} else if (!strncmp(str, "strict", 6)) {
452 			pr_info("Disable batched IOTLB flush\n");
453 			intel_iommu_strict = 1;
454 		} else if (!strncmp(str, "sp_off", 6)) {
455 			pr_info("Disable supported super page\n");
456 			intel_iommu_superpage = 0;
457 		} else if (!strncmp(str, "sm_on", 5)) {
458 			pr_info("Intel-IOMMU: scalable mode supported\n");
459 			intel_iommu_sm = 1;
460 		} else if (!strncmp(str, "tboot_noforce", 13)) {
461 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 			intel_iommu_tboot_noforce = 1;
463 		} else if (!strncmp(str, "nobounce", 8)) {
464 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 			intel_no_bounce = 1;
466 		}
467 
468 		str += strcspn(str, ",");
469 		while (*str == ',')
470 			str++;
471 	}
472 	return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475 
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478 
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481 	struct dmar_domain **domains;
482 	int idx = did >> 8;
483 
484 	domains = iommu->domains[idx];
485 	if (!domains)
486 		return NULL;
487 
488 	return domains[did & 0xff];
489 }
490 
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 			     struct dmar_domain *domain)
493 {
494 	struct dmar_domain **domains;
495 	int idx = did >> 8;
496 
497 	if (!iommu->domains[idx]) {
498 		size_t size = 256 * sizeof(struct dmar_domain *);
499 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 	}
501 
502 	domains = iommu->domains[idx];
503 	if (WARN_ON(!domains))
504 		return;
505 	else
506 		domains[did & 0xff] = domain;
507 }
508 
509 void *alloc_pgtable_page(int node)
510 {
511 	struct page *page;
512 	void *vaddr = NULL;
513 
514 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 	if (page)
516 		vaddr = page_address(page);
517 	return vaddr;
518 }
519 
520 void free_pgtable_page(void *vaddr)
521 {
522 	free_page((unsigned long)vaddr);
523 }
524 
525 static inline void *alloc_domain_mem(void)
526 {
527 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529 
530 static void free_domain_mem(void *vaddr)
531 {
532 	kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534 
535 static inline void * alloc_devinfo_mem(void)
536 {
537 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539 
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542 	kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544 
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549 
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554 
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 				       unsigned long pfn)
557 {
558 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559 
560 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562 
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565 	unsigned long sagaw;
566 	int agaw = -1;
567 
568 	sagaw = cap_sagaw(iommu->cap);
569 	for (agaw = width_to_agaw(max_gaw);
570 	     agaw >= 0; agaw--) {
571 		if (test_bit(agaw, &sagaw))
572 			break;
573 	}
574 
575 	return agaw;
576 }
577 
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585 
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * get a supported less agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595 
596 /* This functionin only returns single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599 	int iommu_id;
600 
601 	/* si_domain and vm domain should not get here. */
602 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 		return NULL;
604 
605 	for_each_domain_iommu(iommu_id, domain)
606 		break;
607 
608 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 		return NULL;
610 
611 	return g_iommus[iommu_id];
612 }
613 
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616 	return sm_supported(iommu) ?
617 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619 
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622 	struct dmar_drhd_unit *drhd;
623 	struct intel_iommu *iommu;
624 	bool found = false;
625 	int i;
626 
627 	domain->iommu_coherency = 1;
628 
629 	for_each_domain_iommu(i, domain) {
630 		found = true;
631 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 			domain->iommu_coherency = 0;
633 			break;
634 		}
635 	}
636 	if (found)
637 		return;
638 
639 	/* No hardware attached; use lowest common denominator */
640 	rcu_read_lock();
641 	for_each_active_iommu(iommu, drhd) {
642 		if (!iommu_paging_structure_coherency(iommu)) {
643 			domain->iommu_coherency = 0;
644 			break;
645 		}
646 	}
647 	rcu_read_unlock();
648 }
649 
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652 	struct dmar_drhd_unit *drhd;
653 	struct intel_iommu *iommu;
654 	int ret = 1;
655 
656 	rcu_read_lock();
657 	for_each_active_iommu(iommu, drhd) {
658 		if (iommu != skip) {
659 			if (!ecap_sc_support(iommu->ecap)) {
660 				ret = 0;
661 				break;
662 			}
663 		}
664 	}
665 	rcu_read_unlock();
666 
667 	return ret;
668 }
669 
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 					 struct intel_iommu *skip)
672 {
673 	struct dmar_drhd_unit *drhd;
674 	struct intel_iommu *iommu;
675 	int mask = 0x3;
676 
677 	if (!intel_iommu_superpage) {
678 		return 0;
679 	}
680 
681 	/* set iommu_superpage to the smallest common denominator */
682 	rcu_read_lock();
683 	for_each_active_iommu(iommu, drhd) {
684 		if (iommu != skip) {
685 			if (domain && domain_use_first_level(domain)) {
686 				if (!cap_fl1gp_support(iommu->cap))
687 					mask = 0x1;
688 			} else {
689 				mask &= cap_super_page_val(iommu->cap);
690 			}
691 
692 			if (!mask)
693 				break;
694 		}
695 	}
696 	rcu_read_unlock();
697 
698 	return fls(mask);
699 }
700 
701 static int domain_update_device_node(struct dmar_domain *domain)
702 {
703 	struct device_domain_info *info;
704 	int nid = NUMA_NO_NODE;
705 
706 	assert_spin_locked(&device_domain_lock);
707 
708 	if (list_empty(&domain->devices))
709 		return NUMA_NO_NODE;
710 
711 	list_for_each_entry(info, &domain->devices, link) {
712 		if (!info->dev)
713 			continue;
714 
715 		/*
716 		 * There could possibly be multiple device numa nodes as devices
717 		 * within the same domain may sit behind different IOMMUs. There
718 		 * isn't perfect answer in such situation, so we select first
719 		 * come first served policy.
720 		 */
721 		nid = dev_to_node(info->dev);
722 		if (nid != NUMA_NO_NODE)
723 			break;
724 	}
725 
726 	return nid;
727 }
728 
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
731 {
732 	domain_update_iommu_coherency(domain);
733 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
735 
736 	/*
737 	 * If RHSA is missing, we should default to the device numa domain
738 	 * as fall back.
739 	 */
740 	if (domain->nid == NUMA_NO_NODE)
741 		domain->nid = domain_update_device_node(domain);
742 }
743 
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
745 					 u8 devfn, int alloc)
746 {
747 	struct root_entry *root = &iommu->root_entry[bus];
748 	struct context_entry *context;
749 	u64 *entry;
750 
751 	entry = &root->lo;
752 	if (sm_supported(iommu)) {
753 		if (devfn >= 0x80) {
754 			devfn -= 0x80;
755 			entry = &root->hi;
756 		}
757 		devfn *= 2;
758 	}
759 	if (*entry & 1)
760 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
761 	else {
762 		unsigned long phy_addr;
763 		if (!alloc)
764 			return NULL;
765 
766 		context = alloc_pgtable_page(iommu->node);
767 		if (!context)
768 			return NULL;
769 
770 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771 		phy_addr = virt_to_phys((void *)context);
772 		*entry = phy_addr | 1;
773 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
774 	}
775 	return &context[devfn];
776 }
777 
778 static bool attach_deferred(struct device *dev)
779 {
780 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
781 }
782 
783 /**
784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785  *				 sub-hierarchy of a candidate PCI-PCI bridge
786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787  * @bridge: the candidate PCI-PCI bridge
788  *
789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
790  */
791 static bool
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
793 {
794 	struct pci_dev *pdev, *pbridge;
795 
796 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
797 		return false;
798 
799 	pdev = to_pci_dev(dev);
800 	pbridge = to_pci_dev(bridge);
801 
802 	if (pbridge->subordinate &&
803 	    pbridge->subordinate->number <= pdev->bus->number &&
804 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
805 		return true;
806 
807 	return false;
808 }
809 
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
811 {
812 	struct dmar_drhd_unit *drhd;
813 	u32 vtbar;
814 	int rc;
815 
816 	/* We know that this device on this chipset has its own IOMMU.
817 	 * If we find it under a different IOMMU, then the BIOS is lying
818 	 * to us. Hope that the IOMMU for this device is actually
819 	 * disabled, and it needs no translation...
820 	 */
821 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
822 	if (rc) {
823 		/* "can't" happen */
824 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
825 		return false;
826 	}
827 	vtbar &= 0xffff0000;
828 
829 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
830 	drhd = dmar_find_matched_drhd_unit(pdev);
831 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
834 		return true;
835 	}
836 
837 	return false;
838 }
839 
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
841 {
842 	if (!iommu || iommu->drhd->ignored)
843 		return true;
844 
845 	if (dev_is_pci(dev)) {
846 		struct pci_dev *pdev = to_pci_dev(dev);
847 
848 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850 		    quirk_ioat_snb_local_iommu(pdev))
851 			return true;
852 	}
853 
854 	return false;
855 }
856 
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
858 {
859 	struct dmar_drhd_unit *drhd = NULL;
860 	struct pci_dev *pdev = NULL;
861 	struct intel_iommu *iommu;
862 	struct device *tmp;
863 	u16 segment = 0;
864 	int i;
865 
866 	if (!dev)
867 		return NULL;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pf_pdev;
871 
872 		pdev = pci_real_dma_dev(to_pci_dev(dev));
873 
874 		/* VFs aren't listed in scope tables; we need to look up
875 		 * the PF instead to find the IOMMU. */
876 		pf_pdev = pci_physfn(pdev);
877 		dev = &pf_pdev->dev;
878 		segment = pci_domain_nr(pdev->bus);
879 	} else if (has_acpi_companion(dev))
880 		dev = &ACPI_COMPANION(dev)->dev;
881 
882 	rcu_read_lock();
883 	for_each_iommu(iommu, drhd) {
884 		if (pdev && segment != drhd->segment)
885 			continue;
886 
887 		for_each_active_dev_scope(drhd->devices,
888 					  drhd->devices_cnt, i, tmp) {
889 			if (tmp == dev) {
890 				/* For a VF use its original BDF# not that of the PF
891 				 * which we used for the IOMMU lookup. Strictly speaking
892 				 * we could do this for all PCI devices; we only need to
893 				 * get the BDF# from the scope table for ACPI matches. */
894 				if (pdev && pdev->is_virtfn)
895 					goto got_pdev;
896 
897 				if (bus && devfn) {
898 					*bus = drhd->devices[i].bus;
899 					*devfn = drhd->devices[i].devfn;
900 				}
901 				goto out;
902 			}
903 
904 			if (is_downstream_to_pci_bridge(dev, tmp))
905 				goto got_pdev;
906 		}
907 
908 		if (pdev && drhd->include_all) {
909 		got_pdev:
910 			if (bus && devfn) {
911 				*bus = pdev->bus->number;
912 				*devfn = pdev->devfn;
913 			}
914 			goto out;
915 		}
916 	}
917 	iommu = NULL;
918  out:
919 	if (iommu_is_dummy(iommu, dev))
920 		iommu = NULL;
921 
922 	rcu_read_unlock();
923 
924 	return iommu;
925 }
926 
927 static void domain_flush_cache(struct dmar_domain *domain,
928 			       void *addr, int size)
929 {
930 	if (!domain->iommu_coherency)
931 		clflush_cache_range(addr, size);
932 }
933 
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
935 {
936 	struct context_entry *context;
937 	int ret = 0;
938 	unsigned long flags;
939 
940 	spin_lock_irqsave(&iommu->lock, flags);
941 	context = iommu_context_addr(iommu, bus, devfn, 0);
942 	if (context)
943 		ret = context_present(context);
944 	spin_unlock_irqrestore(&iommu->lock, flags);
945 	return ret;
946 }
947 
948 static void free_context_table(struct intel_iommu *iommu)
949 {
950 	int i;
951 	unsigned long flags;
952 	struct context_entry *context;
953 
954 	spin_lock_irqsave(&iommu->lock, flags);
955 	if (!iommu->root_entry) {
956 		goto out;
957 	}
958 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
959 		context = iommu_context_addr(iommu, i, 0, 0);
960 		if (context)
961 			free_pgtable_page(context);
962 
963 		if (!sm_supported(iommu))
964 			continue;
965 
966 		context = iommu_context_addr(iommu, i, 0x80, 0);
967 		if (context)
968 			free_pgtable_page(context);
969 
970 	}
971 	free_pgtable_page(iommu->root_entry);
972 	iommu->root_entry = NULL;
973 out:
974 	spin_unlock_irqrestore(&iommu->lock, flags);
975 }
976 
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978 				      unsigned long pfn, int *target_level)
979 {
980 	struct dma_pte *parent, *pte;
981 	int level = agaw_to_level(domain->agaw);
982 	int offset;
983 
984 	BUG_ON(!domain->pgd);
985 
986 	if (!domain_pfn_supported(domain, pfn))
987 		/* Address beyond IOMMU's addressing capabilities. */
988 		return NULL;
989 
990 	parent = domain->pgd;
991 
992 	while (1) {
993 		void *tmp_page;
994 
995 		offset = pfn_level_offset(pfn, level);
996 		pte = &parent[offset];
997 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
998 			break;
999 		if (level == *target_level)
1000 			break;
1001 
1002 		if (!dma_pte_present(pte)) {
1003 			uint64_t pteval;
1004 
1005 			tmp_page = alloc_pgtable_page(domain->nid);
1006 
1007 			if (!tmp_page)
1008 				return NULL;
1009 
1010 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012 			if (domain_use_first_level(domain))
1013 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1015 				/* Someone else set it while we were thinking; use theirs. */
1016 				free_pgtable_page(tmp_page);
1017 			else
1018 				domain_flush_cache(domain, pte, sizeof(*pte));
1019 		}
1020 		if (level == 1)
1021 			break;
1022 
1023 		parent = phys_to_virt(dma_pte_addr(pte));
1024 		level--;
1025 	}
1026 
1027 	if (!*target_level)
1028 		*target_level = level;
1029 
1030 	return pte;
1031 }
1032 
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035 					 unsigned long pfn,
1036 					 int level, int *large_page)
1037 {
1038 	struct dma_pte *parent, *pte;
1039 	int total = agaw_to_level(domain->agaw);
1040 	int offset;
1041 
1042 	parent = domain->pgd;
1043 	while (level <= total) {
1044 		offset = pfn_level_offset(pfn, total);
1045 		pte = &parent[offset];
1046 		if (level == total)
1047 			return pte;
1048 
1049 		if (!dma_pte_present(pte)) {
1050 			*large_page = total;
1051 			break;
1052 		}
1053 
1054 		if (dma_pte_superpage(pte)) {
1055 			*large_page = total;
1056 			return pte;
1057 		}
1058 
1059 		parent = phys_to_virt(dma_pte_addr(pte));
1060 		total--;
1061 	}
1062 	return NULL;
1063 }
1064 
1065 /* clear last level pte, a tlb flush should be followed */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067 				unsigned long start_pfn,
1068 				unsigned long last_pfn)
1069 {
1070 	unsigned int large_page;
1071 	struct dma_pte *first_pte, *pte;
1072 
1073 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075 	BUG_ON(start_pfn > last_pfn);
1076 
1077 	/* we don't need lock here; nobody else touches the iova range */
1078 	do {
1079 		large_page = 1;
1080 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081 		if (!pte) {
1082 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083 			continue;
1084 		}
1085 		do {
1086 			dma_clear_pte(pte);
1087 			start_pfn += lvl_to_nr_pages(large_page);
1088 			pte++;
1089 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090 
1091 		domain_flush_cache(domain, first_pte,
1092 				   (void *)pte - (void *)first_pte);
1093 
1094 	} while (start_pfn && start_pfn <= last_pfn);
1095 }
1096 
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098 			       int retain_level, struct dma_pte *pte,
1099 			       unsigned long pfn, unsigned long start_pfn,
1100 			       unsigned long last_pfn)
1101 {
1102 	pfn = max(start_pfn, pfn);
1103 	pte = &pte[pfn_level_offset(pfn, level)];
1104 
1105 	do {
1106 		unsigned long level_pfn;
1107 		struct dma_pte *level_pte;
1108 
1109 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110 			goto next;
1111 
1112 		level_pfn = pfn & level_mask(level);
1113 		level_pte = phys_to_virt(dma_pte_addr(pte));
1114 
1115 		if (level > 2) {
1116 			dma_pte_free_level(domain, level - 1, retain_level,
1117 					   level_pte, level_pfn, start_pfn,
1118 					   last_pfn);
1119 		}
1120 
1121 		/*
1122 		 * Free the page table if we're below the level we want to
1123 		 * retain and the range covers the entire table.
1124 		 */
1125 		if (level < retain_level && !(start_pfn > level_pfn ||
1126 		      last_pfn < level_pfn + level_size(level) - 1)) {
1127 			dma_clear_pte(pte);
1128 			domain_flush_cache(domain, pte, sizeof(*pte));
1129 			free_pgtable_page(level_pte);
1130 		}
1131 next:
1132 		pfn += level_size(level);
1133 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135 
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141 				   unsigned long start_pfn,
1142 				   unsigned long last_pfn,
1143 				   int retain_level)
1144 {
1145 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147 	BUG_ON(start_pfn > last_pfn);
1148 
1149 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1150 
1151 	/* We don't need lock here; nobody else touches the iova range */
1152 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153 			   domain->pgd, 0, start_pfn, last_pfn);
1154 
1155 	/* free pgd */
1156 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157 		free_pgtable_page(domain->pgd);
1158 		domain->pgd = NULL;
1159 	}
1160 }
1161 
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169 					    int level, struct dma_pte *pte,
1170 					    struct page *freelist)
1171 {
1172 	struct page *pg;
1173 
1174 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175 	pg->freelist = freelist;
1176 	freelist = pg;
1177 
1178 	if (level == 1)
1179 		return freelist;
1180 
1181 	pte = page_address(pg);
1182 	do {
1183 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184 			freelist = dma_pte_list_pagetables(domain, level - 1,
1185 							   pte, freelist);
1186 		pte++;
1187 	} while (!first_pte_in_page(pte));
1188 
1189 	return freelist;
1190 }
1191 
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193 					struct dma_pte *pte, unsigned long pfn,
1194 					unsigned long start_pfn,
1195 					unsigned long last_pfn,
1196 					struct page *freelist)
1197 {
1198 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199 
1200 	pfn = max(start_pfn, pfn);
1201 	pte = &pte[pfn_level_offset(pfn, level)];
1202 
1203 	do {
1204 		unsigned long level_pfn;
1205 
1206 		if (!dma_pte_present(pte))
1207 			goto next;
1208 
1209 		level_pfn = pfn & level_mask(level);
1210 
1211 		/* If range covers entire pagetable, free it */
1212 		if (start_pfn <= level_pfn &&
1213 		    last_pfn >= level_pfn + level_size(level) - 1) {
1214 			/* These suborbinate page tables are going away entirely. Don't
1215 			   bother to clear them; we're just going to *free* them. */
1216 			if (level > 1 && !dma_pte_superpage(pte))
1217 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218 
1219 			dma_clear_pte(pte);
1220 			if (!first_pte)
1221 				first_pte = pte;
1222 			last_pte = pte;
1223 		} else if (level > 1) {
1224 			/* Recurse down into a level that isn't *entirely* obsolete */
1225 			freelist = dma_pte_clear_level(domain, level - 1,
1226 						       phys_to_virt(dma_pte_addr(pte)),
1227 						       level_pfn, start_pfn, last_pfn,
1228 						       freelist);
1229 		}
1230 next:
1231 		pfn += level_size(level);
1232 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233 
1234 	if (first_pte)
1235 		domain_flush_cache(domain, first_pte,
1236 				   (void *)++last_pte - (void *)first_pte);
1237 
1238 	return freelist;
1239 }
1240 
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245 				 unsigned long start_pfn,
1246 				 unsigned long last_pfn)
1247 {
1248 	struct page *freelist;
1249 
1250 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1251 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1252 	BUG_ON(start_pfn > last_pfn);
1253 
1254 	/* we don't need lock here; nobody else touches the iova range */
1255 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1256 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1257 
1258 	/* free pgd */
1259 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260 		struct page *pgd_page = virt_to_page(domain->pgd);
1261 		pgd_page->freelist = freelist;
1262 		freelist = pgd_page;
1263 
1264 		domain->pgd = NULL;
1265 	}
1266 
1267 	return freelist;
1268 }
1269 
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272 	struct page *pg;
1273 
1274 	while ((pg = freelist)) {
1275 		freelist = pg->freelist;
1276 		free_pgtable_page(page_address(pg));
1277 	}
1278 }
1279 
1280 static void iova_entry_free(unsigned long data)
1281 {
1282 	struct page *freelist = (struct page *)data;
1283 
1284 	dma_free_pagelist(freelist);
1285 }
1286 
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290 	struct root_entry *root;
1291 	unsigned long flags;
1292 
1293 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294 	if (!root) {
1295 		pr_err("Allocating root entry for %s failed\n",
1296 			iommu->name);
1297 		return -ENOMEM;
1298 	}
1299 
1300 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1301 
1302 	spin_lock_irqsave(&iommu->lock, flags);
1303 	iommu->root_entry = root;
1304 	spin_unlock_irqrestore(&iommu->lock, flags);
1305 
1306 	return 0;
1307 }
1308 
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311 	u64 addr;
1312 	u32 sts;
1313 	unsigned long flag;
1314 
1315 	addr = virt_to_phys(iommu->root_entry);
1316 	if (sm_supported(iommu))
1317 		addr |= DMA_RTADDR_SMT;
1318 
1319 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321 
1322 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323 
1324 	/* Make sure hardware complete it */
1325 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326 		      readl, (sts & DMA_GSTS_RTPS), sts);
1327 
1328 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330 
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333 	u32 val;
1334 	unsigned long flag;
1335 
1336 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337 		return;
1338 
1339 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341 
1342 	/* Make sure hardware complete it */
1343 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1345 
1346 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348 
1349 /* return value determine if we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351 				  u16 did, u16 source_id, u8 function_mask,
1352 				  u64 type)
1353 {
1354 	u64 val = 0;
1355 	unsigned long flag;
1356 
1357 	switch (type) {
1358 	case DMA_CCMD_GLOBAL_INVL:
1359 		val = DMA_CCMD_GLOBAL_INVL;
1360 		break;
1361 	case DMA_CCMD_DOMAIN_INVL:
1362 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363 		break;
1364 	case DMA_CCMD_DEVICE_INVL:
1365 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367 		break;
1368 	default:
1369 		BUG();
1370 	}
1371 	val |= DMA_CCMD_ICC;
1372 
1373 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375 
1376 	/* Make sure hardware complete it */
1377 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379 
1380 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382 
1383 /* return value determine if we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385 				u64 addr, unsigned int size_order, u64 type)
1386 {
1387 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388 	u64 val = 0, val_iva = 0;
1389 	unsigned long flag;
1390 
1391 	switch (type) {
1392 	case DMA_TLB_GLOBAL_FLUSH:
1393 		/* global flush doesn't need set IVA_REG */
1394 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395 		break;
1396 	case DMA_TLB_DSI_FLUSH:
1397 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398 		break;
1399 	case DMA_TLB_PSI_FLUSH:
1400 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401 		/* IH bit is passed in as part of address */
1402 		val_iva = size_order | addr;
1403 		break;
1404 	default:
1405 		BUG();
1406 	}
1407 	/* Note: set drain read/write */
1408 #if 0
1409 	/*
1410 	 * This is probably to be super secure.. Looks like we can
1411 	 * ignore it without any impact.
1412 	 */
1413 	if (cap_read_drain(iommu->cap))
1414 		val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416 	if (cap_write_drain(iommu->cap))
1417 		val |= DMA_TLB_WRITE_DRAIN;
1418 
1419 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420 	/* Note: Only uses first TLB reg currently */
1421 	if (val_iva)
1422 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424 
1425 	/* Make sure hardware complete it */
1426 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428 
1429 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430 
1431 	/* check IOTLB invalidation granularity */
1432 	if (DMA_TLB_IAIG(val) == 0)
1433 		pr_err("Flush IOTLB failed\n");
1434 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1436 			(unsigned long long)DMA_TLB_IIRG(type),
1437 			(unsigned long long)DMA_TLB_IAIG(val));
1438 }
1439 
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1442 			 u8 bus, u8 devfn)
1443 {
1444 	struct device_domain_info *info;
1445 
1446 	assert_spin_locked(&device_domain_lock);
1447 
1448 	if (!iommu->qi)
1449 		return NULL;
1450 
1451 	list_for_each_entry(info, &domain->devices, link)
1452 		if (info->iommu == iommu && info->bus == bus &&
1453 		    info->devfn == devfn) {
1454 			if (info->ats_supported && info->dev)
1455 				return info;
1456 			break;
1457 		}
1458 
1459 	return NULL;
1460 }
1461 
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464 	struct device_domain_info *info;
1465 	bool has_iotlb_device = false;
1466 
1467 	assert_spin_locked(&device_domain_lock);
1468 
1469 	list_for_each_entry(info, &domain->devices, link) {
1470 		struct pci_dev *pdev;
1471 
1472 		if (!info->dev || !dev_is_pci(info->dev))
1473 			continue;
1474 
1475 		pdev = to_pci_dev(info->dev);
1476 		if (pdev->ats_enabled) {
1477 			has_iotlb_device = true;
1478 			break;
1479 		}
1480 	}
1481 
1482 	domain->has_iotlb_device = has_iotlb_device;
1483 }
1484 
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487 	struct pci_dev *pdev;
1488 
1489 	assert_spin_locked(&device_domain_lock);
1490 
1491 	if (!info || !dev_is_pci(info->dev))
1492 		return;
1493 
1494 	pdev = to_pci_dev(info->dev);
1495 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1496 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1497 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1498 	 * reserved, which should be set to 0.
1499 	 */
1500 	if (!ecap_dit(info->iommu->ecap))
1501 		info->pfsid = 0;
1502 	else {
1503 		struct pci_dev *pf_pdev;
1504 
1505 		/* pdev will be returned if device is not a vf */
1506 		pf_pdev = pci_physfn(pdev);
1507 		info->pfsid = pci_dev_id(pf_pdev);
1508 	}
1509 
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1512 	   the device if you enable PASID support after ATS support is
1513 	   undefined. So always enable PASID support on devices which
1514 	   have it, even if we can't yet know if we're ever going to
1515 	   use it. */
1516 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517 		info->pasid_enabled = 1;
1518 
1519 	if (info->pri_supported &&
1520 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522 		info->pri_enabled = 1;
1523 #endif
1524 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526 		info->ats_enabled = 1;
1527 		domain_update_iotlb(info->domain);
1528 		info->ats_qdep = pci_ats_queue_depth(pdev);
1529 	}
1530 }
1531 
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534 	struct pci_dev *pdev;
1535 
1536 	assert_spin_locked(&device_domain_lock);
1537 
1538 	if (!dev_is_pci(info->dev))
1539 		return;
1540 
1541 	pdev = to_pci_dev(info->dev);
1542 
1543 	if (info->ats_enabled) {
1544 		pci_disable_ats(pdev);
1545 		info->ats_enabled = 0;
1546 		domain_update_iotlb(info->domain);
1547 	}
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549 	if (info->pri_enabled) {
1550 		pci_disable_pri(pdev);
1551 		info->pri_enabled = 0;
1552 	}
1553 	if (info->pasid_enabled) {
1554 		pci_disable_pasid(pdev);
1555 		info->pasid_enabled = 0;
1556 	}
1557 #endif
1558 }
1559 
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561 				  u64 addr, unsigned mask)
1562 {
1563 	u16 sid, qdep;
1564 	unsigned long flags;
1565 	struct device_domain_info *info;
1566 
1567 	if (!domain->has_iotlb_device)
1568 		return;
1569 
1570 	spin_lock_irqsave(&device_domain_lock, flags);
1571 	list_for_each_entry(info, &domain->devices, link) {
1572 		if (!info->ats_enabled)
1573 			continue;
1574 
1575 		sid = info->bus << 8 | info->devfn;
1576 		qdep = info->ats_qdep;
1577 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578 				qdep, addr, mask);
1579 	}
1580 	spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
1582 
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584 				struct dmar_domain *domain,
1585 				u64 addr, unsigned long npages, bool ih)
1586 {
1587 	u16 did = domain->iommu_did[iommu->seq_id];
1588 
1589 	if (domain->default_pasid)
1590 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1591 				addr, npages, ih);
1592 
1593 	if (!list_empty(&domain->devices))
1594 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
1596 
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598 				  struct dmar_domain *domain,
1599 				  unsigned long pfn, unsigned int pages,
1600 				  int ih, int map)
1601 {
1602 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604 	u16 did = domain->iommu_did[iommu->seq_id];
1605 
1606 	BUG_ON(pages == 0);
1607 
1608 	if (ih)
1609 		ih = 1 << 6;
1610 
1611 	if (domain_use_first_level(domain)) {
1612 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613 	} else {
1614 		/*
1615 		 * Fallback to domain selective flush if no PSI support or
1616 		 * the size is too big. PSI requires page size to be 2 ^ x,
1617 		 * and the base address is naturally aligned to the size.
1618 		 */
1619 		if (!cap_pgsel_inv(iommu->cap) ||
1620 		    mask > cap_max_amask_val(iommu->cap))
1621 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622 							DMA_TLB_DSI_FLUSH);
1623 		else
1624 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625 							DMA_TLB_PSI_FLUSH);
1626 	}
1627 
1628 	/*
1629 	 * In caching mode, changes of pages from non-present to present require
1630 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1631 	 */
1632 	if (!cap_caching_mode(iommu->cap) || !map)
1633 		iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
1635 
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638 					struct dmar_domain *domain,
1639 					unsigned long pfn, unsigned int pages)
1640 {
1641 	/*
1642 	 * It's a non-present to present mapping. Only flush if caching mode
1643 	 * and second level.
1644 	 */
1645 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647 	else
1648 		iommu_flush_write_buffer(iommu);
1649 }
1650 
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653 	struct dmar_domain *domain;
1654 	int idx;
1655 
1656 	domain = container_of(iovad, struct dmar_domain, iovad);
1657 
1658 	for_each_domain_iommu(idx, domain) {
1659 		struct intel_iommu *iommu = g_iommus[idx];
1660 		u16 did = domain->iommu_did[iommu->seq_id];
1661 
1662 		if (domain_use_first_level(domain))
1663 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664 		else
1665 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666 						 DMA_TLB_DSI_FLUSH);
1667 
1668 		if (!cap_caching_mode(iommu->cap))
1669 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670 					      0, MAX_AGAW_PFN_WIDTH);
1671 	}
1672 }
1673 
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676 	u32 pmen;
1677 	unsigned long flags;
1678 
1679 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680 		return;
1681 
1682 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684 	pmen &= ~DMA_PMEN_EPM;
1685 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686 
1687 	/* wait for the protected region status bit to clear */
1688 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1690 
1691 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693 
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696 	u32 sts;
1697 	unsigned long flags;
1698 
1699 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700 	iommu->gcmd |= DMA_GCMD_TE;
1701 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702 
1703 	/* Make sure hardware complete it */
1704 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705 		      readl, (sts & DMA_GSTS_TES), sts);
1706 
1707 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709 
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712 	u32 sts;
1713 	unsigned long flag;
1714 
1715 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717 		return;
1718 
1719 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720 	iommu->gcmd &= ~DMA_GCMD_TE;
1721 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722 
1723 	/* Make sure hardware complete it */
1724 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1726 
1727 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729 
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732 	u32 ndomains, nlongs;
1733 	size_t size;
1734 
1735 	ndomains = cap_ndoms(iommu->cap);
1736 	pr_debug("%s: Number of Domains supported <%d>\n",
1737 		 iommu->name, ndomains);
1738 	nlongs = BITS_TO_LONGS(ndomains);
1739 
1740 	spin_lock_init(&iommu->lock);
1741 
1742 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743 	if (!iommu->domain_ids) {
1744 		pr_err("%s: Allocating domain id array failed\n",
1745 		       iommu->name);
1746 		return -ENOMEM;
1747 	}
1748 
1749 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750 	iommu->domains = kzalloc(size, GFP_KERNEL);
1751 
1752 	if (iommu->domains) {
1753 		size = 256 * sizeof(struct dmar_domain *);
1754 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755 	}
1756 
1757 	if (!iommu->domains || !iommu->domains[0]) {
1758 		pr_err("%s: Allocating domain array failed\n",
1759 		       iommu->name);
1760 		kfree(iommu->domain_ids);
1761 		kfree(iommu->domains);
1762 		iommu->domain_ids = NULL;
1763 		iommu->domains    = NULL;
1764 		return -ENOMEM;
1765 	}
1766 
1767 	/*
1768 	 * If Caching mode is set, then invalid translations are tagged
1769 	 * with domain-id 0, hence we need to pre-allocate it. We also
1770 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1771 	 * make sure it is not used for a real domain.
1772 	 */
1773 	set_bit(0, iommu->domain_ids);
1774 
1775 	/*
1776 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1777 	 * entry for first-level or pass-through translation modes should
1778 	 * be programmed with a domain id different from those used for
1779 	 * second-level or nested translation. We reserve a domain id for
1780 	 * this purpose.
1781 	 */
1782 	if (sm_supported(iommu))
1783 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784 
1785 	return 0;
1786 }
1787 
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790 	struct device_domain_info *info, *tmp;
1791 	unsigned long flags;
1792 
1793 	if (!iommu->domains || !iommu->domain_ids)
1794 		return;
1795 
1796 	spin_lock_irqsave(&device_domain_lock, flags);
1797 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798 		if (info->iommu != iommu)
1799 			continue;
1800 
1801 		if (!info->dev || !info->domain)
1802 			continue;
1803 
1804 		__dmar_remove_one_dev_info(info);
1805 	}
1806 	spin_unlock_irqrestore(&device_domain_lock, flags);
1807 
1808 	if (iommu->gcmd & DMA_GCMD_TE)
1809 		iommu_disable_translation(iommu);
1810 }
1811 
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814 	if ((iommu->domains) && (iommu->domain_ids)) {
1815 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816 		int i;
1817 
1818 		for (i = 0; i < elems; i++)
1819 			kfree(iommu->domains[i]);
1820 		kfree(iommu->domains);
1821 		kfree(iommu->domain_ids);
1822 		iommu->domains = NULL;
1823 		iommu->domain_ids = NULL;
1824 	}
1825 
1826 	g_iommus[iommu->seq_id] = NULL;
1827 
1828 	/* free context mapping */
1829 	free_context_table(iommu);
1830 
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832 	if (pasid_supported(iommu)) {
1833 		if (ecap_prs(iommu->ecap))
1834 			intel_svm_finish_prq(iommu);
1835 	}
1836 	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1837 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1838 
1839 #endif
1840 }
1841 
1842 /*
1843  * Check and return whether first level is used by default for
1844  * DMA translation.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848 	struct dmar_drhd_unit *drhd;
1849 	struct intel_iommu *iommu;
1850 	static int first_level_support = -1;
1851 
1852 	if (likely(first_level_support != -1))
1853 		return first_level_support;
1854 
1855 	first_level_support = 1;
1856 
1857 	rcu_read_lock();
1858 	for_each_active_iommu(iommu, drhd) {
1859 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860 			first_level_support = 0;
1861 			break;
1862 		}
1863 	}
1864 	rcu_read_unlock();
1865 
1866 	return first_level_support;
1867 }
1868 
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871 	struct dmar_domain *domain;
1872 
1873 	domain = alloc_domain_mem();
1874 	if (!domain)
1875 		return NULL;
1876 
1877 	memset(domain, 0, sizeof(*domain));
1878 	domain->nid = NUMA_NO_NODE;
1879 	domain->flags = flags;
1880 	if (first_level_by_default())
1881 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882 	domain->has_iotlb_device = false;
1883 	INIT_LIST_HEAD(&domain->devices);
1884 
1885 	return domain;
1886 }
1887 
1888 /* Must be called with iommu->lock */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890 			       struct intel_iommu *iommu)
1891 {
1892 	unsigned long ndomains;
1893 	int num;
1894 
1895 	assert_spin_locked(&device_domain_lock);
1896 	assert_spin_locked(&iommu->lock);
1897 
1898 	domain->iommu_refcnt[iommu->seq_id] += 1;
1899 	domain->iommu_count += 1;
1900 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901 		ndomains = cap_ndoms(iommu->cap);
1902 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903 
1904 		if (num >= ndomains) {
1905 			pr_err("%s: No free domain ids\n", iommu->name);
1906 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1907 			domain->iommu_count -= 1;
1908 			return -ENOSPC;
1909 		}
1910 
1911 		set_bit(num, iommu->domain_ids);
1912 		set_iommu_domain(iommu, num, domain);
1913 
1914 		domain->iommu_did[iommu->seq_id] = num;
1915 		domain->nid			 = iommu->node;
1916 
1917 		domain_update_iommu_cap(domain);
1918 	}
1919 
1920 	return 0;
1921 }
1922 
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924 			       struct intel_iommu *iommu)
1925 {
1926 	int num, count;
1927 
1928 	assert_spin_locked(&device_domain_lock);
1929 	assert_spin_locked(&iommu->lock);
1930 
1931 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1932 	count = --domain->iommu_count;
1933 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934 		num = domain->iommu_did[iommu->seq_id];
1935 		clear_bit(num, iommu->domain_ids);
1936 		set_iommu_domain(iommu, num, NULL);
1937 
1938 		domain_update_iommu_cap(domain);
1939 		domain->iommu_did[iommu->seq_id] = 0;
1940 	}
1941 
1942 	return count;
1943 }
1944 
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947 
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950 	struct pci_dev *pdev = NULL;
1951 	struct iova *iova;
1952 	int i;
1953 
1954 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955 
1956 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957 		&reserved_rbtree_key);
1958 
1959 	/* IOAPIC ranges shouldn't be accessed by DMA */
1960 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961 		IOVA_PFN(IOAPIC_RANGE_END));
1962 	if (!iova) {
1963 		pr_err("Reserve IOAPIC range failed\n");
1964 		return -ENODEV;
1965 	}
1966 
1967 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1968 	for_each_pci_dev(pdev) {
1969 		struct resource *r;
1970 
1971 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972 			r = &pdev->resource[i];
1973 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974 				continue;
1975 			iova = reserve_iova(&reserved_iova_list,
1976 					    IOVA_PFN(r->start),
1977 					    IOVA_PFN(r->end));
1978 			if (!iova) {
1979 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980 				return -ENODEV;
1981 			}
1982 		}
1983 	}
1984 	return 0;
1985 }
1986 
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989 	int agaw;
1990 	int r = (gaw - 12) % 9;
1991 
1992 	if (r == 0)
1993 		agaw = gaw;
1994 	else
1995 		agaw = gaw + 9 - r;
1996 	if (agaw > 64)
1997 		agaw = 64;
1998 	return agaw;
1999 }
2000 
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003 
2004 	/* Remove associated devices and clear attached or cached domains */
2005 	domain_remove_dev_info(domain);
2006 
2007 	/* destroy iovas */
2008 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009 		put_iova_domain(&domain->iovad);
2010 
2011 	if (domain->pgd) {
2012 		struct page *freelist;
2013 
2014 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015 		dma_free_pagelist(freelist);
2016 	}
2017 
2018 	free_domain_mem(domain);
2019 }
2020 
2021 /*
2022  * Get the PASID directory size for scalable mode context entry.
2023  * Value of X in the PDTS field of a scalable mode context entry
2024  * indicates PASID directory with 2^(X + 7) entries.
2025  */
2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2027 {
2028 	int pds, max_pde;
2029 
2030 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2031 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2032 	if (pds < 7)
2033 		return 0;
2034 
2035 	return pds - 7;
2036 }
2037 
2038 /*
2039  * Set the RID_PASID field of a scalable mode context entry. The
2040  * IOMMU hardware will use the PASID value set in this field for
2041  * DMA translations of DMA requests without PASID.
2042  */
2043 static inline void
2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2045 {
2046 	context->hi |= pasid & ((1 << 20) - 1);
2047 }
2048 
2049 /*
2050  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_dte(struct context_entry *context)
2054 {
2055 	context->lo |= (1 << 2);
2056 }
2057 
2058 /*
2059  * Set the PRE(Page Request Enable) field of a scalable mode context
2060  * entry.
2061  */
2062 static inline void context_set_sm_pre(struct context_entry *context)
2063 {
2064 	context->lo |= (1 << 4);
2065 }
2066 
2067 /* Convert value to context PASID directory size field coding. */
2068 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2069 
2070 static int domain_context_mapping_one(struct dmar_domain *domain,
2071 				      struct intel_iommu *iommu,
2072 				      struct pasid_table *table,
2073 				      u8 bus, u8 devfn)
2074 {
2075 	u16 did = domain->iommu_did[iommu->seq_id];
2076 	int translation = CONTEXT_TT_MULTI_LEVEL;
2077 	struct device_domain_info *info = NULL;
2078 	struct context_entry *context;
2079 	unsigned long flags;
2080 	int ret;
2081 
2082 	WARN_ON(did == 0);
2083 
2084 	if (hw_pass_through && domain_type_is_si(domain))
2085 		translation = CONTEXT_TT_PASS_THROUGH;
2086 
2087 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2088 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2089 
2090 	BUG_ON(!domain->pgd);
2091 
2092 	spin_lock_irqsave(&device_domain_lock, flags);
2093 	spin_lock(&iommu->lock);
2094 
2095 	ret = -ENOMEM;
2096 	context = iommu_context_addr(iommu, bus, devfn, 1);
2097 	if (!context)
2098 		goto out_unlock;
2099 
2100 	ret = 0;
2101 	if (context_present(context))
2102 		goto out_unlock;
2103 
2104 	/*
2105 	 * For kdump cases, old valid entries may be cached due to the
2106 	 * in-flight DMA and copied pgtable, but there is no unmapping
2107 	 * behaviour for them, thus we need an explicit cache flush for
2108 	 * the newly-mapped device. For kdump, at this point, the device
2109 	 * is supposed to finish reset at its driver probe stage, so no
2110 	 * in-flight DMA will exist, and we don't need to worry anymore
2111 	 * hereafter.
2112 	 */
2113 	if (context_copied(context)) {
2114 		u16 did_old = context_domain_id(context);
2115 
2116 		if (did_old < cap_ndoms(iommu->cap)) {
2117 			iommu->flush.flush_context(iommu, did_old,
2118 						   (((u16)bus) << 8) | devfn,
2119 						   DMA_CCMD_MASK_NOBIT,
2120 						   DMA_CCMD_DEVICE_INVL);
2121 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2122 						 DMA_TLB_DSI_FLUSH);
2123 		}
2124 	}
2125 
2126 	context_clear_entry(context);
2127 
2128 	if (sm_supported(iommu)) {
2129 		unsigned long pds;
2130 
2131 		WARN_ON(!table);
2132 
2133 		/* Setup the PASID DIR pointer: */
2134 		pds = context_get_sm_pds(table);
2135 		context->lo = (u64)virt_to_phys(table->table) |
2136 				context_pdts(pds);
2137 
2138 		/* Setup the RID_PASID field: */
2139 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2140 
2141 		/*
2142 		 * Setup the Device-TLB enable bit and Page request
2143 		 * Enable bit:
2144 		 */
2145 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 		if (info && info->ats_supported)
2147 			context_set_sm_dte(context);
2148 		if (info && info->pri_supported)
2149 			context_set_sm_pre(context);
2150 	} else {
2151 		struct dma_pte *pgd = domain->pgd;
2152 		int agaw;
2153 
2154 		context_set_domain_id(context, did);
2155 
2156 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2157 			/*
2158 			 * Skip top levels of page tables for iommu which has
2159 			 * less agaw than default. Unnecessary for PT mode.
2160 			 */
2161 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2162 				ret = -ENOMEM;
2163 				pgd = phys_to_virt(dma_pte_addr(pgd));
2164 				if (!dma_pte_present(pgd))
2165 					goto out_unlock;
2166 			}
2167 
2168 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2169 			if (info && info->ats_supported)
2170 				translation = CONTEXT_TT_DEV_IOTLB;
2171 			else
2172 				translation = CONTEXT_TT_MULTI_LEVEL;
2173 
2174 			context_set_address_root(context, virt_to_phys(pgd));
2175 			context_set_address_width(context, agaw);
2176 		} else {
2177 			/*
2178 			 * In pass through mode, AW must be programmed to
2179 			 * indicate the largest AGAW value supported by
2180 			 * hardware. And ASR is ignored by hardware.
2181 			 */
2182 			context_set_address_width(context, iommu->msagaw);
2183 		}
2184 
2185 		context_set_translation_type(context, translation);
2186 	}
2187 
2188 	context_set_fault_enable(context);
2189 	context_set_present(context);
2190 	if (!ecap_coherent(iommu->ecap))
2191 		clflush_cache_range(context, sizeof(*context));
2192 
2193 	/*
2194 	 * It's a non-present to present mapping. If hardware doesn't cache
2195 	 * non-present entry we only need to flush the write-buffer. If the
2196 	 * _does_ cache non-present entries, then it does so in the special
2197 	 * domain #0, which we have to flush:
2198 	 */
2199 	if (cap_caching_mode(iommu->cap)) {
2200 		iommu->flush.flush_context(iommu, 0,
2201 					   (((u16)bus) << 8) | devfn,
2202 					   DMA_CCMD_MASK_NOBIT,
2203 					   DMA_CCMD_DEVICE_INVL);
2204 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2205 	} else {
2206 		iommu_flush_write_buffer(iommu);
2207 	}
2208 	iommu_enable_dev_iotlb(info);
2209 
2210 	ret = 0;
2211 
2212 out_unlock:
2213 	spin_unlock(&iommu->lock);
2214 	spin_unlock_irqrestore(&device_domain_lock, flags);
2215 
2216 	return ret;
2217 }
2218 
2219 struct domain_context_mapping_data {
2220 	struct dmar_domain *domain;
2221 	struct intel_iommu *iommu;
2222 	struct pasid_table *table;
2223 };
2224 
2225 static int domain_context_mapping_cb(struct pci_dev *pdev,
2226 				     u16 alias, void *opaque)
2227 {
2228 	struct domain_context_mapping_data *data = opaque;
2229 
2230 	return domain_context_mapping_one(data->domain, data->iommu,
2231 					  data->table, PCI_BUS_NUM(alias),
2232 					  alias & 0xff);
2233 }
2234 
2235 static int
2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2237 {
2238 	struct domain_context_mapping_data data;
2239 	struct pasid_table *table;
2240 	struct intel_iommu *iommu;
2241 	u8 bus, devfn;
2242 
2243 	iommu = device_to_iommu(dev, &bus, &devfn);
2244 	if (!iommu)
2245 		return -ENODEV;
2246 
2247 	table = intel_pasid_get_table(dev);
2248 
2249 	if (!dev_is_pci(dev))
2250 		return domain_context_mapping_one(domain, iommu, table,
2251 						  bus, devfn);
2252 
2253 	data.domain = domain;
2254 	data.iommu = iommu;
2255 	data.table = table;
2256 
2257 	return pci_for_each_dma_alias(to_pci_dev(dev),
2258 				      &domain_context_mapping_cb, &data);
2259 }
2260 
2261 static int domain_context_mapped_cb(struct pci_dev *pdev,
2262 				    u16 alias, void *opaque)
2263 {
2264 	struct intel_iommu *iommu = opaque;
2265 
2266 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2267 }
2268 
2269 static int domain_context_mapped(struct device *dev)
2270 {
2271 	struct intel_iommu *iommu;
2272 	u8 bus, devfn;
2273 
2274 	iommu = device_to_iommu(dev, &bus, &devfn);
2275 	if (!iommu)
2276 		return -ENODEV;
2277 
2278 	if (!dev_is_pci(dev))
2279 		return device_context_mapped(iommu, bus, devfn);
2280 
2281 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2282 				       domain_context_mapped_cb, iommu);
2283 }
2284 
2285 /* Returns a number of VTD pages, but aligned to MM page size */
2286 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2287 					    size_t size)
2288 {
2289 	host_addr &= ~PAGE_MASK;
2290 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2291 }
2292 
2293 /* Return largest possible superpage level for a given mapping */
2294 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2295 					  unsigned long iov_pfn,
2296 					  unsigned long phy_pfn,
2297 					  unsigned long pages)
2298 {
2299 	int support, level = 1;
2300 	unsigned long pfnmerge;
2301 
2302 	support = domain->iommu_superpage;
2303 
2304 	/* To use a large page, the virtual *and* physical addresses
2305 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2306 	   of them will mean we have to use smaller pages. So just
2307 	   merge them and check both at once. */
2308 	pfnmerge = iov_pfn | phy_pfn;
2309 
2310 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2311 		pages >>= VTD_STRIDE_SHIFT;
2312 		if (!pages)
2313 			break;
2314 		pfnmerge >>= VTD_STRIDE_SHIFT;
2315 		level++;
2316 		support--;
2317 	}
2318 	return level;
2319 }
2320 
2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322 			    struct scatterlist *sg, unsigned long phys_pfn,
2323 			    unsigned long nr_pages, int prot)
2324 {
2325 	struct dma_pte *first_pte = NULL, *pte = NULL;
2326 	phys_addr_t pteval;
2327 	unsigned long sg_res = 0;
2328 	unsigned int largepage_lvl = 0;
2329 	unsigned long lvl_pages = 0;
2330 	u64 attr;
2331 
2332 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2333 
2334 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2335 		return -EINVAL;
2336 
2337 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2338 	if (domain_use_first_level(domain))
2339 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2340 
2341 	if (!sg) {
2342 		sg_res = nr_pages;
2343 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344 	}
2345 
2346 	while (nr_pages > 0) {
2347 		uint64_t tmp;
2348 
2349 		if (!sg_res) {
2350 			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2351 
2352 			sg_res = aligned_nrpages(sg->offset, sg->length);
2353 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2354 			sg->dma_length = sg->length;
2355 			pteval = (sg_phys(sg) - pgoff) | attr;
2356 			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2357 		}
2358 
2359 		if (!pte) {
2360 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2361 
2362 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2363 			if (!pte)
2364 				return -ENOMEM;
2365 			/* It is large page*/
2366 			if (largepage_lvl > 1) {
2367 				unsigned long nr_superpages, end_pfn;
2368 
2369 				pteval |= DMA_PTE_LARGE_PAGE;
2370 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2371 
2372 				nr_superpages = sg_res / lvl_pages;
2373 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2374 
2375 				/*
2376 				 * Ensure that old small page tables are
2377 				 * removed to make room for superpage(s).
2378 				 * We're adding new large pages, so make sure
2379 				 * we don't remove their parent tables.
2380 				 */
2381 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2382 						       largepage_lvl + 1);
2383 			} else {
2384 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2385 			}
2386 
2387 		}
2388 		/* We don't need lock here, nobody else
2389 		 * touches the iova range
2390 		 */
2391 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2392 		if (tmp) {
2393 			static int dumps = 5;
2394 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2395 				iov_pfn, tmp, (unsigned long long)pteval);
2396 			if (dumps) {
2397 				dumps--;
2398 				debug_dma_dump_mappings(NULL);
2399 			}
2400 			WARN_ON(1);
2401 		}
2402 
2403 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2404 
2405 		BUG_ON(nr_pages < lvl_pages);
2406 		BUG_ON(sg_res < lvl_pages);
2407 
2408 		nr_pages -= lvl_pages;
2409 		iov_pfn += lvl_pages;
2410 		phys_pfn += lvl_pages;
2411 		pteval += lvl_pages * VTD_PAGE_SIZE;
2412 		sg_res -= lvl_pages;
2413 
2414 		/* If the next PTE would be the first in a new page, then we
2415 		   need to flush the cache on the entries we've just written.
2416 		   And then we'll need to recalculate 'pte', so clear it and
2417 		   let it get set again in the if (!pte) block above.
2418 
2419 		   If we're done (!nr_pages) we need to flush the cache too.
2420 
2421 		   Also if we've been setting superpages, we may need to
2422 		   recalculate 'pte' and switch back to smaller pages for the
2423 		   end of the mapping, if the trailing size is not enough to
2424 		   use another superpage (i.e. sg_res < lvl_pages). */
2425 		pte++;
2426 		if (!nr_pages || first_pte_in_page(pte) ||
2427 		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2428 			domain_flush_cache(domain, first_pte,
2429 					   (void *)pte - (void *)first_pte);
2430 			pte = NULL;
2431 		}
2432 
2433 		if (!sg_res && nr_pages)
2434 			sg = sg_next(sg);
2435 	}
2436 	return 0;
2437 }
2438 
2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2440 			  struct scatterlist *sg, unsigned long phys_pfn,
2441 			  unsigned long nr_pages, int prot)
2442 {
2443 	int iommu_id, ret;
2444 	struct intel_iommu *iommu;
2445 
2446 	/* Do the real mapping first */
2447 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2448 	if (ret)
2449 		return ret;
2450 
2451 	for_each_domain_iommu(iommu_id, domain) {
2452 		iommu = g_iommus[iommu_id];
2453 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2454 	}
2455 
2456 	return 0;
2457 }
2458 
2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2460 				    struct scatterlist *sg, unsigned long nr_pages,
2461 				    int prot)
2462 {
2463 	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2464 }
2465 
2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2467 				     unsigned long phys_pfn, unsigned long nr_pages,
2468 				     int prot)
2469 {
2470 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2471 }
2472 
2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2474 {
2475 	unsigned long flags;
2476 	struct context_entry *context;
2477 	u16 did_old;
2478 
2479 	if (!iommu)
2480 		return;
2481 
2482 	spin_lock_irqsave(&iommu->lock, flags);
2483 	context = iommu_context_addr(iommu, bus, devfn, 0);
2484 	if (!context) {
2485 		spin_unlock_irqrestore(&iommu->lock, flags);
2486 		return;
2487 	}
2488 	did_old = context_domain_id(context);
2489 	context_clear_entry(context);
2490 	__iommu_flush_cache(iommu, context, sizeof(*context));
2491 	spin_unlock_irqrestore(&iommu->lock, flags);
2492 	iommu->flush.flush_context(iommu,
2493 				   did_old,
2494 				   (((u16)bus) << 8) | devfn,
2495 				   DMA_CCMD_MASK_NOBIT,
2496 				   DMA_CCMD_DEVICE_INVL);
2497 	iommu->flush.flush_iotlb(iommu,
2498 				 did_old,
2499 				 0,
2500 				 0,
2501 				 DMA_TLB_DSI_FLUSH);
2502 }
2503 
2504 static inline void unlink_domain_info(struct device_domain_info *info)
2505 {
2506 	assert_spin_locked(&device_domain_lock);
2507 	list_del(&info->link);
2508 	list_del(&info->global);
2509 	if (info->dev)
2510 		dev_iommu_priv_set(info->dev, NULL);
2511 }
2512 
2513 static void domain_remove_dev_info(struct dmar_domain *domain)
2514 {
2515 	struct device_domain_info *info, *tmp;
2516 	unsigned long flags;
2517 
2518 	spin_lock_irqsave(&device_domain_lock, flags);
2519 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2520 		__dmar_remove_one_dev_info(info);
2521 	spin_unlock_irqrestore(&device_domain_lock, flags);
2522 }
2523 
2524 struct dmar_domain *find_domain(struct device *dev)
2525 {
2526 	struct device_domain_info *info;
2527 
2528 	if (unlikely(attach_deferred(dev)))
2529 		return NULL;
2530 
2531 	/* No lock here, assumes no domain exit in normal case */
2532 	info = get_domain_info(dev);
2533 	if (likely(info))
2534 		return info->domain;
2535 
2536 	return NULL;
2537 }
2538 
2539 static void do_deferred_attach(struct device *dev)
2540 {
2541 	struct iommu_domain *domain;
2542 
2543 	dev_iommu_priv_set(dev, NULL);
2544 	domain = iommu_get_domain_for_dev(dev);
2545 	if (domain)
2546 		intel_iommu_attach_device(domain, dev);
2547 }
2548 
2549 static inline struct device_domain_info *
2550 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2551 {
2552 	struct device_domain_info *info;
2553 
2554 	list_for_each_entry(info, &device_domain_list, global)
2555 		if (info->segment == segment && info->bus == bus &&
2556 		    info->devfn == devfn)
2557 			return info;
2558 
2559 	return NULL;
2560 }
2561 
2562 static int domain_setup_first_level(struct intel_iommu *iommu,
2563 				    struct dmar_domain *domain,
2564 				    struct device *dev,
2565 				    u32 pasid)
2566 {
2567 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2568 	struct dma_pte *pgd = domain->pgd;
2569 	int agaw, level;
2570 
2571 	/*
2572 	 * Skip top levels of page tables for iommu which has
2573 	 * less agaw than default. Unnecessary for PT mode.
2574 	 */
2575 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2576 		pgd = phys_to_virt(dma_pte_addr(pgd));
2577 		if (!dma_pte_present(pgd))
2578 			return -ENOMEM;
2579 	}
2580 
2581 	level = agaw_to_level(agaw);
2582 	if (level != 4 && level != 5)
2583 		return -EINVAL;
2584 
2585 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2586 
2587 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2588 					     domain->iommu_did[iommu->seq_id],
2589 					     flags);
2590 }
2591 
2592 static bool dev_is_real_dma_subdevice(struct device *dev)
2593 {
2594 	return dev && dev_is_pci(dev) &&
2595 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2596 }
2597 
2598 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2599 						    int bus, int devfn,
2600 						    struct device *dev,
2601 						    struct dmar_domain *domain)
2602 {
2603 	struct dmar_domain *found = NULL;
2604 	struct device_domain_info *info;
2605 	unsigned long flags;
2606 	int ret;
2607 
2608 	info = alloc_devinfo_mem();
2609 	if (!info)
2610 		return NULL;
2611 
2612 	if (!dev_is_real_dma_subdevice(dev)) {
2613 		info->bus = bus;
2614 		info->devfn = devfn;
2615 		info->segment = iommu->segment;
2616 	} else {
2617 		struct pci_dev *pdev = to_pci_dev(dev);
2618 
2619 		info->bus = pdev->bus->number;
2620 		info->devfn = pdev->devfn;
2621 		info->segment = pci_domain_nr(pdev->bus);
2622 	}
2623 
2624 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2625 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2626 	info->ats_qdep = 0;
2627 	info->dev = dev;
2628 	info->domain = domain;
2629 	info->iommu = iommu;
2630 	info->pasid_table = NULL;
2631 	info->auxd_enabled = 0;
2632 	INIT_LIST_HEAD(&info->auxiliary_domains);
2633 
2634 	if (dev && dev_is_pci(dev)) {
2635 		struct pci_dev *pdev = to_pci_dev(info->dev);
2636 
2637 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2638 		    pci_ats_supported(pdev) &&
2639 		    dmar_find_matched_atsr_unit(pdev))
2640 			info->ats_supported = 1;
2641 
2642 		if (sm_supported(iommu)) {
2643 			if (pasid_supported(iommu)) {
2644 				int features = pci_pasid_features(pdev);
2645 				if (features >= 0)
2646 					info->pasid_supported = features | 1;
2647 			}
2648 
2649 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2650 			    pci_pri_supported(pdev))
2651 				info->pri_supported = 1;
2652 		}
2653 	}
2654 
2655 	spin_lock_irqsave(&device_domain_lock, flags);
2656 	if (dev)
2657 		found = find_domain(dev);
2658 
2659 	if (!found) {
2660 		struct device_domain_info *info2;
2661 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2662 						       info->devfn);
2663 		if (info2) {
2664 			found      = info2->domain;
2665 			info2->dev = dev;
2666 		}
2667 	}
2668 
2669 	if (found) {
2670 		spin_unlock_irqrestore(&device_domain_lock, flags);
2671 		free_devinfo_mem(info);
2672 		/* Caller must free the original domain */
2673 		return found;
2674 	}
2675 
2676 	spin_lock(&iommu->lock);
2677 	ret = domain_attach_iommu(domain, iommu);
2678 	spin_unlock(&iommu->lock);
2679 
2680 	if (ret) {
2681 		spin_unlock_irqrestore(&device_domain_lock, flags);
2682 		free_devinfo_mem(info);
2683 		return NULL;
2684 	}
2685 
2686 	list_add(&info->link, &domain->devices);
2687 	list_add(&info->global, &device_domain_list);
2688 	if (dev)
2689 		dev_iommu_priv_set(dev, info);
2690 	spin_unlock_irqrestore(&device_domain_lock, flags);
2691 
2692 	/* PASID table is mandatory for a PCI device in scalable mode. */
2693 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2694 		ret = intel_pasid_alloc_table(dev);
2695 		if (ret) {
2696 			dev_err(dev, "PASID table allocation failed\n");
2697 			dmar_remove_one_dev_info(dev);
2698 			return NULL;
2699 		}
2700 
2701 		/* Setup the PASID entry for requests without PASID: */
2702 		spin_lock_irqsave(&iommu->lock, flags);
2703 		if (hw_pass_through && domain_type_is_si(domain))
2704 			ret = intel_pasid_setup_pass_through(iommu, domain,
2705 					dev, PASID_RID2PASID);
2706 		else if (domain_use_first_level(domain))
2707 			ret = domain_setup_first_level(iommu, domain, dev,
2708 					PASID_RID2PASID);
2709 		else
2710 			ret = intel_pasid_setup_second_level(iommu, domain,
2711 					dev, PASID_RID2PASID);
2712 		spin_unlock_irqrestore(&iommu->lock, flags);
2713 		if (ret) {
2714 			dev_err(dev, "Setup RID2PASID failed\n");
2715 			dmar_remove_one_dev_info(dev);
2716 			return NULL;
2717 		}
2718 	}
2719 
2720 	if (dev && domain_context_mapping(domain, dev)) {
2721 		dev_err(dev, "Domain context map failed\n");
2722 		dmar_remove_one_dev_info(dev);
2723 		return NULL;
2724 	}
2725 
2726 	return domain;
2727 }
2728 
2729 static int iommu_domain_identity_map(struct dmar_domain *domain,
2730 				     unsigned long first_vpfn,
2731 				     unsigned long last_vpfn)
2732 {
2733 	/*
2734 	 * RMRR range might have overlap with physical memory range,
2735 	 * clear it first
2736 	 */
2737 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2738 
2739 	return __domain_mapping(domain, first_vpfn, NULL,
2740 				first_vpfn, last_vpfn - first_vpfn + 1,
2741 				DMA_PTE_READ|DMA_PTE_WRITE);
2742 }
2743 
2744 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2745 
2746 static int __init si_domain_init(int hw)
2747 {
2748 	struct dmar_rmrr_unit *rmrr;
2749 	struct device *dev;
2750 	int i, nid, ret;
2751 
2752 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2753 	if (!si_domain)
2754 		return -EFAULT;
2755 
2756 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2757 		domain_exit(si_domain);
2758 		return -EFAULT;
2759 	}
2760 
2761 	if (hw)
2762 		return 0;
2763 
2764 	for_each_online_node(nid) {
2765 		unsigned long start_pfn, end_pfn;
2766 		int i;
2767 
2768 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2769 			ret = iommu_domain_identity_map(si_domain,
2770 					mm_to_dma_pfn(start_pfn),
2771 					mm_to_dma_pfn(end_pfn));
2772 			if (ret)
2773 				return ret;
2774 		}
2775 	}
2776 
2777 	/*
2778 	 * Identity map the RMRRs so that devices with RMRRs could also use
2779 	 * the si_domain.
2780 	 */
2781 	for_each_rmrr_units(rmrr) {
2782 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2783 					  i, dev) {
2784 			unsigned long long start = rmrr->base_address;
2785 			unsigned long long end = rmrr->end_address;
2786 
2787 			if (WARN_ON(end < start ||
2788 				    end >> agaw_to_width(si_domain->agaw)))
2789 				continue;
2790 
2791 			ret = iommu_domain_identity_map(si_domain,
2792 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2793 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2794 			if (ret)
2795 				return ret;
2796 		}
2797 	}
2798 
2799 	return 0;
2800 }
2801 
2802 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2803 {
2804 	struct dmar_domain *ndomain;
2805 	struct intel_iommu *iommu;
2806 	u8 bus, devfn;
2807 
2808 	iommu = device_to_iommu(dev, &bus, &devfn);
2809 	if (!iommu)
2810 		return -ENODEV;
2811 
2812 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2813 	if (ndomain != domain)
2814 		return -EBUSY;
2815 
2816 	return 0;
2817 }
2818 
2819 static bool device_has_rmrr(struct device *dev)
2820 {
2821 	struct dmar_rmrr_unit *rmrr;
2822 	struct device *tmp;
2823 	int i;
2824 
2825 	rcu_read_lock();
2826 	for_each_rmrr_units(rmrr) {
2827 		/*
2828 		 * Return TRUE if this RMRR contains the device that
2829 		 * is passed in.
2830 		 */
2831 		for_each_active_dev_scope(rmrr->devices,
2832 					  rmrr->devices_cnt, i, tmp)
2833 			if (tmp == dev ||
2834 			    is_downstream_to_pci_bridge(dev, tmp)) {
2835 				rcu_read_unlock();
2836 				return true;
2837 			}
2838 	}
2839 	rcu_read_unlock();
2840 	return false;
2841 }
2842 
2843 /**
2844  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2845  * is relaxable (ie. is allowed to be not enforced under some conditions)
2846  * @dev: device handle
2847  *
2848  * We assume that PCI USB devices with RMRRs have them largely
2849  * for historical reasons and that the RMRR space is not actively used post
2850  * boot.  This exclusion may change if vendors begin to abuse it.
2851  *
2852  * The same exception is made for graphics devices, with the requirement that
2853  * any use of the RMRR regions will be torn down before assigning the device
2854  * to a guest.
2855  *
2856  * Return: true if the RMRR is relaxable, false otherwise
2857  */
2858 static bool device_rmrr_is_relaxable(struct device *dev)
2859 {
2860 	struct pci_dev *pdev;
2861 
2862 	if (!dev_is_pci(dev))
2863 		return false;
2864 
2865 	pdev = to_pci_dev(dev);
2866 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2867 		return true;
2868 	else
2869 		return false;
2870 }
2871 
2872 /*
2873  * There are a couple cases where we need to restrict the functionality of
2874  * devices associated with RMRRs.  The first is when evaluating a device for
2875  * identity mapping because problems exist when devices are moved in and out
2876  * of domains and their respective RMRR information is lost.  This means that
2877  * a device with associated RMRRs will never be in a "passthrough" domain.
2878  * The second is use of the device through the IOMMU API.  This interface
2879  * expects to have full control of the IOVA space for the device.  We cannot
2880  * satisfy both the requirement that RMRR access is maintained and have an
2881  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2882  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2883  * We therefore prevent devices associated with an RMRR from participating in
2884  * the IOMMU API, which eliminates them from device assignment.
2885  *
2886  * In both cases, devices which have relaxable RMRRs are not concerned by this
2887  * restriction. See device_rmrr_is_relaxable comment.
2888  */
2889 static bool device_is_rmrr_locked(struct device *dev)
2890 {
2891 	if (!device_has_rmrr(dev))
2892 		return false;
2893 
2894 	if (device_rmrr_is_relaxable(dev))
2895 		return false;
2896 
2897 	return true;
2898 }
2899 
2900 /*
2901  * Return the required default domain type for a specific device.
2902  *
2903  * @dev: the device in query
2904  * @startup: true if this is during early boot
2905  *
2906  * Returns:
2907  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2908  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2909  *  - 0: both identity and dynamic domains work for this device
2910  */
2911 static int device_def_domain_type(struct device *dev)
2912 {
2913 	if (dev_is_pci(dev)) {
2914 		struct pci_dev *pdev = to_pci_dev(dev);
2915 
2916 		/*
2917 		 * Prevent any device marked as untrusted from getting
2918 		 * placed into the statically identity mapping domain.
2919 		 */
2920 		if (pdev->untrusted)
2921 			return IOMMU_DOMAIN_DMA;
2922 
2923 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2924 			return IOMMU_DOMAIN_IDENTITY;
2925 
2926 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2927 			return IOMMU_DOMAIN_IDENTITY;
2928 	}
2929 
2930 	return 0;
2931 }
2932 
2933 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2934 {
2935 	/*
2936 	 * Start from the sane iommu hardware state.
2937 	 * If the queued invalidation is already initialized by us
2938 	 * (for example, while enabling interrupt-remapping) then
2939 	 * we got the things already rolling from a sane state.
2940 	 */
2941 	if (!iommu->qi) {
2942 		/*
2943 		 * Clear any previous faults.
2944 		 */
2945 		dmar_fault(-1, iommu);
2946 		/*
2947 		 * Disable queued invalidation if supported and already enabled
2948 		 * before OS handover.
2949 		 */
2950 		dmar_disable_qi(iommu);
2951 	}
2952 
2953 	if (dmar_enable_qi(iommu)) {
2954 		/*
2955 		 * Queued Invalidate not enabled, use Register Based Invalidate
2956 		 */
2957 		iommu->flush.flush_context = __iommu_flush_context;
2958 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2959 		pr_info("%s: Using Register based invalidation\n",
2960 			iommu->name);
2961 	} else {
2962 		iommu->flush.flush_context = qi_flush_context;
2963 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2964 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2965 	}
2966 }
2967 
2968 static int copy_context_table(struct intel_iommu *iommu,
2969 			      struct root_entry *old_re,
2970 			      struct context_entry **tbl,
2971 			      int bus, bool ext)
2972 {
2973 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2974 	struct context_entry *new_ce = NULL, ce;
2975 	struct context_entry *old_ce = NULL;
2976 	struct root_entry re;
2977 	phys_addr_t old_ce_phys;
2978 
2979 	tbl_idx = ext ? bus * 2 : bus;
2980 	memcpy(&re, old_re, sizeof(re));
2981 
2982 	for (devfn = 0; devfn < 256; devfn++) {
2983 		/* First calculate the correct index */
2984 		idx = (ext ? devfn * 2 : devfn) % 256;
2985 
2986 		if (idx == 0) {
2987 			/* First save what we may have and clean up */
2988 			if (new_ce) {
2989 				tbl[tbl_idx] = new_ce;
2990 				__iommu_flush_cache(iommu, new_ce,
2991 						    VTD_PAGE_SIZE);
2992 				pos = 1;
2993 			}
2994 
2995 			if (old_ce)
2996 				memunmap(old_ce);
2997 
2998 			ret = 0;
2999 			if (devfn < 0x80)
3000 				old_ce_phys = root_entry_lctp(&re);
3001 			else
3002 				old_ce_phys = root_entry_uctp(&re);
3003 
3004 			if (!old_ce_phys) {
3005 				if (ext && devfn == 0) {
3006 					/* No LCTP, try UCTP */
3007 					devfn = 0x7f;
3008 					continue;
3009 				} else {
3010 					goto out;
3011 				}
3012 			}
3013 
3014 			ret = -ENOMEM;
3015 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3016 					MEMREMAP_WB);
3017 			if (!old_ce)
3018 				goto out;
3019 
3020 			new_ce = alloc_pgtable_page(iommu->node);
3021 			if (!new_ce)
3022 				goto out_unmap;
3023 
3024 			ret = 0;
3025 		}
3026 
3027 		/* Now copy the context entry */
3028 		memcpy(&ce, old_ce + idx, sizeof(ce));
3029 
3030 		if (!__context_present(&ce))
3031 			continue;
3032 
3033 		did = context_domain_id(&ce);
3034 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3035 			set_bit(did, iommu->domain_ids);
3036 
3037 		/*
3038 		 * We need a marker for copied context entries. This
3039 		 * marker needs to work for the old format as well as
3040 		 * for extended context entries.
3041 		 *
3042 		 * Bit 67 of the context entry is used. In the old
3043 		 * format this bit is available to software, in the
3044 		 * extended format it is the PGE bit, but PGE is ignored
3045 		 * by HW if PASIDs are disabled (and thus still
3046 		 * available).
3047 		 *
3048 		 * So disable PASIDs first and then mark the entry
3049 		 * copied. This means that we don't copy PASID
3050 		 * translations from the old kernel, but this is fine as
3051 		 * faults there are not fatal.
3052 		 */
3053 		context_clear_pasid_enable(&ce);
3054 		context_set_copied(&ce);
3055 
3056 		new_ce[idx] = ce;
3057 	}
3058 
3059 	tbl[tbl_idx + pos] = new_ce;
3060 
3061 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3062 
3063 out_unmap:
3064 	memunmap(old_ce);
3065 
3066 out:
3067 	return ret;
3068 }
3069 
3070 static int copy_translation_tables(struct intel_iommu *iommu)
3071 {
3072 	struct context_entry **ctxt_tbls;
3073 	struct root_entry *old_rt;
3074 	phys_addr_t old_rt_phys;
3075 	int ctxt_table_entries;
3076 	unsigned long flags;
3077 	u64 rtaddr_reg;
3078 	int bus, ret;
3079 	bool new_ext, ext;
3080 
3081 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3082 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3083 	new_ext    = !!ecap_ecs(iommu->ecap);
3084 
3085 	/*
3086 	 * The RTT bit can only be changed when translation is disabled,
3087 	 * but disabling translation means to open a window for data
3088 	 * corruption. So bail out and don't copy anything if we would
3089 	 * have to change the bit.
3090 	 */
3091 	if (new_ext != ext)
3092 		return -EINVAL;
3093 
3094 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3095 	if (!old_rt_phys)
3096 		return -EINVAL;
3097 
3098 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3099 	if (!old_rt)
3100 		return -ENOMEM;
3101 
3102 	/* This is too big for the stack - allocate it from slab */
3103 	ctxt_table_entries = ext ? 512 : 256;
3104 	ret = -ENOMEM;
3105 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3106 	if (!ctxt_tbls)
3107 		goto out_unmap;
3108 
3109 	for (bus = 0; bus < 256; bus++) {
3110 		ret = copy_context_table(iommu, &old_rt[bus],
3111 					 ctxt_tbls, bus, ext);
3112 		if (ret) {
3113 			pr_err("%s: Failed to copy context table for bus %d\n",
3114 				iommu->name, bus);
3115 			continue;
3116 		}
3117 	}
3118 
3119 	spin_lock_irqsave(&iommu->lock, flags);
3120 
3121 	/* Context tables are copied, now write them to the root_entry table */
3122 	for (bus = 0; bus < 256; bus++) {
3123 		int idx = ext ? bus * 2 : bus;
3124 		u64 val;
3125 
3126 		if (ctxt_tbls[idx]) {
3127 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3128 			iommu->root_entry[bus].lo = val;
3129 		}
3130 
3131 		if (!ext || !ctxt_tbls[idx + 1])
3132 			continue;
3133 
3134 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3135 		iommu->root_entry[bus].hi = val;
3136 	}
3137 
3138 	spin_unlock_irqrestore(&iommu->lock, flags);
3139 
3140 	kfree(ctxt_tbls);
3141 
3142 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3143 
3144 	ret = 0;
3145 
3146 out_unmap:
3147 	memunmap(old_rt);
3148 
3149 	return ret;
3150 }
3151 
3152 #ifdef CONFIG_INTEL_IOMMU_SVM
3153 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3154 {
3155 	struct intel_iommu *iommu = data;
3156 	ioasid_t ioasid;
3157 
3158 	if (!iommu)
3159 		return INVALID_IOASID;
3160 	/*
3161 	 * VT-d virtual command interface always uses the full 20 bit
3162 	 * PASID range. Host can partition guest PASID range based on
3163 	 * policies but it is out of guest's control.
3164 	 */
3165 	if (min < PASID_MIN || max > intel_pasid_max_id)
3166 		return INVALID_IOASID;
3167 
3168 	if (vcmd_alloc_pasid(iommu, &ioasid))
3169 		return INVALID_IOASID;
3170 
3171 	return ioasid;
3172 }
3173 
3174 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3175 {
3176 	struct intel_iommu *iommu = data;
3177 
3178 	if (!iommu)
3179 		return;
3180 	/*
3181 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3182 	 * We can only free the PASID when all the devices are unbound.
3183 	 */
3184 	if (ioasid_find(NULL, ioasid, NULL)) {
3185 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3186 		return;
3187 	}
3188 	vcmd_free_pasid(iommu, ioasid);
3189 }
3190 
3191 static void register_pasid_allocator(struct intel_iommu *iommu)
3192 {
3193 	/*
3194 	 * If we are running in the host, no need for custom allocator
3195 	 * in that PASIDs are allocated from the host system-wide.
3196 	 */
3197 	if (!cap_caching_mode(iommu->cap))
3198 		return;
3199 
3200 	if (!sm_supported(iommu)) {
3201 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3202 		return;
3203 	}
3204 
3205 	/*
3206 	 * Register a custom PASID allocator if we are running in a guest,
3207 	 * guest PASID must be obtained via virtual command interface.
3208 	 * There can be multiple vIOMMUs in each guest but only one allocator
3209 	 * is active. All vIOMMU allocators will eventually be calling the same
3210 	 * host allocator.
3211 	 */
3212 	if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3213 		return;
3214 
3215 	pr_info("Register custom PASID allocator\n");
3216 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3217 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3218 	iommu->pasid_allocator.pdata = (void *)iommu;
3219 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3220 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3221 		/*
3222 		 * Disable scalable mode on this IOMMU if there
3223 		 * is no custom allocator. Mixing SM capable vIOMMU
3224 		 * and non-SM vIOMMU are not supported.
3225 		 */
3226 		intel_iommu_sm = 0;
3227 	}
3228 }
3229 #endif
3230 
3231 static int __init init_dmars(void)
3232 {
3233 	struct dmar_drhd_unit *drhd;
3234 	struct intel_iommu *iommu;
3235 	int ret;
3236 
3237 	/*
3238 	 * for each drhd
3239 	 *    allocate root
3240 	 *    initialize and program root entry to not present
3241 	 * endfor
3242 	 */
3243 	for_each_drhd_unit(drhd) {
3244 		/*
3245 		 * lock not needed as this is only incremented in the single
3246 		 * threaded kernel __init code path all other access are read
3247 		 * only
3248 		 */
3249 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3250 			g_num_of_iommus++;
3251 			continue;
3252 		}
3253 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3254 	}
3255 
3256 	/* Preallocate enough resources for IOMMU hot-addition */
3257 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3258 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3259 
3260 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3261 			GFP_KERNEL);
3262 	if (!g_iommus) {
3263 		pr_err("Allocating global iommu array failed\n");
3264 		ret = -ENOMEM;
3265 		goto error;
3266 	}
3267 
3268 	for_each_iommu(iommu, drhd) {
3269 		if (drhd->ignored) {
3270 			iommu_disable_translation(iommu);
3271 			continue;
3272 		}
3273 
3274 		/*
3275 		 * Find the max pasid size of all IOMMU's in the system.
3276 		 * We need to ensure the system pasid table is no bigger
3277 		 * than the smallest supported.
3278 		 */
3279 		if (pasid_supported(iommu)) {
3280 			u32 temp = 2 << ecap_pss(iommu->ecap);
3281 
3282 			intel_pasid_max_id = min_t(u32, temp,
3283 						   intel_pasid_max_id);
3284 		}
3285 
3286 		g_iommus[iommu->seq_id] = iommu;
3287 
3288 		intel_iommu_init_qi(iommu);
3289 
3290 		ret = iommu_init_domains(iommu);
3291 		if (ret)
3292 			goto free_iommu;
3293 
3294 		init_translation_status(iommu);
3295 
3296 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3297 			iommu_disable_translation(iommu);
3298 			clear_translation_pre_enabled(iommu);
3299 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3300 				iommu->name);
3301 		}
3302 
3303 		/*
3304 		 * TBD:
3305 		 * we could share the same root & context tables
3306 		 * among all IOMMU's. Need to Split it later.
3307 		 */
3308 		ret = iommu_alloc_root_entry(iommu);
3309 		if (ret)
3310 			goto free_iommu;
3311 
3312 		if (translation_pre_enabled(iommu)) {
3313 			pr_info("Translation already enabled - trying to copy translation structures\n");
3314 
3315 			ret = copy_translation_tables(iommu);
3316 			if (ret) {
3317 				/*
3318 				 * We found the IOMMU with translation
3319 				 * enabled - but failed to copy over the
3320 				 * old root-entry table. Try to proceed
3321 				 * by disabling translation now and
3322 				 * allocating a clean root-entry table.
3323 				 * This might cause DMAR faults, but
3324 				 * probably the dump will still succeed.
3325 				 */
3326 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3327 				       iommu->name);
3328 				iommu_disable_translation(iommu);
3329 				clear_translation_pre_enabled(iommu);
3330 			} else {
3331 				pr_info("Copied translation tables from previous kernel for %s\n",
3332 					iommu->name);
3333 			}
3334 		}
3335 
3336 		if (!ecap_pass_through(iommu->ecap))
3337 			hw_pass_through = 0;
3338 		intel_svm_check(iommu);
3339 	}
3340 
3341 	/*
3342 	 * Now that qi is enabled on all iommus, set the root entry and flush
3343 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3344 	 * flush_context function will loop forever and the boot hangs.
3345 	 */
3346 	for_each_active_iommu(iommu, drhd) {
3347 		iommu_flush_write_buffer(iommu);
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349 		register_pasid_allocator(iommu);
3350 #endif
3351 		iommu_set_root_entry(iommu);
3352 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3353 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3354 	}
3355 
3356 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3357 	dmar_map_gfx = 0;
3358 #endif
3359 
3360 	if (!dmar_map_gfx)
3361 		iommu_identity_mapping |= IDENTMAP_GFX;
3362 
3363 	check_tylersburg_isoch();
3364 
3365 	ret = si_domain_init(hw_pass_through);
3366 	if (ret)
3367 		goto free_iommu;
3368 
3369 	/*
3370 	 * for each drhd
3371 	 *   enable fault log
3372 	 *   global invalidate context cache
3373 	 *   global invalidate iotlb
3374 	 *   enable translation
3375 	 */
3376 	for_each_iommu(iommu, drhd) {
3377 		if (drhd->ignored) {
3378 			/*
3379 			 * we always have to disable PMRs or DMA may fail on
3380 			 * this device
3381 			 */
3382 			if (force_on)
3383 				iommu_disable_protect_mem_regions(iommu);
3384 			continue;
3385 		}
3386 
3387 		iommu_flush_write_buffer(iommu);
3388 
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3391 			/*
3392 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3393 			 * could cause possible lock race condition.
3394 			 */
3395 			up_write(&dmar_global_lock);
3396 			ret = intel_svm_enable_prq(iommu);
3397 			down_write(&dmar_global_lock);
3398 			if (ret)
3399 				goto free_iommu;
3400 		}
3401 #endif
3402 		ret = dmar_set_interrupt(iommu);
3403 		if (ret)
3404 			goto free_iommu;
3405 	}
3406 
3407 	return 0;
3408 
3409 free_iommu:
3410 	for_each_active_iommu(iommu, drhd) {
3411 		disable_dmar_iommu(iommu);
3412 		free_dmar_iommu(iommu);
3413 	}
3414 
3415 	kfree(g_iommus);
3416 
3417 error:
3418 	return ret;
3419 }
3420 
3421 /* This takes a number of _MM_ pages, not VTD pages */
3422 static unsigned long intel_alloc_iova(struct device *dev,
3423 				     struct dmar_domain *domain,
3424 				     unsigned long nrpages, uint64_t dma_mask)
3425 {
3426 	unsigned long iova_pfn;
3427 
3428 	/*
3429 	 * Restrict dma_mask to the width that the iommu can handle.
3430 	 * First-level translation restricts the input-address to a
3431 	 * canonical address (i.e., address bits 63:N have the same
3432 	 * value as address bit [N-1], where N is 48-bits with 4-level
3433 	 * paging and 57-bits with 5-level paging). Hence, skip bit
3434 	 * [N-1].
3435 	 */
3436 	if (domain_use_first_level(domain))
3437 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3438 				 dma_mask);
3439 	else
3440 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3441 				 dma_mask);
3442 
3443 	/* Ensure we reserve the whole size-aligned region */
3444 	nrpages = __roundup_pow_of_two(nrpages);
3445 
3446 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3447 		/*
3448 		 * First try to allocate an io virtual address in
3449 		 * DMA_BIT_MASK(32) and if that fails then try allocating
3450 		 * from higher range
3451 		 */
3452 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3453 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3454 		if (iova_pfn)
3455 			return iova_pfn;
3456 	}
3457 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3458 				   IOVA_PFN(dma_mask), true);
3459 	if (unlikely(!iova_pfn)) {
3460 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3461 			     nrpages);
3462 		return 0;
3463 	}
3464 
3465 	return iova_pfn;
3466 }
3467 
3468 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3469 				     size_t size, int dir, u64 dma_mask)
3470 {
3471 	struct dmar_domain *domain;
3472 	phys_addr_t start_paddr;
3473 	unsigned long iova_pfn;
3474 	int prot = 0;
3475 	int ret;
3476 	struct intel_iommu *iommu;
3477 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3478 
3479 	BUG_ON(dir == DMA_NONE);
3480 
3481 	if (unlikely(attach_deferred(dev)))
3482 		do_deferred_attach(dev);
3483 
3484 	domain = find_domain(dev);
3485 	if (!domain)
3486 		return DMA_MAPPING_ERROR;
3487 
3488 	iommu = domain_get_iommu(domain);
3489 	size = aligned_nrpages(paddr, size);
3490 
3491 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3492 	if (!iova_pfn)
3493 		goto error;
3494 
3495 	/*
3496 	 * Check if DMAR supports zero-length reads on write only
3497 	 * mappings..
3498 	 */
3499 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3500 			!cap_zlr(iommu->cap))
3501 		prot |= DMA_PTE_READ;
3502 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3503 		prot |= DMA_PTE_WRITE;
3504 	/*
3505 	 * paddr - (paddr + size) might be partial page, we should map the whole
3506 	 * page.  Note: if two part of one page are separately mapped, we
3507 	 * might have two guest_addr mapping to the same host paddr, but this
3508 	 * is not a big problem
3509 	 */
3510 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3511 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3512 	if (ret)
3513 		goto error;
3514 
3515 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3516 	start_paddr += paddr & ~PAGE_MASK;
3517 
3518 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3519 
3520 	return start_paddr;
3521 
3522 error:
3523 	if (iova_pfn)
3524 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3525 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3526 		size, (unsigned long long)paddr, dir);
3527 	return DMA_MAPPING_ERROR;
3528 }
3529 
3530 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3531 				 unsigned long offset, size_t size,
3532 				 enum dma_data_direction dir,
3533 				 unsigned long attrs)
3534 {
3535 	return __intel_map_single(dev, page_to_phys(page) + offset,
3536 				  size, dir, *dev->dma_mask);
3537 }
3538 
3539 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3540 				     size_t size, enum dma_data_direction dir,
3541 				     unsigned long attrs)
3542 {
3543 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3544 }
3545 
3546 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3547 {
3548 	struct dmar_domain *domain;
3549 	unsigned long start_pfn, last_pfn;
3550 	unsigned long nrpages;
3551 	unsigned long iova_pfn;
3552 	struct intel_iommu *iommu;
3553 	struct page *freelist;
3554 	struct pci_dev *pdev = NULL;
3555 
3556 	domain = find_domain(dev);
3557 	BUG_ON(!domain);
3558 
3559 	iommu = domain_get_iommu(domain);
3560 
3561 	iova_pfn = IOVA_PFN(dev_addr);
3562 
3563 	nrpages = aligned_nrpages(dev_addr, size);
3564 	start_pfn = mm_to_dma_pfn(iova_pfn);
3565 	last_pfn = start_pfn + nrpages - 1;
3566 
3567 	if (dev_is_pci(dev))
3568 		pdev = to_pci_dev(dev);
3569 
3570 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3571 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3572 			!has_iova_flush_queue(&domain->iovad)) {
3573 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3574 				      nrpages, !freelist, 0);
3575 		/* free iova */
3576 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3577 		dma_free_pagelist(freelist);
3578 	} else {
3579 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3580 			   (unsigned long)freelist);
3581 		/*
3582 		 * queue up the release of the unmap to save the 1/6th of the
3583 		 * cpu used up by the iotlb flush operation...
3584 		 */
3585 	}
3586 
3587 	trace_unmap_single(dev, dev_addr, size);
3588 }
3589 
3590 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3591 			     size_t size, enum dma_data_direction dir,
3592 			     unsigned long attrs)
3593 {
3594 	intel_unmap(dev, dev_addr, size);
3595 }
3596 
3597 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3598 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3599 {
3600 	intel_unmap(dev, dev_addr, size);
3601 }
3602 
3603 static void *intel_alloc_coherent(struct device *dev, size_t size,
3604 				  dma_addr_t *dma_handle, gfp_t flags,
3605 				  unsigned long attrs)
3606 {
3607 	struct page *page = NULL;
3608 	int order;
3609 
3610 	if (unlikely(attach_deferred(dev)))
3611 		do_deferred_attach(dev);
3612 
3613 	size = PAGE_ALIGN(size);
3614 	order = get_order(size);
3615 
3616 	if (gfpflags_allow_blocking(flags)) {
3617 		unsigned int count = size >> PAGE_SHIFT;
3618 
3619 		page = dma_alloc_from_contiguous(dev, count, order,
3620 						 flags & __GFP_NOWARN);
3621 	}
3622 
3623 	if (!page)
3624 		page = alloc_pages(flags, order);
3625 	if (!page)
3626 		return NULL;
3627 	memset(page_address(page), 0, size);
3628 
3629 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3630 					 DMA_BIDIRECTIONAL,
3631 					 dev->coherent_dma_mask);
3632 	if (*dma_handle != DMA_MAPPING_ERROR)
3633 		return page_address(page);
3634 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3635 		__free_pages(page, order);
3636 
3637 	return NULL;
3638 }
3639 
3640 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3641 				dma_addr_t dma_handle, unsigned long attrs)
3642 {
3643 	int order;
3644 	struct page *page = virt_to_page(vaddr);
3645 
3646 	size = PAGE_ALIGN(size);
3647 	order = get_order(size);
3648 
3649 	intel_unmap(dev, dma_handle, size);
3650 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3651 		__free_pages(page, order);
3652 }
3653 
3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3655 			   int nelems, enum dma_data_direction dir,
3656 			   unsigned long attrs)
3657 {
3658 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3659 	unsigned long nrpages = 0;
3660 	struct scatterlist *sg;
3661 	int i;
3662 
3663 	for_each_sg(sglist, sg, nelems, i) {
3664 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3665 	}
3666 
3667 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3668 
3669 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3670 }
3671 
3672 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3673 			enum dma_data_direction dir, unsigned long attrs)
3674 {
3675 	int i;
3676 	struct dmar_domain *domain;
3677 	size_t size = 0;
3678 	int prot = 0;
3679 	unsigned long iova_pfn;
3680 	int ret;
3681 	struct scatterlist *sg;
3682 	unsigned long start_vpfn;
3683 	struct intel_iommu *iommu;
3684 
3685 	BUG_ON(dir == DMA_NONE);
3686 
3687 	if (unlikely(attach_deferred(dev)))
3688 		do_deferred_attach(dev);
3689 
3690 	domain = find_domain(dev);
3691 	if (!domain)
3692 		return 0;
3693 
3694 	iommu = domain_get_iommu(domain);
3695 
3696 	for_each_sg(sglist, sg, nelems, i)
3697 		size += aligned_nrpages(sg->offset, sg->length);
3698 
3699 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3700 				*dev->dma_mask);
3701 	if (!iova_pfn) {
3702 		sglist->dma_length = 0;
3703 		return 0;
3704 	}
3705 
3706 	/*
3707 	 * Check if DMAR supports zero-length reads on write only
3708 	 * mappings..
3709 	 */
3710 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3711 			!cap_zlr(iommu->cap))
3712 		prot |= DMA_PTE_READ;
3713 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714 		prot |= DMA_PTE_WRITE;
3715 
3716 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3717 
3718 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3719 	if (unlikely(ret)) {
3720 		dma_pte_free_pagetable(domain, start_vpfn,
3721 				       start_vpfn + size - 1,
3722 				       agaw_to_level(domain->agaw) + 1);
3723 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3724 		return 0;
3725 	}
3726 
3727 	for_each_sg(sglist, sg, nelems, i)
3728 		trace_map_sg(dev, i + 1, nelems, sg);
3729 
3730 	return nelems;
3731 }
3732 
3733 static u64 intel_get_required_mask(struct device *dev)
3734 {
3735 	return DMA_BIT_MASK(32);
3736 }
3737 
3738 static const struct dma_map_ops intel_dma_ops = {
3739 	.alloc = intel_alloc_coherent,
3740 	.free = intel_free_coherent,
3741 	.map_sg = intel_map_sg,
3742 	.unmap_sg = intel_unmap_sg,
3743 	.map_page = intel_map_page,
3744 	.unmap_page = intel_unmap_page,
3745 	.map_resource = intel_map_resource,
3746 	.unmap_resource = intel_unmap_resource,
3747 	.dma_supported = dma_direct_supported,
3748 	.mmap = dma_common_mmap,
3749 	.get_sgtable = dma_common_get_sgtable,
3750 	.alloc_pages = dma_common_alloc_pages,
3751 	.free_pages = dma_common_free_pages,
3752 	.get_required_mask = intel_get_required_mask,
3753 };
3754 
3755 static void
3756 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3757 		   enum dma_data_direction dir, enum dma_sync_target target)
3758 {
3759 	struct dmar_domain *domain;
3760 	phys_addr_t tlb_addr;
3761 
3762 	domain = find_domain(dev);
3763 	if (WARN_ON(!domain))
3764 		return;
3765 
3766 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3767 	if (is_swiotlb_buffer(tlb_addr))
3768 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3769 }
3770 
3771 static dma_addr_t
3772 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3773 		  enum dma_data_direction dir, unsigned long attrs,
3774 		  u64 dma_mask)
3775 {
3776 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3777 	struct dmar_domain *domain;
3778 	struct intel_iommu *iommu;
3779 	unsigned long iova_pfn;
3780 	unsigned long nrpages;
3781 	phys_addr_t tlb_addr;
3782 	int prot = 0;
3783 	int ret;
3784 
3785 	if (unlikely(attach_deferred(dev)))
3786 		do_deferred_attach(dev);
3787 
3788 	domain = find_domain(dev);
3789 
3790 	if (WARN_ON(dir == DMA_NONE || !domain))
3791 		return DMA_MAPPING_ERROR;
3792 
3793 	iommu = domain_get_iommu(domain);
3794 	if (WARN_ON(!iommu))
3795 		return DMA_MAPPING_ERROR;
3796 
3797 	nrpages = aligned_nrpages(0, size);
3798 	iova_pfn = intel_alloc_iova(dev, domain,
3799 				    dma_to_mm_pfn(nrpages), dma_mask);
3800 	if (!iova_pfn)
3801 		return DMA_MAPPING_ERROR;
3802 
3803 	/*
3804 	 * Check if DMAR supports zero-length reads on write only
3805 	 * mappings..
3806 	 */
3807 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3808 			!cap_zlr(iommu->cap))
3809 		prot |= DMA_PTE_READ;
3810 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3811 		prot |= DMA_PTE_WRITE;
3812 
3813 	/*
3814 	 * If both the physical buffer start address and size are
3815 	 * page aligned, we don't need to use a bounce page.
3816 	 */
3817 	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3818 		tlb_addr = swiotlb_tbl_map_single(dev,
3819 				phys_to_dma_unencrypted(dev, io_tlb_start),
3820 				paddr, size, aligned_size, dir, attrs);
3821 		if (tlb_addr == DMA_MAPPING_ERROR) {
3822 			goto swiotlb_error;
3823 		} else {
3824 			/* Cleanup the padding area. */
3825 			void *padding_start = phys_to_virt(tlb_addr);
3826 			size_t padding_size = aligned_size;
3827 
3828 			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3829 			    (dir == DMA_TO_DEVICE ||
3830 			     dir == DMA_BIDIRECTIONAL)) {
3831 				padding_start += size;
3832 				padding_size -= size;
3833 			}
3834 
3835 			memset(padding_start, 0, padding_size);
3836 		}
3837 	} else {
3838 		tlb_addr = paddr;
3839 	}
3840 
3841 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3842 				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3843 	if (ret)
3844 		goto mapping_error;
3845 
3846 	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3847 
3848 	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3849 
3850 mapping_error:
3851 	if (is_swiotlb_buffer(tlb_addr))
3852 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3853 					 aligned_size, dir, attrs);
3854 swiotlb_error:
3855 	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3856 	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3857 		size, (unsigned long long)paddr, dir);
3858 
3859 	return DMA_MAPPING_ERROR;
3860 }
3861 
3862 static void
3863 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3864 		    enum dma_data_direction dir, unsigned long attrs)
3865 {
3866 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3867 	struct dmar_domain *domain;
3868 	phys_addr_t tlb_addr;
3869 
3870 	domain = find_domain(dev);
3871 	if (WARN_ON(!domain))
3872 		return;
3873 
3874 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3875 	if (WARN_ON(!tlb_addr))
3876 		return;
3877 
3878 	intel_unmap(dev, dev_addr, size);
3879 	if (is_swiotlb_buffer(tlb_addr))
3880 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3881 					 aligned_size, dir, attrs);
3882 
3883 	trace_bounce_unmap_single(dev, dev_addr, size);
3884 }
3885 
3886 static dma_addr_t
3887 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3888 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3889 {
3890 	return bounce_map_single(dev, page_to_phys(page) + offset,
3891 				 size, dir, attrs, *dev->dma_mask);
3892 }
3893 
3894 static dma_addr_t
3895 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3896 		    enum dma_data_direction dir, unsigned long attrs)
3897 {
3898 	return bounce_map_single(dev, phys_addr, size,
3899 				 dir, attrs, *dev->dma_mask);
3900 }
3901 
3902 static void
3903 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3904 		  enum dma_data_direction dir, unsigned long attrs)
3905 {
3906 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3907 }
3908 
3909 static void
3910 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3911 		      enum dma_data_direction dir, unsigned long attrs)
3912 {
3913 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3914 }
3915 
3916 static void
3917 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3918 		enum dma_data_direction dir, unsigned long attrs)
3919 {
3920 	struct scatterlist *sg;
3921 	int i;
3922 
3923 	for_each_sg(sglist, sg, nelems, i)
3924 		bounce_unmap_page(dev, sg->dma_address,
3925 				  sg_dma_len(sg), dir, attrs);
3926 }
3927 
3928 static int
3929 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3930 	      enum dma_data_direction dir, unsigned long attrs)
3931 {
3932 	int i;
3933 	struct scatterlist *sg;
3934 
3935 	for_each_sg(sglist, sg, nelems, i) {
3936 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3937 						  sg->offset, sg->length,
3938 						  dir, attrs);
3939 		if (sg->dma_address == DMA_MAPPING_ERROR)
3940 			goto out_unmap;
3941 		sg_dma_len(sg) = sg->length;
3942 	}
3943 
3944 	for_each_sg(sglist, sg, nelems, i)
3945 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3946 
3947 	return nelems;
3948 
3949 out_unmap:
3950 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3951 	return 0;
3952 }
3953 
3954 static void
3955 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3956 			   size_t size, enum dma_data_direction dir)
3957 {
3958 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3959 }
3960 
3961 static void
3962 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3963 			      size_t size, enum dma_data_direction dir)
3964 {
3965 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3966 }
3967 
3968 static void
3969 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3970 		       int nelems, enum dma_data_direction dir)
3971 {
3972 	struct scatterlist *sg;
3973 	int i;
3974 
3975 	for_each_sg(sglist, sg, nelems, i)
3976 		bounce_sync_single(dev, sg_dma_address(sg),
3977 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3978 }
3979 
3980 static void
3981 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3982 			  int nelems, enum dma_data_direction dir)
3983 {
3984 	struct scatterlist *sg;
3985 	int i;
3986 
3987 	for_each_sg(sglist, sg, nelems, i)
3988 		bounce_sync_single(dev, sg_dma_address(sg),
3989 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3990 }
3991 
3992 static const struct dma_map_ops bounce_dma_ops = {
3993 	.alloc			= intel_alloc_coherent,
3994 	.free			= intel_free_coherent,
3995 	.map_sg			= bounce_map_sg,
3996 	.unmap_sg		= bounce_unmap_sg,
3997 	.map_page		= bounce_map_page,
3998 	.unmap_page		= bounce_unmap_page,
3999 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
4000 	.sync_single_for_device	= bounce_sync_single_for_device,
4001 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
4002 	.sync_sg_for_device	= bounce_sync_sg_for_device,
4003 	.map_resource		= bounce_map_resource,
4004 	.unmap_resource		= bounce_unmap_resource,
4005 	.alloc_pages		= dma_common_alloc_pages,
4006 	.free_pages		= dma_common_free_pages,
4007 	.dma_supported		= dma_direct_supported,
4008 };
4009 
4010 static inline int iommu_domain_cache_init(void)
4011 {
4012 	int ret = 0;
4013 
4014 	iommu_domain_cache = kmem_cache_create("iommu_domain",
4015 					 sizeof(struct dmar_domain),
4016 					 0,
4017 					 SLAB_HWCACHE_ALIGN,
4018 
4019 					 NULL);
4020 	if (!iommu_domain_cache) {
4021 		pr_err("Couldn't create iommu_domain cache\n");
4022 		ret = -ENOMEM;
4023 	}
4024 
4025 	return ret;
4026 }
4027 
4028 static inline int iommu_devinfo_cache_init(void)
4029 {
4030 	int ret = 0;
4031 
4032 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4033 					 sizeof(struct device_domain_info),
4034 					 0,
4035 					 SLAB_HWCACHE_ALIGN,
4036 					 NULL);
4037 	if (!iommu_devinfo_cache) {
4038 		pr_err("Couldn't create devinfo cache\n");
4039 		ret = -ENOMEM;
4040 	}
4041 
4042 	return ret;
4043 }
4044 
4045 static int __init iommu_init_mempool(void)
4046 {
4047 	int ret;
4048 	ret = iova_cache_get();
4049 	if (ret)
4050 		return ret;
4051 
4052 	ret = iommu_domain_cache_init();
4053 	if (ret)
4054 		goto domain_error;
4055 
4056 	ret = iommu_devinfo_cache_init();
4057 	if (!ret)
4058 		return ret;
4059 
4060 	kmem_cache_destroy(iommu_domain_cache);
4061 domain_error:
4062 	iova_cache_put();
4063 
4064 	return -ENOMEM;
4065 }
4066 
4067 static void __init iommu_exit_mempool(void)
4068 {
4069 	kmem_cache_destroy(iommu_devinfo_cache);
4070 	kmem_cache_destroy(iommu_domain_cache);
4071 	iova_cache_put();
4072 }
4073 
4074 static void __init init_no_remapping_devices(void)
4075 {
4076 	struct dmar_drhd_unit *drhd;
4077 	struct device *dev;
4078 	int i;
4079 
4080 	for_each_drhd_unit(drhd) {
4081 		if (!drhd->include_all) {
4082 			for_each_active_dev_scope(drhd->devices,
4083 						  drhd->devices_cnt, i, dev)
4084 				break;
4085 			/* ignore DMAR unit if no devices exist */
4086 			if (i == drhd->devices_cnt)
4087 				drhd->ignored = 1;
4088 		}
4089 	}
4090 
4091 	for_each_active_drhd_unit(drhd) {
4092 		if (drhd->include_all)
4093 			continue;
4094 
4095 		for_each_active_dev_scope(drhd->devices,
4096 					  drhd->devices_cnt, i, dev)
4097 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4098 				break;
4099 		if (i < drhd->devices_cnt)
4100 			continue;
4101 
4102 		/* This IOMMU has *only* gfx devices. Either bypass it or
4103 		   set the gfx_mapped flag, as appropriate */
4104 		drhd->gfx_dedicated = 1;
4105 		if (!dmar_map_gfx)
4106 			drhd->ignored = 1;
4107 	}
4108 }
4109 
4110 #ifdef CONFIG_SUSPEND
4111 static int init_iommu_hw(void)
4112 {
4113 	struct dmar_drhd_unit *drhd;
4114 	struct intel_iommu *iommu = NULL;
4115 
4116 	for_each_active_iommu(iommu, drhd)
4117 		if (iommu->qi)
4118 			dmar_reenable_qi(iommu);
4119 
4120 	for_each_iommu(iommu, drhd) {
4121 		if (drhd->ignored) {
4122 			/*
4123 			 * we always have to disable PMRs or DMA may fail on
4124 			 * this device
4125 			 */
4126 			if (force_on)
4127 				iommu_disable_protect_mem_regions(iommu);
4128 			continue;
4129 		}
4130 
4131 		iommu_flush_write_buffer(iommu);
4132 
4133 		iommu_set_root_entry(iommu);
4134 
4135 		iommu->flush.flush_context(iommu, 0, 0, 0,
4136 					   DMA_CCMD_GLOBAL_INVL);
4137 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4138 		iommu_enable_translation(iommu);
4139 		iommu_disable_protect_mem_regions(iommu);
4140 	}
4141 
4142 	return 0;
4143 }
4144 
4145 static void iommu_flush_all(void)
4146 {
4147 	struct dmar_drhd_unit *drhd;
4148 	struct intel_iommu *iommu;
4149 
4150 	for_each_active_iommu(iommu, drhd) {
4151 		iommu->flush.flush_context(iommu, 0, 0, 0,
4152 					   DMA_CCMD_GLOBAL_INVL);
4153 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4154 					 DMA_TLB_GLOBAL_FLUSH);
4155 	}
4156 }
4157 
4158 static int iommu_suspend(void)
4159 {
4160 	struct dmar_drhd_unit *drhd;
4161 	struct intel_iommu *iommu = NULL;
4162 	unsigned long flag;
4163 
4164 	for_each_active_iommu(iommu, drhd) {
4165 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4166 						 GFP_ATOMIC);
4167 		if (!iommu->iommu_state)
4168 			goto nomem;
4169 	}
4170 
4171 	iommu_flush_all();
4172 
4173 	for_each_active_iommu(iommu, drhd) {
4174 		iommu_disable_translation(iommu);
4175 
4176 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4177 
4178 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4179 			readl(iommu->reg + DMAR_FECTL_REG);
4180 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4181 			readl(iommu->reg + DMAR_FEDATA_REG);
4182 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4183 			readl(iommu->reg + DMAR_FEADDR_REG);
4184 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4185 			readl(iommu->reg + DMAR_FEUADDR_REG);
4186 
4187 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4188 	}
4189 	return 0;
4190 
4191 nomem:
4192 	for_each_active_iommu(iommu, drhd)
4193 		kfree(iommu->iommu_state);
4194 
4195 	return -ENOMEM;
4196 }
4197 
4198 static void iommu_resume(void)
4199 {
4200 	struct dmar_drhd_unit *drhd;
4201 	struct intel_iommu *iommu = NULL;
4202 	unsigned long flag;
4203 
4204 	if (init_iommu_hw()) {
4205 		if (force_on)
4206 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4207 		else
4208 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4209 		return;
4210 	}
4211 
4212 	for_each_active_iommu(iommu, drhd) {
4213 
4214 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4215 
4216 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4217 			iommu->reg + DMAR_FECTL_REG);
4218 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4219 			iommu->reg + DMAR_FEDATA_REG);
4220 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4221 			iommu->reg + DMAR_FEADDR_REG);
4222 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4223 			iommu->reg + DMAR_FEUADDR_REG);
4224 
4225 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4226 	}
4227 
4228 	for_each_active_iommu(iommu, drhd)
4229 		kfree(iommu->iommu_state);
4230 }
4231 
4232 static struct syscore_ops iommu_syscore_ops = {
4233 	.resume		= iommu_resume,
4234 	.suspend	= iommu_suspend,
4235 };
4236 
4237 static void __init init_iommu_pm_ops(void)
4238 {
4239 	register_syscore_ops(&iommu_syscore_ops);
4240 }
4241 
4242 #else
4243 static inline void init_iommu_pm_ops(void) {}
4244 #endif	/* CONFIG_PM */
4245 
4246 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4247 {
4248 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4249 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4250 	    rmrr->end_address <= rmrr->base_address ||
4251 	    arch_rmrr_sanity_check(rmrr))
4252 		return -EINVAL;
4253 
4254 	return 0;
4255 }
4256 
4257 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4258 {
4259 	struct acpi_dmar_reserved_memory *rmrr;
4260 	struct dmar_rmrr_unit *rmrru;
4261 
4262 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4263 	if (rmrr_sanity_check(rmrr)) {
4264 		pr_warn(FW_BUG
4265 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4266 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4267 			   rmrr->base_address, rmrr->end_address,
4268 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4269 			   dmi_get_system_info(DMI_BIOS_VERSION),
4270 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4271 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4272 	}
4273 
4274 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4275 	if (!rmrru)
4276 		goto out;
4277 
4278 	rmrru->hdr = header;
4279 
4280 	rmrru->base_address = rmrr->base_address;
4281 	rmrru->end_address = rmrr->end_address;
4282 
4283 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4284 				((void *)rmrr) + rmrr->header.length,
4285 				&rmrru->devices_cnt);
4286 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4287 		goto free_rmrru;
4288 
4289 	list_add(&rmrru->list, &dmar_rmrr_units);
4290 
4291 	return 0;
4292 free_rmrru:
4293 	kfree(rmrru);
4294 out:
4295 	return -ENOMEM;
4296 }
4297 
4298 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4299 {
4300 	struct dmar_atsr_unit *atsru;
4301 	struct acpi_dmar_atsr *tmp;
4302 
4303 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4304 				dmar_rcu_check()) {
4305 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4306 		if (atsr->segment != tmp->segment)
4307 			continue;
4308 		if (atsr->header.length != tmp->header.length)
4309 			continue;
4310 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4311 			return atsru;
4312 	}
4313 
4314 	return NULL;
4315 }
4316 
4317 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4318 {
4319 	struct acpi_dmar_atsr *atsr;
4320 	struct dmar_atsr_unit *atsru;
4321 
4322 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4323 		return 0;
4324 
4325 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4326 	atsru = dmar_find_atsr(atsr);
4327 	if (atsru)
4328 		return 0;
4329 
4330 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4331 	if (!atsru)
4332 		return -ENOMEM;
4333 
4334 	/*
4335 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4336 	 * copy the memory content because the memory buffer will be freed
4337 	 * on return.
4338 	 */
4339 	atsru->hdr = (void *)(atsru + 1);
4340 	memcpy(atsru->hdr, hdr, hdr->length);
4341 	atsru->include_all = atsr->flags & 0x1;
4342 	if (!atsru->include_all) {
4343 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4344 				(void *)atsr + atsr->header.length,
4345 				&atsru->devices_cnt);
4346 		if (atsru->devices_cnt && atsru->devices == NULL) {
4347 			kfree(atsru);
4348 			return -ENOMEM;
4349 		}
4350 	}
4351 
4352 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4353 
4354 	return 0;
4355 }
4356 
4357 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4358 {
4359 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4360 	kfree(atsru);
4361 }
4362 
4363 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4364 {
4365 	struct acpi_dmar_atsr *atsr;
4366 	struct dmar_atsr_unit *atsru;
4367 
4368 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4369 	atsru = dmar_find_atsr(atsr);
4370 	if (atsru) {
4371 		list_del_rcu(&atsru->list);
4372 		synchronize_rcu();
4373 		intel_iommu_free_atsr(atsru);
4374 	}
4375 
4376 	return 0;
4377 }
4378 
4379 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4380 {
4381 	int i;
4382 	struct device *dev;
4383 	struct acpi_dmar_atsr *atsr;
4384 	struct dmar_atsr_unit *atsru;
4385 
4386 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4387 	atsru = dmar_find_atsr(atsr);
4388 	if (!atsru)
4389 		return 0;
4390 
4391 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4392 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4393 					  i, dev)
4394 			return -EBUSY;
4395 	}
4396 
4397 	return 0;
4398 }
4399 
4400 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4401 {
4402 	int sp, ret;
4403 	struct intel_iommu *iommu = dmaru->iommu;
4404 
4405 	if (g_iommus[iommu->seq_id])
4406 		return 0;
4407 
4408 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4409 		pr_warn("%s: Doesn't support hardware pass through.\n",
4410 			iommu->name);
4411 		return -ENXIO;
4412 	}
4413 	if (!ecap_sc_support(iommu->ecap) &&
4414 	    domain_update_iommu_snooping(iommu)) {
4415 		pr_warn("%s: Doesn't support snooping.\n",
4416 			iommu->name);
4417 		return -ENXIO;
4418 	}
4419 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4420 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4421 		pr_warn("%s: Doesn't support large page.\n",
4422 			iommu->name);
4423 		return -ENXIO;
4424 	}
4425 
4426 	/*
4427 	 * Disable translation if already enabled prior to OS handover.
4428 	 */
4429 	if (iommu->gcmd & DMA_GCMD_TE)
4430 		iommu_disable_translation(iommu);
4431 
4432 	g_iommus[iommu->seq_id] = iommu;
4433 	ret = iommu_init_domains(iommu);
4434 	if (ret == 0)
4435 		ret = iommu_alloc_root_entry(iommu);
4436 	if (ret)
4437 		goto out;
4438 
4439 	intel_svm_check(iommu);
4440 
4441 	if (dmaru->ignored) {
4442 		/*
4443 		 * we always have to disable PMRs or DMA may fail on this device
4444 		 */
4445 		if (force_on)
4446 			iommu_disable_protect_mem_regions(iommu);
4447 		return 0;
4448 	}
4449 
4450 	intel_iommu_init_qi(iommu);
4451 	iommu_flush_write_buffer(iommu);
4452 
4453 #ifdef CONFIG_INTEL_IOMMU_SVM
4454 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4455 		ret = intel_svm_enable_prq(iommu);
4456 		if (ret)
4457 			goto disable_iommu;
4458 	}
4459 #endif
4460 	ret = dmar_set_interrupt(iommu);
4461 	if (ret)
4462 		goto disable_iommu;
4463 
4464 	iommu_set_root_entry(iommu);
4465 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4466 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4467 	iommu_enable_translation(iommu);
4468 
4469 	iommu_disable_protect_mem_regions(iommu);
4470 	return 0;
4471 
4472 disable_iommu:
4473 	disable_dmar_iommu(iommu);
4474 out:
4475 	free_dmar_iommu(iommu);
4476 	return ret;
4477 }
4478 
4479 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4480 {
4481 	int ret = 0;
4482 	struct intel_iommu *iommu = dmaru->iommu;
4483 
4484 	if (!intel_iommu_enabled)
4485 		return 0;
4486 	if (iommu == NULL)
4487 		return -EINVAL;
4488 
4489 	if (insert) {
4490 		ret = intel_iommu_add(dmaru);
4491 	} else {
4492 		disable_dmar_iommu(iommu);
4493 		free_dmar_iommu(iommu);
4494 	}
4495 
4496 	return ret;
4497 }
4498 
4499 static void intel_iommu_free_dmars(void)
4500 {
4501 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4502 	struct dmar_atsr_unit *atsru, *atsr_n;
4503 
4504 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4505 		list_del(&rmrru->list);
4506 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4507 		kfree(rmrru);
4508 	}
4509 
4510 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4511 		list_del(&atsru->list);
4512 		intel_iommu_free_atsr(atsru);
4513 	}
4514 }
4515 
4516 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4517 {
4518 	int i, ret = 1;
4519 	struct pci_bus *bus;
4520 	struct pci_dev *bridge = NULL;
4521 	struct device *tmp;
4522 	struct acpi_dmar_atsr *atsr;
4523 	struct dmar_atsr_unit *atsru;
4524 
4525 	dev = pci_physfn(dev);
4526 	for (bus = dev->bus; bus; bus = bus->parent) {
4527 		bridge = bus->self;
4528 		/* If it's an integrated device, allow ATS */
4529 		if (!bridge)
4530 			return 1;
4531 		/* Connected via non-PCIe: no ATS */
4532 		if (!pci_is_pcie(bridge) ||
4533 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4534 			return 0;
4535 		/* If we found the root port, look it up in the ATSR */
4536 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4537 			break;
4538 	}
4539 
4540 	rcu_read_lock();
4541 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4542 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4543 		if (atsr->segment != pci_domain_nr(dev->bus))
4544 			continue;
4545 
4546 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4547 			if (tmp == &bridge->dev)
4548 				goto out;
4549 
4550 		if (atsru->include_all)
4551 			goto out;
4552 	}
4553 	ret = 0;
4554 out:
4555 	rcu_read_unlock();
4556 
4557 	return ret;
4558 }
4559 
4560 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4561 {
4562 	int ret;
4563 	struct dmar_rmrr_unit *rmrru;
4564 	struct dmar_atsr_unit *atsru;
4565 	struct acpi_dmar_atsr *atsr;
4566 	struct acpi_dmar_reserved_memory *rmrr;
4567 
4568 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4569 		return 0;
4570 
4571 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4572 		rmrr = container_of(rmrru->hdr,
4573 				    struct acpi_dmar_reserved_memory, header);
4574 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4575 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4576 				((void *)rmrr) + rmrr->header.length,
4577 				rmrr->segment, rmrru->devices,
4578 				rmrru->devices_cnt);
4579 			if (ret < 0)
4580 				return ret;
4581 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4582 			dmar_remove_dev_scope(info, rmrr->segment,
4583 				rmrru->devices, rmrru->devices_cnt);
4584 		}
4585 	}
4586 
4587 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4588 		if (atsru->include_all)
4589 			continue;
4590 
4591 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4592 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4593 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4594 					(void *)atsr + atsr->header.length,
4595 					atsr->segment, atsru->devices,
4596 					atsru->devices_cnt);
4597 			if (ret > 0)
4598 				break;
4599 			else if (ret < 0)
4600 				return ret;
4601 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4602 			if (dmar_remove_dev_scope(info, atsr->segment,
4603 					atsru->devices, atsru->devices_cnt))
4604 				break;
4605 		}
4606 	}
4607 
4608 	return 0;
4609 }
4610 
4611 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4612 				       unsigned long val, void *v)
4613 {
4614 	struct memory_notify *mhp = v;
4615 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4616 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4617 			mhp->nr_pages - 1);
4618 
4619 	switch (val) {
4620 	case MEM_GOING_ONLINE:
4621 		if (iommu_domain_identity_map(si_domain,
4622 					      start_vpfn, last_vpfn)) {
4623 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4624 				start_vpfn, last_vpfn);
4625 			return NOTIFY_BAD;
4626 		}
4627 		break;
4628 
4629 	case MEM_OFFLINE:
4630 	case MEM_CANCEL_ONLINE:
4631 		{
4632 			struct dmar_drhd_unit *drhd;
4633 			struct intel_iommu *iommu;
4634 			struct page *freelist;
4635 
4636 			freelist = domain_unmap(si_domain,
4637 						start_vpfn, last_vpfn);
4638 
4639 			rcu_read_lock();
4640 			for_each_active_iommu(iommu, drhd)
4641 				iommu_flush_iotlb_psi(iommu, si_domain,
4642 					start_vpfn, mhp->nr_pages,
4643 					!freelist, 0);
4644 			rcu_read_unlock();
4645 			dma_free_pagelist(freelist);
4646 		}
4647 		break;
4648 	}
4649 
4650 	return NOTIFY_OK;
4651 }
4652 
4653 static struct notifier_block intel_iommu_memory_nb = {
4654 	.notifier_call = intel_iommu_memory_notifier,
4655 	.priority = 0
4656 };
4657 
4658 static void free_all_cpu_cached_iovas(unsigned int cpu)
4659 {
4660 	int i;
4661 
4662 	for (i = 0; i < g_num_of_iommus; i++) {
4663 		struct intel_iommu *iommu = g_iommus[i];
4664 		struct dmar_domain *domain;
4665 		int did;
4666 
4667 		if (!iommu)
4668 			continue;
4669 
4670 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4671 			domain = get_iommu_domain(iommu, (u16)did);
4672 
4673 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4674 				continue;
4675 
4676 			free_cpu_cached_iovas(cpu, &domain->iovad);
4677 		}
4678 	}
4679 }
4680 
4681 static int intel_iommu_cpu_dead(unsigned int cpu)
4682 {
4683 	free_all_cpu_cached_iovas(cpu);
4684 	return 0;
4685 }
4686 
4687 static void intel_disable_iommus(void)
4688 {
4689 	struct intel_iommu *iommu = NULL;
4690 	struct dmar_drhd_unit *drhd;
4691 
4692 	for_each_iommu(iommu, drhd)
4693 		iommu_disable_translation(iommu);
4694 }
4695 
4696 void intel_iommu_shutdown(void)
4697 {
4698 	struct dmar_drhd_unit *drhd;
4699 	struct intel_iommu *iommu = NULL;
4700 
4701 	if (no_iommu || dmar_disabled)
4702 		return;
4703 
4704 	down_write(&dmar_global_lock);
4705 
4706 	/* Disable PMRs explicitly here. */
4707 	for_each_iommu(iommu, drhd)
4708 		iommu_disable_protect_mem_regions(iommu);
4709 
4710 	/* Make sure the IOMMUs are switched off */
4711 	intel_disable_iommus();
4712 
4713 	up_write(&dmar_global_lock);
4714 }
4715 
4716 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4717 {
4718 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4719 
4720 	return container_of(iommu_dev, struct intel_iommu, iommu);
4721 }
4722 
4723 static ssize_t intel_iommu_show_version(struct device *dev,
4724 					struct device_attribute *attr,
4725 					char *buf)
4726 {
4727 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4728 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4729 	return sprintf(buf, "%d:%d\n",
4730 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4731 }
4732 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4733 
4734 static ssize_t intel_iommu_show_address(struct device *dev,
4735 					struct device_attribute *attr,
4736 					char *buf)
4737 {
4738 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4739 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4740 }
4741 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4742 
4743 static ssize_t intel_iommu_show_cap(struct device *dev,
4744 				    struct device_attribute *attr,
4745 				    char *buf)
4746 {
4747 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4748 	return sprintf(buf, "%llx\n", iommu->cap);
4749 }
4750 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4751 
4752 static ssize_t intel_iommu_show_ecap(struct device *dev,
4753 				    struct device_attribute *attr,
4754 				    char *buf)
4755 {
4756 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4757 	return sprintf(buf, "%llx\n", iommu->ecap);
4758 }
4759 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4760 
4761 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4762 				      struct device_attribute *attr,
4763 				      char *buf)
4764 {
4765 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4766 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4767 }
4768 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4769 
4770 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4771 					   struct device_attribute *attr,
4772 					   char *buf)
4773 {
4774 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4775 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4776 						  cap_ndoms(iommu->cap)));
4777 }
4778 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4779 
4780 static struct attribute *intel_iommu_attrs[] = {
4781 	&dev_attr_version.attr,
4782 	&dev_attr_address.attr,
4783 	&dev_attr_cap.attr,
4784 	&dev_attr_ecap.attr,
4785 	&dev_attr_domains_supported.attr,
4786 	&dev_attr_domains_used.attr,
4787 	NULL,
4788 };
4789 
4790 static struct attribute_group intel_iommu_group = {
4791 	.name = "intel-iommu",
4792 	.attrs = intel_iommu_attrs,
4793 };
4794 
4795 const struct attribute_group *intel_iommu_groups[] = {
4796 	&intel_iommu_group,
4797 	NULL,
4798 };
4799 
4800 static inline bool has_external_pci(void)
4801 {
4802 	struct pci_dev *pdev = NULL;
4803 
4804 	for_each_pci_dev(pdev)
4805 		if (pdev->external_facing)
4806 			return true;
4807 
4808 	return false;
4809 }
4810 
4811 static int __init platform_optin_force_iommu(void)
4812 {
4813 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4814 		return 0;
4815 
4816 	if (no_iommu || dmar_disabled)
4817 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4818 
4819 	/*
4820 	 * If Intel-IOMMU is disabled by default, we will apply identity
4821 	 * map for all devices except those marked as being untrusted.
4822 	 */
4823 	if (dmar_disabled)
4824 		iommu_set_default_passthrough(false);
4825 
4826 	dmar_disabled = 0;
4827 	no_iommu = 0;
4828 
4829 	return 1;
4830 }
4831 
4832 static int __init probe_acpi_namespace_devices(void)
4833 {
4834 	struct dmar_drhd_unit *drhd;
4835 	/* To avoid a -Wunused-but-set-variable warning. */
4836 	struct intel_iommu *iommu __maybe_unused;
4837 	struct device *dev;
4838 	int i, ret = 0;
4839 
4840 	for_each_active_iommu(iommu, drhd) {
4841 		for_each_active_dev_scope(drhd->devices,
4842 					  drhd->devices_cnt, i, dev) {
4843 			struct acpi_device_physical_node *pn;
4844 			struct iommu_group *group;
4845 			struct acpi_device *adev;
4846 
4847 			if (dev->bus != &acpi_bus_type)
4848 				continue;
4849 
4850 			adev = to_acpi_device(dev);
4851 			mutex_lock(&adev->physical_node_lock);
4852 			list_for_each_entry(pn,
4853 					    &adev->physical_node_list, node) {
4854 				group = iommu_group_get(pn->dev);
4855 				if (group) {
4856 					iommu_group_put(group);
4857 					continue;
4858 				}
4859 
4860 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4861 				ret = iommu_probe_device(pn->dev);
4862 				if (ret)
4863 					break;
4864 			}
4865 			mutex_unlock(&adev->physical_node_lock);
4866 
4867 			if (ret)
4868 				return ret;
4869 		}
4870 	}
4871 
4872 	return 0;
4873 }
4874 
4875 int __init intel_iommu_init(void)
4876 {
4877 	int ret = -ENODEV;
4878 	struct dmar_drhd_unit *drhd;
4879 	struct intel_iommu *iommu;
4880 
4881 	/*
4882 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4883 	 * opt in, so enforce that.
4884 	 */
4885 	force_on = tboot_force_iommu() || platform_optin_force_iommu();
4886 
4887 	if (iommu_init_mempool()) {
4888 		if (force_on)
4889 			panic("tboot: Failed to initialize iommu memory\n");
4890 		return -ENOMEM;
4891 	}
4892 
4893 	down_write(&dmar_global_lock);
4894 	if (dmar_table_init()) {
4895 		if (force_on)
4896 			panic("tboot: Failed to initialize DMAR table\n");
4897 		goto out_free_dmar;
4898 	}
4899 
4900 	if (dmar_dev_scope_init() < 0) {
4901 		if (force_on)
4902 			panic("tboot: Failed to initialize DMAR device scope\n");
4903 		goto out_free_dmar;
4904 	}
4905 
4906 	up_write(&dmar_global_lock);
4907 
4908 	/*
4909 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4910 	 * complain later when we register it under the lock.
4911 	 */
4912 	dmar_register_bus_notifier();
4913 
4914 	down_write(&dmar_global_lock);
4915 
4916 	if (!no_iommu)
4917 		intel_iommu_debugfs_init();
4918 
4919 	if (no_iommu || dmar_disabled) {
4920 		/*
4921 		 * We exit the function here to ensure IOMMU's remapping and
4922 		 * mempool aren't setup, which means that the IOMMU's PMRs
4923 		 * won't be disabled via the call to init_dmars(). So disable
4924 		 * it explicitly here. The PMRs were setup by tboot prior to
4925 		 * calling SENTER, but the kernel is expected to reset/tear
4926 		 * down the PMRs.
4927 		 */
4928 		if (intel_iommu_tboot_noforce) {
4929 			for_each_iommu(iommu, drhd)
4930 				iommu_disable_protect_mem_regions(iommu);
4931 		}
4932 
4933 		/*
4934 		 * Make sure the IOMMUs are switched off, even when we
4935 		 * boot into a kexec kernel and the previous kernel left
4936 		 * them enabled
4937 		 */
4938 		intel_disable_iommus();
4939 		goto out_free_dmar;
4940 	}
4941 
4942 	if (list_empty(&dmar_rmrr_units))
4943 		pr_info("No RMRR found\n");
4944 
4945 	if (list_empty(&dmar_atsr_units))
4946 		pr_info("No ATSR found\n");
4947 
4948 	if (dmar_init_reserved_ranges()) {
4949 		if (force_on)
4950 			panic("tboot: Failed to reserve iommu ranges\n");
4951 		goto out_free_reserved_range;
4952 	}
4953 
4954 	if (dmar_map_gfx)
4955 		intel_iommu_gfx_mapped = 1;
4956 
4957 	init_no_remapping_devices();
4958 
4959 	ret = init_dmars();
4960 	if (ret) {
4961 		if (force_on)
4962 			panic("tboot: Failed to initialize DMARs\n");
4963 		pr_err("Initialization failed\n");
4964 		goto out_free_reserved_range;
4965 	}
4966 	up_write(&dmar_global_lock);
4967 
4968 	init_iommu_pm_ops();
4969 
4970 	down_read(&dmar_global_lock);
4971 	for_each_active_iommu(iommu, drhd) {
4972 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4973 				       intel_iommu_groups,
4974 				       "%s", iommu->name);
4975 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4976 		iommu_device_register(&iommu->iommu);
4977 	}
4978 	up_read(&dmar_global_lock);
4979 
4980 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4981 	if (si_domain && !hw_pass_through)
4982 		register_memory_notifier(&intel_iommu_memory_nb);
4983 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4984 			  intel_iommu_cpu_dead);
4985 
4986 	down_read(&dmar_global_lock);
4987 	if (probe_acpi_namespace_devices())
4988 		pr_warn("ACPI name space devices didn't probe correctly\n");
4989 
4990 	/* Finally, we enable the DMA remapping hardware. */
4991 	for_each_iommu(iommu, drhd) {
4992 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4993 			iommu_enable_translation(iommu);
4994 
4995 		iommu_disable_protect_mem_regions(iommu);
4996 	}
4997 	up_read(&dmar_global_lock);
4998 
4999 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5000 
5001 	intel_iommu_enabled = 1;
5002 
5003 	return 0;
5004 
5005 out_free_reserved_range:
5006 	put_iova_domain(&reserved_iova_list);
5007 out_free_dmar:
5008 	intel_iommu_free_dmars();
5009 	up_write(&dmar_global_lock);
5010 	iommu_exit_mempool();
5011 	return ret;
5012 }
5013 
5014 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5015 {
5016 	struct intel_iommu *iommu = opaque;
5017 
5018 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5019 	return 0;
5020 }
5021 
5022 /*
5023  * NB - intel-iommu lacks any sort of reference counting for the users of
5024  * dependent devices.  If multiple endpoints have intersecting dependent
5025  * devices, unbinding the driver from any one of them will possibly leave
5026  * the others unable to operate.
5027  */
5028 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5029 {
5030 	if (!iommu || !dev || !dev_is_pci(dev))
5031 		return;
5032 
5033 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5034 }
5035 
5036 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5037 {
5038 	struct dmar_domain *domain;
5039 	struct intel_iommu *iommu;
5040 	unsigned long flags;
5041 
5042 	assert_spin_locked(&device_domain_lock);
5043 
5044 	if (WARN_ON(!info))
5045 		return;
5046 
5047 	iommu = info->iommu;
5048 	domain = info->domain;
5049 
5050 	if (info->dev) {
5051 		if (dev_is_pci(info->dev) && sm_supported(iommu))
5052 			intel_pasid_tear_down_entry(iommu, info->dev,
5053 					PASID_RID2PASID, false);
5054 
5055 		iommu_disable_dev_iotlb(info);
5056 		if (!dev_is_real_dma_subdevice(info->dev))
5057 			domain_context_clear(iommu, info->dev);
5058 		intel_pasid_free_table(info->dev);
5059 	}
5060 
5061 	unlink_domain_info(info);
5062 
5063 	spin_lock_irqsave(&iommu->lock, flags);
5064 	domain_detach_iommu(domain, iommu);
5065 	spin_unlock_irqrestore(&iommu->lock, flags);
5066 
5067 	free_devinfo_mem(info);
5068 }
5069 
5070 static void dmar_remove_one_dev_info(struct device *dev)
5071 {
5072 	struct device_domain_info *info;
5073 	unsigned long flags;
5074 
5075 	spin_lock_irqsave(&device_domain_lock, flags);
5076 	info = get_domain_info(dev);
5077 	if (info)
5078 		__dmar_remove_one_dev_info(info);
5079 	spin_unlock_irqrestore(&device_domain_lock, flags);
5080 }
5081 
5082 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5083 {
5084 	int adjust_width;
5085 
5086 	/* calculate AGAW */
5087 	domain->gaw = guest_width;
5088 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5089 	domain->agaw = width_to_agaw(adjust_width);
5090 
5091 	domain->iommu_coherency = 0;
5092 	domain->iommu_snooping = 0;
5093 	domain->iommu_superpage = 0;
5094 	domain->max_addr = 0;
5095 
5096 	/* always allocate the top pgd */
5097 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5098 	if (!domain->pgd)
5099 		return -ENOMEM;
5100 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5101 	return 0;
5102 }
5103 
5104 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5105 {
5106 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5107 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5108 
5109 	if (!intel_iommu_strict &&
5110 	    init_iova_flush_queue(&dmar_domain->iovad,
5111 				  iommu_flush_iova, iova_entry_free))
5112 		pr_info("iova flush queue initialization failed\n");
5113 }
5114 
5115 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5116 {
5117 	struct dmar_domain *dmar_domain;
5118 	struct iommu_domain *domain;
5119 
5120 	switch (type) {
5121 	case IOMMU_DOMAIN_DMA:
5122 	case IOMMU_DOMAIN_UNMANAGED:
5123 		dmar_domain = alloc_domain(0);
5124 		if (!dmar_domain) {
5125 			pr_err("Can't allocate dmar_domain\n");
5126 			return NULL;
5127 		}
5128 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5129 			pr_err("Domain initialization failed\n");
5130 			domain_exit(dmar_domain);
5131 			return NULL;
5132 		}
5133 
5134 		if (type == IOMMU_DOMAIN_DMA)
5135 			intel_init_iova_domain(dmar_domain);
5136 
5137 		domain = &dmar_domain->domain;
5138 		domain->geometry.aperture_start = 0;
5139 		domain->geometry.aperture_end   =
5140 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5141 		domain->geometry.force_aperture = true;
5142 
5143 		return domain;
5144 	case IOMMU_DOMAIN_IDENTITY:
5145 		return &si_domain->domain;
5146 	default:
5147 		return NULL;
5148 	}
5149 
5150 	return NULL;
5151 }
5152 
5153 static void intel_iommu_domain_free(struct iommu_domain *domain)
5154 {
5155 	if (domain != &si_domain->domain)
5156 		domain_exit(to_dmar_domain(domain));
5157 }
5158 
5159 /*
5160  * Check whether a @domain could be attached to the @dev through the
5161  * aux-domain attach/detach APIs.
5162  */
5163 static inline bool
5164 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5165 {
5166 	struct device_domain_info *info = get_domain_info(dev);
5167 
5168 	return info && info->auxd_enabled &&
5169 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5170 }
5171 
5172 static void auxiliary_link_device(struct dmar_domain *domain,
5173 				  struct device *dev)
5174 {
5175 	struct device_domain_info *info = get_domain_info(dev);
5176 
5177 	assert_spin_locked(&device_domain_lock);
5178 	if (WARN_ON(!info))
5179 		return;
5180 
5181 	domain->auxd_refcnt++;
5182 	list_add(&domain->auxd, &info->auxiliary_domains);
5183 }
5184 
5185 static void auxiliary_unlink_device(struct dmar_domain *domain,
5186 				    struct device *dev)
5187 {
5188 	struct device_domain_info *info = get_domain_info(dev);
5189 
5190 	assert_spin_locked(&device_domain_lock);
5191 	if (WARN_ON(!info))
5192 		return;
5193 
5194 	list_del(&domain->auxd);
5195 	domain->auxd_refcnt--;
5196 
5197 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5198 		ioasid_free(domain->default_pasid);
5199 }
5200 
5201 static int aux_domain_add_dev(struct dmar_domain *domain,
5202 			      struct device *dev)
5203 {
5204 	int ret;
5205 	unsigned long flags;
5206 	struct intel_iommu *iommu;
5207 
5208 	iommu = device_to_iommu(dev, NULL, NULL);
5209 	if (!iommu)
5210 		return -ENODEV;
5211 
5212 	if (domain->default_pasid <= 0) {
5213 		u32 pasid;
5214 
5215 		/* No private data needed for the default pasid */
5216 		pasid = ioasid_alloc(NULL, PASID_MIN,
5217 				     pci_max_pasids(to_pci_dev(dev)) - 1,
5218 				     NULL);
5219 		if (pasid == INVALID_IOASID) {
5220 			pr_err("Can't allocate default pasid\n");
5221 			return -ENODEV;
5222 		}
5223 		domain->default_pasid = pasid;
5224 	}
5225 
5226 	spin_lock_irqsave(&device_domain_lock, flags);
5227 	/*
5228 	 * iommu->lock must be held to attach domain to iommu and setup the
5229 	 * pasid entry for second level translation.
5230 	 */
5231 	spin_lock(&iommu->lock);
5232 	ret = domain_attach_iommu(domain, iommu);
5233 	if (ret)
5234 		goto attach_failed;
5235 
5236 	/* Setup the PASID entry for mediated devices: */
5237 	if (domain_use_first_level(domain))
5238 		ret = domain_setup_first_level(iommu, domain, dev,
5239 					       domain->default_pasid);
5240 	else
5241 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5242 						     domain->default_pasid);
5243 	if (ret)
5244 		goto table_failed;
5245 	spin_unlock(&iommu->lock);
5246 
5247 	auxiliary_link_device(domain, dev);
5248 
5249 	spin_unlock_irqrestore(&device_domain_lock, flags);
5250 
5251 	return 0;
5252 
5253 table_failed:
5254 	domain_detach_iommu(domain, iommu);
5255 attach_failed:
5256 	spin_unlock(&iommu->lock);
5257 	spin_unlock_irqrestore(&device_domain_lock, flags);
5258 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5259 		ioasid_free(domain->default_pasid);
5260 
5261 	return ret;
5262 }
5263 
5264 static void aux_domain_remove_dev(struct dmar_domain *domain,
5265 				  struct device *dev)
5266 {
5267 	struct device_domain_info *info;
5268 	struct intel_iommu *iommu;
5269 	unsigned long flags;
5270 
5271 	if (!is_aux_domain(dev, &domain->domain))
5272 		return;
5273 
5274 	spin_lock_irqsave(&device_domain_lock, flags);
5275 	info = get_domain_info(dev);
5276 	iommu = info->iommu;
5277 
5278 	auxiliary_unlink_device(domain, dev);
5279 
5280 	spin_lock(&iommu->lock);
5281 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5282 	domain_detach_iommu(domain, iommu);
5283 	spin_unlock(&iommu->lock);
5284 
5285 	spin_unlock_irqrestore(&device_domain_lock, flags);
5286 }
5287 
5288 static int prepare_domain_attach_device(struct iommu_domain *domain,
5289 					struct device *dev)
5290 {
5291 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5292 	struct intel_iommu *iommu;
5293 	int addr_width;
5294 
5295 	iommu = device_to_iommu(dev, NULL, NULL);
5296 	if (!iommu)
5297 		return -ENODEV;
5298 
5299 	/* check if this iommu agaw is sufficient for max mapped address */
5300 	addr_width = agaw_to_width(iommu->agaw);
5301 	if (addr_width > cap_mgaw(iommu->cap))
5302 		addr_width = cap_mgaw(iommu->cap);
5303 
5304 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5305 		dev_err(dev, "%s: iommu width (%d) is not "
5306 		        "sufficient for the mapped address (%llx)\n",
5307 		        __func__, addr_width, dmar_domain->max_addr);
5308 		return -EFAULT;
5309 	}
5310 	dmar_domain->gaw = addr_width;
5311 
5312 	/*
5313 	 * Knock out extra levels of page tables if necessary
5314 	 */
5315 	while (iommu->agaw < dmar_domain->agaw) {
5316 		struct dma_pte *pte;
5317 
5318 		pte = dmar_domain->pgd;
5319 		if (dma_pte_present(pte)) {
5320 			dmar_domain->pgd = (struct dma_pte *)
5321 				phys_to_virt(dma_pte_addr(pte));
5322 			free_pgtable_page(pte);
5323 		}
5324 		dmar_domain->agaw--;
5325 	}
5326 
5327 	return 0;
5328 }
5329 
5330 static int intel_iommu_attach_device(struct iommu_domain *domain,
5331 				     struct device *dev)
5332 {
5333 	int ret;
5334 
5335 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5336 	    device_is_rmrr_locked(dev)) {
5337 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5338 		return -EPERM;
5339 	}
5340 
5341 	if (is_aux_domain(dev, domain))
5342 		return -EPERM;
5343 
5344 	/* normally dev is not mapped */
5345 	if (unlikely(domain_context_mapped(dev))) {
5346 		struct dmar_domain *old_domain;
5347 
5348 		old_domain = find_domain(dev);
5349 		if (old_domain)
5350 			dmar_remove_one_dev_info(dev);
5351 	}
5352 
5353 	ret = prepare_domain_attach_device(domain, dev);
5354 	if (ret)
5355 		return ret;
5356 
5357 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5358 }
5359 
5360 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5361 					 struct device *dev)
5362 {
5363 	int ret;
5364 
5365 	if (!is_aux_domain(dev, domain))
5366 		return -EPERM;
5367 
5368 	ret = prepare_domain_attach_device(domain, dev);
5369 	if (ret)
5370 		return ret;
5371 
5372 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5373 }
5374 
5375 static void intel_iommu_detach_device(struct iommu_domain *domain,
5376 				      struct device *dev)
5377 {
5378 	dmar_remove_one_dev_info(dev);
5379 }
5380 
5381 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5382 					  struct device *dev)
5383 {
5384 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5385 }
5386 
5387 /*
5388  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5389  * VT-d granularity. Invalidation is typically included in the unmap operation
5390  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5391  * owns the first level page tables. Invalidations of translation caches in the
5392  * guest are trapped and passed down to the host.
5393  *
5394  * vIOMMU in the guest will only expose first level page tables, therefore
5395  * we do not support IOTLB granularity for request without PASID (second level).
5396  *
5397  * For example, to find the VT-d granularity encoding for IOTLB
5398  * type and page selective granularity within PASID:
5399  * X: indexed by iommu cache type
5400  * Y: indexed by enum iommu_inv_granularity
5401  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5402  */
5403 
5404 static const int
5405 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5406 	/*
5407 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5408 	 * page selective (address granularity)
5409 	 */
5410 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5411 	/* PASID based dev TLBs */
5412 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5413 	/* PASID cache */
5414 	{-EINVAL, -EINVAL, -EINVAL}
5415 };
5416 
5417 static inline int to_vtd_granularity(int type, int granu)
5418 {
5419 	return inv_type_granu_table[type][granu];
5420 }
5421 
5422 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5423 {
5424 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5425 
5426 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5427 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
5428 	 * granu size in contiguous memory.
5429 	 */
5430 	return order_base_2(nr_pages);
5431 }
5432 
5433 #ifdef CONFIG_INTEL_IOMMU_SVM
5434 static int
5435 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5436 			   struct iommu_cache_invalidate_info *inv_info)
5437 {
5438 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5439 	struct device_domain_info *info;
5440 	struct intel_iommu *iommu;
5441 	unsigned long flags;
5442 	int cache_type;
5443 	u8 bus, devfn;
5444 	u16 did, sid;
5445 	int ret = 0;
5446 	u64 size = 0;
5447 
5448 	if (!inv_info || !dmar_domain)
5449 		return -EINVAL;
5450 
5451 	if (!dev || !dev_is_pci(dev))
5452 		return -ENODEV;
5453 
5454 	iommu = device_to_iommu(dev, &bus, &devfn);
5455 	if (!iommu)
5456 		return -ENODEV;
5457 
5458 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5459 		return -EINVAL;
5460 
5461 	spin_lock_irqsave(&device_domain_lock, flags);
5462 	spin_lock(&iommu->lock);
5463 	info = get_domain_info(dev);
5464 	if (!info) {
5465 		ret = -EINVAL;
5466 		goto out_unlock;
5467 	}
5468 	did = dmar_domain->iommu_did[iommu->seq_id];
5469 	sid = PCI_DEVID(bus, devfn);
5470 
5471 	/* Size is only valid in address selective invalidation */
5472 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5473 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5474 				   inv_info->granu.addr_info.nb_granules);
5475 
5476 	for_each_set_bit(cache_type,
5477 			 (unsigned long *)&inv_info->cache,
5478 			 IOMMU_CACHE_INV_TYPE_NR) {
5479 		int granu = 0;
5480 		u64 pasid = 0;
5481 		u64 addr = 0;
5482 
5483 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5484 		if (granu == -EINVAL) {
5485 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5486 					   cache_type, inv_info->granularity);
5487 			break;
5488 		}
5489 
5490 		/*
5491 		 * PASID is stored in different locations based on the
5492 		 * granularity.
5493 		 */
5494 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5495 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5496 			pasid = inv_info->granu.pasid_info.pasid;
5497 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5498 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5499 			pasid = inv_info->granu.addr_info.pasid;
5500 
5501 		switch (BIT(cache_type)) {
5502 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5503 			/* HW will ignore LSB bits based on address mask */
5504 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5505 			    size &&
5506 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5507 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5508 						   inv_info->granu.addr_info.addr, size);
5509 			}
5510 
5511 			/*
5512 			 * If granu is PASID-selective, address is ignored.
5513 			 * We use npages = -1 to indicate that.
5514 			 */
5515 			qi_flush_piotlb(iommu, did, pasid,
5516 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5517 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5518 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5519 
5520 			if (!info->ats_enabled)
5521 				break;
5522 			/*
5523 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5524 			 * in the guest may assume IOTLB flush is inclusive,
5525 			 * which is more efficient.
5526 			 */
5527 			fallthrough;
5528 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5529 			/*
5530 			 * PASID based device TLB invalidation does not support
5531 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5532 			 * IOMMU_INV_GRANU_ADDR.
5533 			 * The equivalent of that is we set the size to be the
5534 			 * entire range of 64 bit. User only provides PASID info
5535 			 * without address info. So we set addr to 0.
5536 			 */
5537 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5538 				size = 64 - VTD_PAGE_SHIFT;
5539 				addr = 0;
5540 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5541 				addr = inv_info->granu.addr_info.addr;
5542 			}
5543 
5544 			if (info->ats_enabled)
5545 				qi_flush_dev_iotlb_pasid(iommu, sid,
5546 						info->pfsid, pasid,
5547 						info->ats_qdep, addr,
5548 						size);
5549 			else
5550 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5551 			break;
5552 		default:
5553 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5554 					    cache_type);
5555 			ret = -EINVAL;
5556 		}
5557 	}
5558 out_unlock:
5559 	spin_unlock(&iommu->lock);
5560 	spin_unlock_irqrestore(&device_domain_lock, flags);
5561 
5562 	return ret;
5563 }
5564 #endif
5565 
5566 static int intel_iommu_map(struct iommu_domain *domain,
5567 			   unsigned long iova, phys_addr_t hpa,
5568 			   size_t size, int iommu_prot, gfp_t gfp)
5569 {
5570 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5571 	u64 max_addr;
5572 	int prot = 0;
5573 	int ret;
5574 
5575 	if (iommu_prot & IOMMU_READ)
5576 		prot |= DMA_PTE_READ;
5577 	if (iommu_prot & IOMMU_WRITE)
5578 		prot |= DMA_PTE_WRITE;
5579 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5580 		prot |= DMA_PTE_SNP;
5581 
5582 	max_addr = iova + size;
5583 	if (dmar_domain->max_addr < max_addr) {
5584 		u64 end;
5585 
5586 		/* check if minimum agaw is sufficient for mapped address */
5587 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5588 		if (end < max_addr) {
5589 			pr_err("%s: iommu width (%d) is not "
5590 			       "sufficient for the mapped address (%llx)\n",
5591 			       __func__, dmar_domain->gaw, max_addr);
5592 			return -EFAULT;
5593 		}
5594 		dmar_domain->max_addr = max_addr;
5595 	}
5596 	/* Round up size to next multiple of PAGE_SIZE, if it and
5597 	   the low bits of hpa would take us onto the next page */
5598 	size = aligned_nrpages(hpa, size);
5599 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5600 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5601 	return ret;
5602 }
5603 
5604 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5605 				unsigned long iova, size_t size,
5606 				struct iommu_iotlb_gather *gather)
5607 {
5608 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5609 	struct page *freelist = NULL;
5610 	unsigned long start_pfn, last_pfn;
5611 	unsigned int npages;
5612 	int iommu_id, level = 0;
5613 
5614 	/* Cope with horrid API which requires us to unmap more than the
5615 	   size argument if it happens to be a large-page mapping. */
5616 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5617 
5618 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5619 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5620 
5621 	start_pfn = iova >> VTD_PAGE_SHIFT;
5622 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5623 
5624 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5625 
5626 	npages = last_pfn - start_pfn + 1;
5627 
5628 	for_each_domain_iommu(iommu_id, dmar_domain)
5629 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5630 				      start_pfn, npages, !freelist, 0);
5631 
5632 	dma_free_pagelist(freelist);
5633 
5634 	if (dmar_domain->max_addr == iova + size)
5635 		dmar_domain->max_addr = iova;
5636 
5637 	return size;
5638 }
5639 
5640 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5641 					    dma_addr_t iova)
5642 {
5643 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5644 	struct dma_pte *pte;
5645 	int level = 0;
5646 	u64 phys = 0;
5647 
5648 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5649 	if (pte && dma_pte_present(pte))
5650 		phys = dma_pte_addr(pte) +
5651 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5652 						VTD_PAGE_SHIFT) - 1));
5653 
5654 	return phys;
5655 }
5656 
5657 static inline bool scalable_mode_support(void)
5658 {
5659 	struct dmar_drhd_unit *drhd;
5660 	struct intel_iommu *iommu;
5661 	bool ret = true;
5662 
5663 	rcu_read_lock();
5664 	for_each_active_iommu(iommu, drhd) {
5665 		if (!sm_supported(iommu)) {
5666 			ret = false;
5667 			break;
5668 		}
5669 	}
5670 	rcu_read_unlock();
5671 
5672 	return ret;
5673 }
5674 
5675 static inline bool iommu_pasid_support(void)
5676 {
5677 	struct dmar_drhd_unit *drhd;
5678 	struct intel_iommu *iommu;
5679 	bool ret = true;
5680 
5681 	rcu_read_lock();
5682 	for_each_active_iommu(iommu, drhd) {
5683 		if (!pasid_supported(iommu)) {
5684 			ret = false;
5685 			break;
5686 		}
5687 	}
5688 	rcu_read_unlock();
5689 
5690 	return ret;
5691 }
5692 
5693 static inline bool nested_mode_support(void)
5694 {
5695 	struct dmar_drhd_unit *drhd;
5696 	struct intel_iommu *iommu;
5697 	bool ret = true;
5698 
5699 	rcu_read_lock();
5700 	for_each_active_iommu(iommu, drhd) {
5701 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5702 			ret = false;
5703 			break;
5704 		}
5705 	}
5706 	rcu_read_unlock();
5707 
5708 	return ret;
5709 }
5710 
5711 static bool intel_iommu_capable(enum iommu_cap cap)
5712 {
5713 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5714 		return domain_update_iommu_snooping(NULL) == 1;
5715 	if (cap == IOMMU_CAP_INTR_REMAP)
5716 		return irq_remapping_enabled == 1;
5717 
5718 	return false;
5719 }
5720 
5721 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5722 {
5723 	struct intel_iommu *iommu;
5724 
5725 	iommu = device_to_iommu(dev, NULL, NULL);
5726 	if (!iommu)
5727 		return ERR_PTR(-ENODEV);
5728 
5729 	if (translation_pre_enabled(iommu))
5730 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5731 
5732 	return &iommu->iommu;
5733 }
5734 
5735 static void intel_iommu_release_device(struct device *dev)
5736 {
5737 	struct intel_iommu *iommu;
5738 
5739 	iommu = device_to_iommu(dev, NULL, NULL);
5740 	if (!iommu)
5741 		return;
5742 
5743 	dmar_remove_one_dev_info(dev);
5744 
5745 	set_dma_ops(dev, NULL);
5746 }
5747 
5748 static void intel_iommu_probe_finalize(struct device *dev)
5749 {
5750 	struct iommu_domain *domain;
5751 
5752 	domain = iommu_get_domain_for_dev(dev);
5753 	if (device_needs_bounce(dev))
5754 		set_dma_ops(dev, &bounce_dma_ops);
5755 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5756 		set_dma_ops(dev, &intel_dma_ops);
5757 	else
5758 		set_dma_ops(dev, NULL);
5759 }
5760 
5761 static void intel_iommu_get_resv_regions(struct device *device,
5762 					 struct list_head *head)
5763 {
5764 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5765 	struct iommu_resv_region *reg;
5766 	struct dmar_rmrr_unit *rmrr;
5767 	struct device *i_dev;
5768 	int i;
5769 
5770 	down_read(&dmar_global_lock);
5771 	for_each_rmrr_units(rmrr) {
5772 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5773 					  i, i_dev) {
5774 			struct iommu_resv_region *resv;
5775 			enum iommu_resv_type type;
5776 			size_t length;
5777 
5778 			if (i_dev != device &&
5779 			    !is_downstream_to_pci_bridge(device, i_dev))
5780 				continue;
5781 
5782 			length = rmrr->end_address - rmrr->base_address + 1;
5783 
5784 			type = device_rmrr_is_relaxable(device) ?
5785 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5786 
5787 			resv = iommu_alloc_resv_region(rmrr->base_address,
5788 						       length, prot, type);
5789 			if (!resv)
5790 				break;
5791 
5792 			list_add_tail(&resv->list, head);
5793 		}
5794 	}
5795 	up_read(&dmar_global_lock);
5796 
5797 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5798 	if (dev_is_pci(device)) {
5799 		struct pci_dev *pdev = to_pci_dev(device);
5800 
5801 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5802 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5803 						   IOMMU_RESV_DIRECT_RELAXABLE);
5804 			if (reg)
5805 				list_add_tail(&reg->list, head);
5806 		}
5807 	}
5808 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5809 
5810 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5811 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5812 				      0, IOMMU_RESV_MSI);
5813 	if (!reg)
5814 		return;
5815 	list_add_tail(&reg->list, head);
5816 }
5817 
5818 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5819 {
5820 	struct device_domain_info *info;
5821 	struct context_entry *context;
5822 	struct dmar_domain *domain;
5823 	unsigned long flags;
5824 	u64 ctx_lo;
5825 	int ret;
5826 
5827 	domain = find_domain(dev);
5828 	if (!domain)
5829 		return -EINVAL;
5830 
5831 	spin_lock_irqsave(&device_domain_lock, flags);
5832 	spin_lock(&iommu->lock);
5833 
5834 	ret = -EINVAL;
5835 	info = get_domain_info(dev);
5836 	if (!info || !info->pasid_supported)
5837 		goto out;
5838 
5839 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5840 	if (WARN_ON(!context))
5841 		goto out;
5842 
5843 	ctx_lo = context[0].lo;
5844 
5845 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5846 		ctx_lo |= CONTEXT_PASIDE;
5847 		context[0].lo = ctx_lo;
5848 		wmb();
5849 		iommu->flush.flush_context(iommu,
5850 					   domain->iommu_did[iommu->seq_id],
5851 					   PCI_DEVID(info->bus, info->devfn),
5852 					   DMA_CCMD_MASK_NOBIT,
5853 					   DMA_CCMD_DEVICE_INVL);
5854 	}
5855 
5856 	/* Enable PASID support in the device, if it wasn't already */
5857 	if (!info->pasid_enabled)
5858 		iommu_enable_dev_iotlb(info);
5859 
5860 	ret = 0;
5861 
5862  out:
5863 	spin_unlock(&iommu->lock);
5864 	spin_unlock_irqrestore(&device_domain_lock, flags);
5865 
5866 	return ret;
5867 }
5868 
5869 static void intel_iommu_apply_resv_region(struct device *dev,
5870 					  struct iommu_domain *domain,
5871 					  struct iommu_resv_region *region)
5872 {
5873 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5874 	unsigned long start, end;
5875 
5876 	start = IOVA_PFN(region->start);
5877 	end   = IOVA_PFN(region->start + region->length - 1);
5878 
5879 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5880 }
5881 
5882 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5883 {
5884 	if (dev_is_pci(dev))
5885 		return pci_device_group(dev);
5886 	return generic_device_group(dev);
5887 }
5888 
5889 static int intel_iommu_enable_auxd(struct device *dev)
5890 {
5891 	struct device_domain_info *info;
5892 	struct intel_iommu *iommu;
5893 	unsigned long flags;
5894 	int ret;
5895 
5896 	iommu = device_to_iommu(dev, NULL, NULL);
5897 	if (!iommu || dmar_disabled)
5898 		return -EINVAL;
5899 
5900 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5901 		return -EINVAL;
5902 
5903 	ret = intel_iommu_enable_pasid(iommu, dev);
5904 	if (ret)
5905 		return -ENODEV;
5906 
5907 	spin_lock_irqsave(&device_domain_lock, flags);
5908 	info = get_domain_info(dev);
5909 	info->auxd_enabled = 1;
5910 	spin_unlock_irqrestore(&device_domain_lock, flags);
5911 
5912 	return 0;
5913 }
5914 
5915 static int intel_iommu_disable_auxd(struct device *dev)
5916 {
5917 	struct device_domain_info *info;
5918 	unsigned long flags;
5919 
5920 	spin_lock_irqsave(&device_domain_lock, flags);
5921 	info = get_domain_info(dev);
5922 	if (!WARN_ON(!info))
5923 		info->auxd_enabled = 0;
5924 	spin_unlock_irqrestore(&device_domain_lock, flags);
5925 
5926 	return 0;
5927 }
5928 
5929 /*
5930  * A PCI express designated vendor specific extended capability is defined
5931  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5932  * for system software and tools to detect endpoint devices supporting the
5933  * Intel scalable IO virtualization without host driver dependency.
5934  *
5935  * Returns the address of the matching extended capability structure within
5936  * the device's PCI configuration space or 0 if the device does not support
5937  * it.
5938  */
5939 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5940 {
5941 	int pos;
5942 	u16 vendor, id;
5943 
5944 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5945 	while (pos) {
5946 		pci_read_config_word(pdev, pos + 4, &vendor);
5947 		pci_read_config_word(pdev, pos + 8, &id);
5948 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5949 			return pos;
5950 
5951 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5952 	}
5953 
5954 	return 0;
5955 }
5956 
5957 static bool
5958 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5959 {
5960 	if (feat == IOMMU_DEV_FEAT_AUX) {
5961 		int ret;
5962 
5963 		if (!dev_is_pci(dev) || dmar_disabled ||
5964 		    !scalable_mode_support() || !iommu_pasid_support())
5965 			return false;
5966 
5967 		ret = pci_pasid_features(to_pci_dev(dev));
5968 		if (ret < 0)
5969 			return false;
5970 
5971 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5972 	}
5973 
5974 	if (feat == IOMMU_DEV_FEAT_SVA) {
5975 		struct device_domain_info *info = get_domain_info(dev);
5976 
5977 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5978 			info->pasid_supported && info->pri_supported &&
5979 			info->ats_supported;
5980 	}
5981 
5982 	return false;
5983 }
5984 
5985 static int
5986 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5987 {
5988 	if (feat == IOMMU_DEV_FEAT_AUX)
5989 		return intel_iommu_enable_auxd(dev);
5990 
5991 	if (feat == IOMMU_DEV_FEAT_SVA) {
5992 		struct device_domain_info *info = get_domain_info(dev);
5993 
5994 		if (!info)
5995 			return -EINVAL;
5996 
5997 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5998 			return 0;
5999 	}
6000 
6001 	return -ENODEV;
6002 }
6003 
6004 static int
6005 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6006 {
6007 	if (feat == IOMMU_DEV_FEAT_AUX)
6008 		return intel_iommu_disable_auxd(dev);
6009 
6010 	return -ENODEV;
6011 }
6012 
6013 static bool
6014 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6015 {
6016 	struct device_domain_info *info = get_domain_info(dev);
6017 
6018 	if (feat == IOMMU_DEV_FEAT_AUX)
6019 		return scalable_mode_support() && info && info->auxd_enabled;
6020 
6021 	return false;
6022 }
6023 
6024 static int
6025 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6026 {
6027 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6028 
6029 	return dmar_domain->default_pasid > 0 ?
6030 			dmar_domain->default_pasid : -EINVAL;
6031 }
6032 
6033 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6034 					   struct device *dev)
6035 {
6036 	return attach_deferred(dev);
6037 }
6038 
6039 static int
6040 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6041 			    enum iommu_attr attr, void *data)
6042 {
6043 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6044 	unsigned long flags;
6045 	int ret = 0;
6046 
6047 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6048 		return -EINVAL;
6049 
6050 	switch (attr) {
6051 	case DOMAIN_ATTR_NESTING:
6052 		spin_lock_irqsave(&device_domain_lock, flags);
6053 		if (nested_mode_support() &&
6054 		    list_empty(&dmar_domain->devices)) {
6055 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6056 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6057 		} else {
6058 			ret = -ENODEV;
6059 		}
6060 		spin_unlock_irqrestore(&device_domain_lock, flags);
6061 		break;
6062 	default:
6063 		ret = -EINVAL;
6064 		break;
6065 	}
6066 
6067 	return ret;
6068 }
6069 
6070 /*
6071  * Check that the device does not live on an external facing PCI port that is
6072  * marked as untrusted. Such devices should not be able to apply quirks and
6073  * thus not be able to bypass the IOMMU restrictions.
6074  */
6075 static bool risky_device(struct pci_dev *pdev)
6076 {
6077 	if (pdev->untrusted) {
6078 		pci_info(pdev,
6079 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6080 			 pdev->vendor, pdev->device);
6081 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6082 		return true;
6083 	}
6084 	return false;
6085 }
6086 
6087 const struct iommu_ops intel_iommu_ops = {
6088 	.capable		= intel_iommu_capable,
6089 	.domain_alloc		= intel_iommu_domain_alloc,
6090 	.domain_free		= intel_iommu_domain_free,
6091 	.domain_set_attr	= intel_iommu_domain_set_attr,
6092 	.attach_dev		= intel_iommu_attach_device,
6093 	.detach_dev		= intel_iommu_detach_device,
6094 	.aux_attach_dev		= intel_iommu_aux_attach_device,
6095 	.aux_detach_dev		= intel_iommu_aux_detach_device,
6096 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6097 	.map			= intel_iommu_map,
6098 	.unmap			= intel_iommu_unmap,
6099 	.iova_to_phys		= intel_iommu_iova_to_phys,
6100 	.probe_device		= intel_iommu_probe_device,
6101 	.probe_finalize		= intel_iommu_probe_finalize,
6102 	.release_device		= intel_iommu_release_device,
6103 	.get_resv_regions	= intel_iommu_get_resv_regions,
6104 	.put_resv_regions	= generic_iommu_put_resv_regions,
6105 	.apply_resv_region	= intel_iommu_apply_resv_region,
6106 	.device_group		= intel_iommu_device_group,
6107 	.dev_has_feat		= intel_iommu_dev_has_feat,
6108 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6109 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6110 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6111 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6112 	.def_domain_type	= device_def_domain_type,
6113 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6114 #ifdef CONFIG_INTEL_IOMMU_SVM
6115 	.cache_invalidate	= intel_iommu_sva_invalidate,
6116 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6117 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6118 	.sva_bind		= intel_svm_bind,
6119 	.sva_unbind		= intel_svm_unbind,
6120 	.sva_get_pasid		= intel_svm_get_pasid,
6121 	.page_response		= intel_svm_page_response,
6122 #endif
6123 };
6124 
6125 static void quirk_iommu_igfx(struct pci_dev *dev)
6126 {
6127 	if (risky_device(dev))
6128 		return;
6129 
6130 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6131 	dmar_map_gfx = 0;
6132 }
6133 
6134 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6135 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6136 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6142 
6143 /* Broadwell igfx malfunctions with dmar */
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6168 
6169 static void quirk_iommu_rwbf(struct pci_dev *dev)
6170 {
6171 	if (risky_device(dev))
6172 		return;
6173 
6174 	/*
6175 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6176 	 * but needs it. Same seems to hold for the desktop versions.
6177 	 */
6178 	pci_info(dev, "Forcing write-buffer flush capability\n");
6179 	rwbf_quirk = 1;
6180 }
6181 
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6187 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6188 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6189 
6190 #define GGC 0x52
6191 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6192 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6193 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6194 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6195 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6196 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6197 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6198 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6199 
6200 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6201 {
6202 	unsigned short ggc;
6203 
6204 	if (risky_device(dev))
6205 		return;
6206 
6207 	if (pci_read_config_word(dev, GGC, &ggc))
6208 		return;
6209 
6210 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6211 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6212 		dmar_map_gfx = 0;
6213 	} else if (dmar_map_gfx) {
6214 		/* we have to ensure the gfx device is idle before we flush */
6215 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6216 		intel_iommu_strict = 1;
6217        }
6218 }
6219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6223 
6224 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6225 {
6226 	unsigned short ver;
6227 
6228 	if (!IS_GFX_DEVICE(dev))
6229 		return;
6230 
6231 	ver = (dev->device >> 8) & 0xff;
6232 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6233 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6234 	    ver != 0x9a)
6235 		return;
6236 
6237 	if (risky_device(dev))
6238 		return;
6239 
6240 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6241 	iommu_skip_te_disable = 1;
6242 }
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6244 
6245 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6246    ISOCH DMAR unit for the Azalia sound device, but not give it any
6247    TLB entries, which causes it to deadlock. Check for that.  We do
6248    this in a function called from init_dmars(), instead of in a PCI
6249    quirk, because we don't want to print the obnoxious "BIOS broken"
6250    message if VT-d is actually disabled.
6251 */
6252 static void __init check_tylersburg_isoch(void)
6253 {
6254 	struct pci_dev *pdev;
6255 	uint32_t vtisochctrl;
6256 
6257 	/* If there's no Azalia in the system anyway, forget it. */
6258 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6259 	if (!pdev)
6260 		return;
6261 
6262 	if (risky_device(pdev)) {
6263 		pci_dev_put(pdev);
6264 		return;
6265 	}
6266 
6267 	pci_dev_put(pdev);
6268 
6269 	/* System Management Registers. Might be hidden, in which case
6270 	   we can't do the sanity check. But that's OK, because the
6271 	   known-broken BIOSes _don't_ actually hide it, so far. */
6272 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6273 	if (!pdev)
6274 		return;
6275 
6276 	if (risky_device(pdev)) {
6277 		pci_dev_put(pdev);
6278 		return;
6279 	}
6280 
6281 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6282 		pci_dev_put(pdev);
6283 		return;
6284 	}
6285 
6286 	pci_dev_put(pdev);
6287 
6288 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6289 	if (vtisochctrl & 1)
6290 		return;
6291 
6292 	/* Drop all bits other than the number of TLB entries */
6293 	vtisochctrl &= 0x1c;
6294 
6295 	/* If we have the recommended number of TLB entries (16), fine. */
6296 	if (vtisochctrl == 0x10)
6297 		return;
6298 
6299 	/* Zero TLB entries? You get to ride the short bus to school. */
6300 	if (!vtisochctrl) {
6301 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6302 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6303 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6304 		     dmi_get_system_info(DMI_BIOS_VERSION),
6305 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6306 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6307 		return;
6308 	}
6309 
6310 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6311 	       vtisochctrl);
6312 }
6313