xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 88f4ede4)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
53 #define ROOT_SIZE		VTD_PAGE_SIZE
54 #define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
61 #define IOAPIC_RANGE_START	(0xfee00000)
62 #define IOAPIC_RANGE_END	(0xfeefffff)
63 #define IOVA_START_ADDR		(0x1000)
64 
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66 
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69 
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72 
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
76 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
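
/*
 * Worked example (illustrative only): for a 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) == 2^36 - 1 and DOMAIN_MAX_ADDR(48) is the start of
 * the last 4KiB page below 2^48. On 32-bit kernels the min_t() clamp keeps
 * DOMAIN_MAX_PFN() within an unsigned long, as the comment above notes.
 */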
78 
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN		(1)
81 
82 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
83 
84 /* page table handling */
85 #define LEVEL_STRIDE		(9)
86 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
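
/*
 * Worked example (illustrative only): in the pgsize bitmap handed to the
 * IOMMU core, bit N set means "a mapping of size 2^N bytes is supported".
 * ~0xFFFUL clears bits 0-11 and sets every bit from 12 upwards, i.e. it
 * advertises 4KiB (bit 12), 8KiB, 16KiB and every larger power-of-two
 * size, which preserves the historical behaviour described above.
 */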
105 
106 static inline int agaw_to_level(int agaw)
107 {
108 	return agaw + 2;
109 }
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
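
/*
 * Worked example (illustrative only) for the helpers above, using the
 * common 4-level case: agaw 2 gives agaw_to_level() == 4 and
 * agaw_to_width() == min(30 + 2 * 9, 64) == 48 bits. For level 2,
 * level_to_offset_bits() == 9, so pfn_level_offset(pfn, 2) extracts bits
 * 9-17 of the page frame number, level_size(2) == 512 pages (2MiB) and
 * align_to_level(pfn, 2) rounds pfn up to the next 512-page boundary.
 */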
150 
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 	return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 	return page_to_dma_pfn(virt_to_page(p));
169 }
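
/*
 * Illustrative note: on x86 with 4KiB MM pages, PAGE_SHIFT equals
 * VTD_PAGE_SHIFT (both 12), so mm_to_dma_pfn() and dma_to_mm_pfn() are
 * identity conversions. On a hypothetical configuration with 64KiB MM
 * pages, each MM pfn would correspond to 16 consecutive VT-d pfns, which
 * is why the shifts above only work when VT-d pages are no larger than
 * MM pages.
 */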
170 
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173 
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176 
177 /*
178  * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
179  * (used when the kernel is launched with TXT).
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184 
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
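
/*
 * Illustrative sketch only (not code from this file): a legacy-mode
 * context entry is typically composed with the helpers above roughly as
 * follows, assuming 'pgd_phys' is the physical address of the page-table
 * root, 'did' the domain id and 'agaw' the address width to program:
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, did);
 *	context_set_address_width(context, agaw);
 *	context_set_address_root(context, pgd_phys);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * See domain_context_mapping_one() later in this file for the sequence
 * actually used by the driver.
 */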
289 
290 /*
291  * This domain is a static identity mapping domain.
292  *	1. This domain creates a static 1:1 mapping to all usable memory.
293  * 	2. It maps to each iommu if successful.
294  *	3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298 
299 #define for_each_domain_iommu(idx, domain)			\
300 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
301 		if (domain->iommu_refcnt[idx])
302 
303 struct dmar_rmrr_unit {
304 	struct list_head list;		/* list of rmrr units	*/
305 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
306 	u64	base_address;		/* reserved base address*/
307 	u64	end_address;		/* reserved end address */
308 	struct dmar_dev_scope *devices;	/* target devices */
309 	int	devices_cnt;		/* target device count */
310 };
311 
312 struct dmar_atsr_unit {
313 	struct list_head list;		/* list of ATSR units */
314 	struct acpi_dmar_header *hdr;	/* ACPI header */
315 	struct dmar_dev_scope *devices;	/* target devices */
316 	int devices_cnt;		/* target device count */
317 	u8 include_all:1;		/* include all ports */
318 };
319 
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322 
323 #define for_each_rmrr_units(rmrr) \
324 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325 
326 /* number of registered IOMMU units, used to size and index g_iommus[] */
327 static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343 
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349 
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352 
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360 
361 #define IDENTMAP_GFX		2
362 #define IDENTMAP_AZALIA		4
363 
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 	struct device_domain_info *info;
371 
372 	if (!dev)
373 		return NULL;
374 
375 	info = dev_iommu_priv_get(dev);
376 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 		return NULL;
378 
379 	return info;
380 }
381 
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384 
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
386 				to_pci_dev(d)->untrusted)
387 
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 				     void *data), void *data)
394 {
395 	int ret = 0;
396 	unsigned long flags;
397 	struct device_domain_info *info;
398 
399 	spin_lock_irqsave(&device_domain_lock, flags);
400 	list_for_each_entry(info, &device_domain_list, global) {
401 		ret = fn(info, data);
402 		if (ret) {
403 			spin_unlock_irqrestore(&device_domain_lock, flags);
404 			return ret;
405 		}
406 	}
407 	spin_unlock_irqrestore(&device_domain_lock, flags);
408 
409 	return 0;
410 }
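
/*
 * Illustrative (hypothetical) use of the iterator above: count the
 * tracked devices. The callback runs with device_domain_lock held and
 * returns 0 to continue the walk or non-zero to stop it:
 *
 *	static int count_one(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *	for_each_device_domain(count_one, &count);
 */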
411 
412 const struct iommu_ops intel_iommu_ops;
413 
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418 
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423 
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426 	u32 gsts;
427 
428 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 	if (gsts & DMA_GSTS_TES)
430 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432 
433 static int __init intel_iommu_setup(char *str)
434 {
435 	if (!str)
436 		return -EINVAL;
437 	while (*str) {
438 		if (!strncmp(str, "on", 2)) {
439 			dmar_disabled = 0;
440 			pr_info("IOMMU enabled\n");
441 		} else if (!strncmp(str, "off", 3)) {
442 			dmar_disabled = 1;
443 			no_platform_optin = 1;
444 			pr_info("IOMMU disabled\n");
445 		} else if (!strncmp(str, "igfx_off", 8)) {
446 			dmar_map_gfx = 0;
447 			pr_info("Disable GFX device mapping\n");
448 		} else if (!strncmp(str, "forcedac", 8)) {
449 			pr_info("Forcing DAC for PCI devices\n");
450 			dmar_forcedac = 1;
451 		} else if (!strncmp(str, "strict", 6)) {
452 			pr_info("Disable batched IOTLB flush\n");
453 			intel_iommu_strict = 1;
454 		} else if (!strncmp(str, "sp_off", 6)) {
455 			pr_info("Disable supported super page\n");
456 			intel_iommu_superpage = 0;
457 		} else if (!strncmp(str, "sm_on", 5)) {
458 			pr_info("Intel-IOMMU: scalable mode supported\n");
459 			intel_iommu_sm = 1;
460 		} else if (!strncmp(str, "tboot_noforce", 13)) {
461 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 			intel_iommu_tboot_noforce = 1;
463 		} else if (!strncmp(str, "nobounce", 8)) {
464 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 			intel_no_bounce = 1;
466 		}
467 
468 		str += strcspn(str, ",");
469 		while (*str == ',')
470 			str++;
471 	}
472 	return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
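
/*
 * Example (illustrative): booting with "intel_iommu=on,sm_on,strict"
 * enables the IOMMU, turns on scalable mode and disables batched IOTLB
 * flushing; the options are parsed above as a comma-separated list.
 */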
475 
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478 
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481 	struct dmar_domain **domains;
482 	int idx = did >> 8;
483 
484 	domains = iommu->domains[idx];
485 	if (!domains)
486 		return NULL;
487 
488 	return domains[did & 0xff];
489 }
490 
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 			     struct dmar_domain *domain)
493 {
494 	struct dmar_domain **domains;
495 	int idx = did >> 8;
496 
497 	if (!iommu->domains[idx]) {
498 		size_t size = 256 * sizeof(struct dmar_domain *);
499 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 	}
501 
502 	domains = iommu->domains[idx];
503 	if (WARN_ON(!domains))
504 		return;
505 	else
506 		domains[did & 0xff] = domain;
507 }
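
/*
 * Worked example (illustrative only): domain pointers are kept in
 * 256-entry chunks that are allocated on demand. For did == 0x1234,
 * idx == 0x12 selects the chunk and did & 0xff == 0x34 selects the slot
 * within it, in both the lookup and the store above.
 */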
508 
509 void *alloc_pgtable_page(int node)
510 {
511 	struct page *page;
512 	void *vaddr = NULL;
513 
514 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 	if (page)
516 		vaddr = page_address(page);
517 	return vaddr;
518 }
519 
520 void free_pgtable_page(void *vaddr)
521 {
522 	free_page((unsigned long)vaddr);
523 }
524 
525 static inline void *alloc_domain_mem(void)
526 {
527 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529 
530 static void free_domain_mem(void *vaddr)
531 {
532 	kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534 
535 static inline void * alloc_devinfo_mem(void)
536 {
537 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539 
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542 	kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544 
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549 
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554 
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 				       unsigned long pfn)
557 {
558 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559 
560 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562 
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565 	unsigned long sagaw;
566 	int agaw = -1;
567 
568 	sagaw = cap_sagaw(iommu->cap);
569 	for (agaw = width_to_agaw(max_gaw);
570 	     agaw >= 0; agaw--) {
571 		if (test_bit(agaw, &sagaw))
572 			break;
573 	}
574 
575 	return agaw;
576 }
577 
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585 
586 /*
587  * Calculate AGAW for each IOMMU.
588  * "SAGAW" may differ across IOMMUs; use a default AGAW, and fall back
589  * to a smaller supported AGAW for IOMMUs that don't support the default.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
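
/*
 * Worked example (illustrative only): DEFAULT_DOMAIN_ADDRESS_WIDTH is 57,
 * so width_to_agaw(57) == DIV_ROUND_UP(27, 9) == 3 (5-level). If the
 * hardware's SAGAW field only has bit 2 set (4-level, 48-bit), the loop
 * in __iommu_calculate_agaw() starts at agaw 3, finds that bit clear and
 * returns agaw 2.
 */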
595 
596 /* This function only returns a single IOMMU in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599 	int iommu_id;
600 
601 	/* si_domain and vm domain should not get here. */
602 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 		return NULL;
604 
605 	for_each_domain_iommu(iommu_id, domain)
606 		break;
607 
608 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 		return NULL;
610 
611 	return g_iommus[iommu_id];
612 }
613 
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616 	return sm_supported(iommu) ?
617 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619 
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622 	struct dmar_drhd_unit *drhd;
623 	struct intel_iommu *iommu;
624 	bool found = false;
625 	int i;
626 
627 	domain->iommu_coherency = 1;
628 
629 	for_each_domain_iommu(i, domain) {
630 		found = true;
631 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 			domain->iommu_coherency = 0;
633 			break;
634 		}
635 	}
636 	if (found)
637 		return;
638 
639 	/* No hardware attached; use lowest common denominator */
640 	rcu_read_lock();
641 	for_each_active_iommu(iommu, drhd) {
642 		if (!iommu_paging_structure_coherency(iommu)) {
643 			domain->iommu_coherency = 0;
644 			break;
645 		}
646 	}
647 	rcu_read_unlock();
648 }
649 
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652 	struct dmar_drhd_unit *drhd;
653 	struct intel_iommu *iommu;
654 	int ret = 1;
655 
656 	rcu_read_lock();
657 	for_each_active_iommu(iommu, drhd) {
658 		if (iommu != skip) {
659 			if (!ecap_sc_support(iommu->ecap)) {
660 				ret = 0;
661 				break;
662 			}
663 		}
664 	}
665 	rcu_read_unlock();
666 
667 	return ret;
668 }
669 
670 static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 					 struct intel_iommu *skip)
672 {
673 	struct dmar_drhd_unit *drhd;
674 	struct intel_iommu *iommu;
675 	int mask = 0x3;
676 
677 	if (!intel_iommu_superpage) {
678 		return 0;
679 	}
680 
681 	/* set iommu_superpage to the smallest common denominator */
682 	rcu_read_lock();
683 	for_each_active_iommu(iommu, drhd) {
684 		if (iommu != skip) {
685 			if (domain && domain_use_first_level(domain)) {
686 				if (!cap_fl1gp_support(iommu->cap))
687 					mask = 0x1;
688 			} else {
689 				mask &= cap_super_page_val(iommu->cap);
690 			}
691 
692 			if (!mask)
693 				break;
694 		}
695 	}
696 	rcu_read_unlock();
697 
698 	return fls(mask);
699 }
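
/*
 * Worked example (illustrative only): mask starts at 0x3, assuming both
 * 2MiB (bit 0) and 1GiB (bit 1) superpages. If one IOMMU reports only
 * 2MiB support, mask becomes 0x1 and fls(mask) == 1; if another reports
 * no superpage support at all, the function returns 0 and the domain
 * falls back to 4KiB mappings only.
 */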
700 
701 static int domain_update_device_node(struct dmar_domain *domain)
702 {
703 	struct device_domain_info *info;
704 	int nid = NUMA_NO_NODE;
705 
706 	assert_spin_locked(&device_domain_lock);
707 
708 	if (list_empty(&domain->devices))
709 		return NUMA_NO_NODE;
710 
711 	list_for_each_entry(info, &domain->devices, link) {
712 		if (!info->dev)
713 			continue;
714 
715 		/*
716 		 * There could be multiple device NUMA nodes, since devices within
717 		 * the same domain may sit behind different IOMMUs. There is no
718 		 * perfect answer in such a situation, so we use a first come,
719 		 * first served policy.
720 		 */
721 		nid = dev_to_node(info->dev);
722 		if (nid != NUMA_NO_NODE)
723 			break;
724 	}
725 
726 	return nid;
727 }
728 
729 /* Some capabilities may be different across iommus */
730 static void domain_update_iommu_cap(struct dmar_domain *domain)
731 {
732 	domain_update_iommu_coherency(domain);
733 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
734 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
735 
736 	/*
737 	 * If RHSA is missing, default to the device NUMA node as a
738 	 * fallback.
739 	 */
740 	if (domain->nid == NUMA_NO_NODE)
741 		domain->nid = domain_update_device_node(domain);
742 }
743 
744 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
745 					 u8 devfn, int alloc)
746 {
747 	struct root_entry *root = &iommu->root_entry[bus];
748 	struct context_entry *context;
749 	u64 *entry;
750 
751 	entry = &root->lo;
752 	if (sm_supported(iommu)) {
753 		if (devfn >= 0x80) {
754 			devfn -= 0x80;
755 			entry = &root->hi;
756 		}
757 		devfn *= 2;
758 	}
759 	if (*entry & 1)
760 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
761 	else {
762 		unsigned long phy_addr;
763 		if (!alloc)
764 			return NULL;
765 
766 		context = alloc_pgtable_page(iommu->node);
767 		if (!context)
768 			return NULL;
769 
770 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
771 		phy_addr = virt_to_phys((void *)context);
772 		*entry = phy_addr | 1;
773 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
774 	}
775 	return &context[devfn];
776 }
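
/*
 * Worked example (illustrative only): in scalable mode each half of a
 * root entry covers 128 devfns and context entries are twice as large.
 * For bus 0, devfn 0x90, the code above switches to root->hi, rebases
 * devfn to 0x10 and doubles it, so the returned pointer is
 * &context[0x20] within the upper-half context table.
 */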
777 
778 static bool attach_deferred(struct device *dev)
779 {
780 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
781 }
782 
783 /**
784  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
785  *				 sub-hierarchy of a candidate PCI-PCI bridge
786  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
787  * @bridge: the candidate PCI-PCI bridge
788  *
789  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
790  */
791 static bool
792 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
793 {
794 	struct pci_dev *pdev, *pbridge;
795 
796 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
797 		return false;
798 
799 	pdev = to_pci_dev(dev);
800 	pbridge = to_pci_dev(bridge);
801 
802 	if (pbridge->subordinate &&
803 	    pbridge->subordinate->number <= pdev->bus->number &&
804 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
805 		return true;
806 
807 	return false;
808 }
809 
810 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
811 {
812 	struct dmar_drhd_unit *drhd;
813 	u32 vtbar;
814 	int rc;
815 
816 	/* We know that this device on this chipset has its own IOMMU.
817 	 * If we find it under a different IOMMU, then the BIOS is lying
818 	 * to us. Hope that the IOMMU for this device is actually
819 	 * disabled, and it needs no translation...
820 	 */
821 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
822 	if (rc) {
823 		/* "can't" happen */
824 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
825 		return false;
826 	}
827 	vtbar &= 0xffff0000;
828 
829 	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
830 	drhd = dmar_find_matched_drhd_unit(pdev);
831 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
832 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
833 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
834 		return true;
835 	}
836 
837 	return false;
838 }
839 
840 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
841 {
842 	if (!iommu || iommu->drhd->ignored)
843 		return true;
844 
845 	if (dev_is_pci(dev)) {
846 		struct pci_dev *pdev = to_pci_dev(dev);
847 
848 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
849 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
850 		    quirk_ioat_snb_local_iommu(pdev))
851 			return true;
852 	}
853 
854 	return false;
855 }
856 
857 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
858 {
859 	struct dmar_drhd_unit *drhd = NULL;
860 	struct pci_dev *pdev = NULL;
861 	struct intel_iommu *iommu;
862 	struct device *tmp;
863 	u16 segment = 0;
864 	int i;
865 
866 	if (!dev)
867 		return NULL;
868 
869 	if (dev_is_pci(dev)) {
870 		struct pci_dev *pf_pdev;
871 
872 		pdev = pci_real_dma_dev(to_pci_dev(dev));
873 
874 		/* VFs aren't listed in scope tables; we need to look up
875 		 * the PF instead to find the IOMMU. */
876 		pf_pdev = pci_physfn(pdev);
877 		dev = &pf_pdev->dev;
878 		segment = pci_domain_nr(pdev->bus);
879 	} else if (has_acpi_companion(dev))
880 		dev = &ACPI_COMPANION(dev)->dev;
881 
882 	rcu_read_lock();
883 	for_each_iommu(iommu, drhd) {
884 		if (pdev && segment != drhd->segment)
885 			continue;
886 
887 		for_each_active_dev_scope(drhd->devices,
888 					  drhd->devices_cnt, i, tmp) {
889 			if (tmp == dev) {
890 				/* For a VF use its original BDF# not that of the PF
891 				 * which we used for the IOMMU lookup. Strictly speaking
892 				 * we could do this for all PCI devices; we only need to
893 				 * get the BDF# from the scope table for ACPI matches. */
894 				if (pdev && pdev->is_virtfn)
895 					goto got_pdev;
896 
897 				if (bus && devfn) {
898 					*bus = drhd->devices[i].bus;
899 					*devfn = drhd->devices[i].devfn;
900 				}
901 				goto out;
902 			}
903 
904 			if (is_downstream_to_pci_bridge(dev, tmp))
905 				goto got_pdev;
906 		}
907 
908 		if (pdev && drhd->include_all) {
909 		got_pdev:
910 			if (bus && devfn) {
911 				*bus = pdev->bus->number;
912 				*devfn = pdev->devfn;
913 			}
914 			goto out;
915 		}
916 	}
917 	iommu = NULL;
918  out:
919 	if (iommu_is_dummy(iommu, dev))
920 		iommu = NULL;
921 
922 	rcu_read_unlock();
923 
924 	return iommu;
925 }
926 
927 static void domain_flush_cache(struct dmar_domain *domain,
928 			       void *addr, int size)
929 {
930 	if (!domain->iommu_coherency)
931 		clflush_cache_range(addr, size);
932 }
933 
934 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
935 {
936 	struct context_entry *context;
937 	int ret = 0;
938 	unsigned long flags;
939 
940 	spin_lock_irqsave(&iommu->lock, flags);
941 	context = iommu_context_addr(iommu, bus, devfn, 0);
942 	if (context)
943 		ret = context_present(context);
944 	spin_unlock_irqrestore(&iommu->lock, flags);
945 	return ret;
946 }
947 
948 static void free_context_table(struct intel_iommu *iommu)
949 {
950 	int i;
951 	unsigned long flags;
952 	struct context_entry *context;
953 
954 	spin_lock_irqsave(&iommu->lock, flags);
955 	if (!iommu->root_entry) {
956 		goto out;
957 	}
958 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
959 		context = iommu_context_addr(iommu, i, 0, 0);
960 		if (context)
961 			free_pgtable_page(context);
962 
963 		if (!sm_supported(iommu))
964 			continue;
965 
966 		context = iommu_context_addr(iommu, i, 0x80, 0);
967 		if (context)
968 			free_pgtable_page(context);
969 
970 	}
971 	free_pgtable_page(iommu->root_entry);
972 	iommu->root_entry = NULL;
973 out:
974 	spin_unlock_irqrestore(&iommu->lock, flags);
975 }
976 
977 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
978 				      unsigned long pfn, int *target_level)
979 {
980 	struct dma_pte *parent, *pte;
981 	int level = agaw_to_level(domain->agaw);
982 	int offset;
983 
984 	BUG_ON(!domain->pgd);
985 
986 	if (!domain_pfn_supported(domain, pfn))
987 		/* Address beyond IOMMU's addressing capabilities. */
988 		return NULL;
989 
990 	parent = domain->pgd;
991 
992 	while (1) {
993 		void *tmp_page;
994 
995 		offset = pfn_level_offset(pfn, level);
996 		pte = &parent[offset];
997 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
998 			break;
999 		if (level == *target_level)
1000 			break;
1001 
1002 		if (!dma_pte_present(pte)) {
1003 			uint64_t pteval;
1004 
1005 			tmp_page = alloc_pgtable_page(domain->nid);
1006 
1007 			if (!tmp_page)
1008 				return NULL;
1009 
1010 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1011 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1012 			if (domain_use_first_level(domain))
1013 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1014 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1015 				/* Someone else set it while we were thinking; use theirs. */
1016 				free_pgtable_page(tmp_page);
1017 			else
1018 				domain_flush_cache(domain, pte, sizeof(*pte));
1019 		}
1020 		if (level == 1)
1021 			break;
1022 
1023 		parent = phys_to_virt(dma_pte_addr(pte));
1024 		level--;
1025 	}
1026 
1027 	if (!*target_level)
1028 		*target_level = level;
1029 
1030 	return pte;
1031 }
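
/*
 * Illustrative note on *target_level for the walk above: a caller that
 * passes *target_level == 0 gets whatever PTE already covers the pfn
 * (the walk stops at the first superpage or non-present entry and
 * reports the level it stopped at), while a caller asking for level 2
 * has any missing intermediate tables allocated so that a 2MiB
 * superpage entry can be installed at that level.
 */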
1032 
1033 /* return address's pte at specific level */
1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1035 					 unsigned long pfn,
1036 					 int level, int *large_page)
1037 {
1038 	struct dma_pte *parent, *pte;
1039 	int total = agaw_to_level(domain->agaw);
1040 	int offset;
1041 
1042 	parent = domain->pgd;
1043 	while (level <= total) {
1044 		offset = pfn_level_offset(pfn, total);
1045 		pte = &parent[offset];
1046 		if (level == total)
1047 			return pte;
1048 
1049 		if (!dma_pte_present(pte)) {
1050 			*large_page = total;
1051 			break;
1052 		}
1053 
1054 		if (dma_pte_superpage(pte)) {
1055 			*large_page = total;
1056 			return pte;
1057 		}
1058 
1059 		parent = phys_to_virt(dma_pte_addr(pte));
1060 		total--;
1061 	}
1062 	return NULL;
1063 }
1064 
1065 /* clear last level pte, a tlb flush should be followed */
1066 static void dma_pte_clear_range(struct dmar_domain *domain,
1067 				unsigned long start_pfn,
1068 				unsigned long last_pfn)
1069 {
1070 	unsigned int large_page;
1071 	struct dma_pte *first_pte, *pte;
1072 
1073 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075 	BUG_ON(start_pfn > last_pfn);
1076 
1077 	/* we don't need lock here; nobody else touches the iova range */
1078 	do {
1079 		large_page = 1;
1080 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1081 		if (!pte) {
1082 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1083 			continue;
1084 		}
1085 		do {
1086 			dma_clear_pte(pte);
1087 			start_pfn += lvl_to_nr_pages(large_page);
1088 			pte++;
1089 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1090 
1091 		domain_flush_cache(domain, first_pte,
1092 				   (void *)pte - (void *)first_pte);
1093 
1094 	} while (start_pfn && start_pfn <= last_pfn);
1095 }
1096 
1097 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1098 			       int retain_level, struct dma_pte *pte,
1099 			       unsigned long pfn, unsigned long start_pfn,
1100 			       unsigned long last_pfn)
1101 {
1102 	pfn = max(start_pfn, pfn);
1103 	pte = &pte[pfn_level_offset(pfn, level)];
1104 
1105 	do {
1106 		unsigned long level_pfn;
1107 		struct dma_pte *level_pte;
1108 
1109 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1110 			goto next;
1111 
1112 		level_pfn = pfn & level_mask(level);
1113 		level_pte = phys_to_virt(dma_pte_addr(pte));
1114 
1115 		if (level > 2) {
1116 			dma_pte_free_level(domain, level - 1, retain_level,
1117 					   level_pte, level_pfn, start_pfn,
1118 					   last_pfn);
1119 		}
1120 
1121 		/*
1122 		 * Free the page table if we're below the level we want to
1123 		 * retain and the range covers the entire table.
1124 		 */
1125 		if (level < retain_level && !(start_pfn > level_pfn ||
1126 		      last_pfn < level_pfn + level_size(level) - 1)) {
1127 			dma_clear_pte(pte);
1128 			domain_flush_cache(domain, pte, sizeof(*pte));
1129 			free_pgtable_page(level_pte);
1130 		}
1131 next:
1132 		pfn += level_size(level);
1133 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1134 }
1135 
1136 /*
1137  * clear last level (leaf) ptes and free page table pages below the
1138  * level we wish to keep intact.
1139  */
1140 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1141 				   unsigned long start_pfn,
1142 				   unsigned long last_pfn,
1143 				   int retain_level)
1144 {
1145 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1146 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1147 	BUG_ON(start_pfn > last_pfn);
1148 
1149 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1150 
1151 	/* We don't need lock here; nobody else touches the iova range */
1152 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1153 			   domain->pgd, 0, start_pfn, last_pfn);
1154 
1155 	/* free pgd */
1156 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1157 		free_pgtable_page(domain->pgd);
1158 		domain->pgd = NULL;
1159 	}
1160 }
1161 
1162 /* When a page at a given level is being unlinked from its parent, we don't
1163    need to *modify* it at all. All we need to do is make a list of all the
1164    pages which can be freed just as soon as we've flushed the IOTLB and we
1165    know the hardware page-walk will no longer touch them.
1166    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1167    be freed. */
1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1169 					    int level, struct dma_pte *pte,
1170 					    struct page *freelist)
1171 {
1172 	struct page *pg;
1173 
1174 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1175 	pg->freelist = freelist;
1176 	freelist = pg;
1177 
1178 	if (level == 1)
1179 		return freelist;
1180 
1181 	pte = page_address(pg);
1182 	do {
1183 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1184 			freelist = dma_pte_list_pagetables(domain, level - 1,
1185 							   pte, freelist);
1186 		pte++;
1187 	} while (!first_pte_in_page(pte));
1188 
1189 	return freelist;
1190 }
1191 
1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1193 					struct dma_pte *pte, unsigned long pfn,
1194 					unsigned long start_pfn,
1195 					unsigned long last_pfn,
1196 					struct page *freelist)
1197 {
1198 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1199 
1200 	pfn = max(start_pfn, pfn);
1201 	pte = &pte[pfn_level_offset(pfn, level)];
1202 
1203 	do {
1204 		unsigned long level_pfn;
1205 
1206 		if (!dma_pte_present(pte))
1207 			goto next;
1208 
1209 		level_pfn = pfn & level_mask(level);
1210 
1211 		/* If range covers entire pagetable, free it */
1212 		if (start_pfn <= level_pfn &&
1213 		    last_pfn >= level_pfn + level_size(level) - 1) {
1214 			/* These subordinate page tables are going away entirely. Don't
1215 			   bother to clear them; we're just going to *free* them. */
1216 			if (level > 1 && !dma_pte_superpage(pte))
1217 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1218 
1219 			dma_clear_pte(pte);
1220 			if (!first_pte)
1221 				first_pte = pte;
1222 			last_pte = pte;
1223 		} else if (level > 1) {
1224 			/* Recurse down into a level that isn't *entirely* obsolete */
1225 			freelist = dma_pte_clear_level(domain, level - 1,
1226 						       phys_to_virt(dma_pte_addr(pte)),
1227 						       level_pfn, start_pfn, last_pfn,
1228 						       freelist);
1229 		}
1230 next:
1231 		pfn += level_size(level);
1232 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1233 
1234 	if (first_pte)
1235 		domain_flush_cache(domain, first_pte,
1236 				   (void *)++last_pte - (void *)first_pte);
1237 
1238 	return freelist;
1239 }
1240 
1241 /* We can't just free the pages because the IOMMU may still be walking
1242    the page tables, and may have cached the intermediate levels. The
1243    pages can only be freed after the IOTLB flush has been done. */
1244 static struct page *domain_unmap(struct dmar_domain *domain,
1245 				 unsigned long start_pfn,
1246 				 unsigned long last_pfn)
1247 {
1248 	struct page *freelist;
1249 
1250 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1251 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1252 	BUG_ON(start_pfn > last_pfn);
1253 
1254 	/* we don't need lock here; nobody else touches the iova range */
1255 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1256 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1257 
1258 	/* free pgd */
1259 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1260 		struct page *pgd_page = virt_to_page(domain->pgd);
1261 		pgd_page->freelist = freelist;
1262 		freelist = pgd_page;
1263 
1264 		domain->pgd = NULL;
1265 	}
1266 
1267 	return freelist;
1268 }
1269 
1270 static void dma_free_pagelist(struct page *freelist)
1271 {
1272 	struct page *pg;
1273 
1274 	while ((pg = freelist)) {
1275 		freelist = pg->freelist;
1276 		free_pgtable_page(page_address(pg));
1277 	}
1278 }
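
/*
 * Typical (illustrative) use of the deferred-free protocol above: unlink
 * the page tables, flush the IOTLB so the hardware page walker can no
 * longer reach them, and only then free the pages. 'npages' below is a
 * placeholder for the size of the unmapped range:
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	iommu_flush_iotlb_psi(iommu, domain, start_pfn, npages, 0, 0);
 *	dma_free_pagelist(freelist);
 */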
1279 
1280 static void iova_entry_free(unsigned long data)
1281 {
1282 	struct page *freelist = (struct page *)data;
1283 
1284 	dma_free_pagelist(freelist);
1285 }
1286 
1287 /* iommu handling */
1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1289 {
1290 	struct root_entry *root;
1291 	unsigned long flags;
1292 
1293 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1294 	if (!root) {
1295 		pr_err("Allocating root entry for %s failed\n",
1296 			iommu->name);
1297 		return -ENOMEM;
1298 	}
1299 
1300 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1301 
1302 	spin_lock_irqsave(&iommu->lock, flags);
1303 	iommu->root_entry = root;
1304 	spin_unlock_irqrestore(&iommu->lock, flags);
1305 
1306 	return 0;
1307 }
1308 
1309 static void iommu_set_root_entry(struct intel_iommu *iommu)
1310 {
1311 	u64 addr;
1312 	u32 sts;
1313 	unsigned long flag;
1314 
1315 	addr = virt_to_phys(iommu->root_entry);
1316 	if (sm_supported(iommu))
1317 		addr |= DMA_RTADDR_SMT;
1318 
1319 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1321 
1322 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1323 
1324 	/* Make sure hardware complete it */
1325 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326 		      readl, (sts & DMA_GSTS_RTPS), sts);
1327 
1328 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330 
1331 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1332 {
1333 	u32 val;
1334 	unsigned long flag;
1335 
1336 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337 		return;
1338 
1339 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1341 
1342 	/* Make sure hardware complete it */
1343 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1344 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1345 
1346 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348 
1349 /* return value determines whether we need a write buffer flush */
1350 static void __iommu_flush_context(struct intel_iommu *iommu,
1351 				  u16 did, u16 source_id, u8 function_mask,
1352 				  u64 type)
1353 {
1354 	u64 val = 0;
1355 	unsigned long flag;
1356 
1357 	switch (type) {
1358 	case DMA_CCMD_GLOBAL_INVL:
1359 		val = DMA_CCMD_GLOBAL_INVL;
1360 		break;
1361 	case DMA_CCMD_DOMAIN_INVL:
1362 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1363 		break;
1364 	case DMA_CCMD_DEVICE_INVL:
1365 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1366 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1367 		break;
1368 	default:
1369 		BUG();
1370 	}
1371 	val |= DMA_CCMD_ICC;
1372 
1373 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1375 
1376 	/* Make sure hardware complete it */
1377 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1378 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1379 
1380 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 }
1382 
1383 /* return value determines whether we need a write buffer flush */
1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1385 				u64 addr, unsigned int size_order, u64 type)
1386 {
1387 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1388 	u64 val = 0, val_iva = 0;
1389 	unsigned long flag;
1390 
1391 	switch (type) {
1392 	case DMA_TLB_GLOBAL_FLUSH:
1393 		/* global flush doesn't need to set IVA_REG */
1394 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1395 		break;
1396 	case DMA_TLB_DSI_FLUSH:
1397 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1398 		break;
1399 	case DMA_TLB_PSI_FLUSH:
1400 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401 		/* IH bit is passed in as part of address */
1402 		val_iva = size_order | addr;
1403 		break;
1404 	default:
1405 		BUG();
1406 	}
1407 	/* Note: set drain read/write */
1408 #if 0
1409 	/*
1410 	 * This is probably just to be extra safe. It looks like we can
1411 	 * ignore it without any impact.
1412 	 */
1413 	if (cap_read_drain(iommu->cap))
1414 		val |= DMA_TLB_READ_DRAIN;
1415 #endif
1416 	if (cap_write_drain(iommu->cap))
1417 		val |= DMA_TLB_WRITE_DRAIN;
1418 
1419 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1420 	/* Note: Only uses first TLB reg currently */
1421 	if (val_iva)
1422 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1423 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1424 
1425 	/* Make sure hardware complete it */
1426 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1427 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1428 
1429 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430 
1431 	/* check IOTLB invalidation granularity */
1432 	if (DMA_TLB_IAIG(val) == 0)
1433 		pr_err("Flush IOTLB failed\n");
1434 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1435 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1436 			(unsigned long long)DMA_TLB_IIRG(type),
1437 			(unsigned long long)DMA_TLB_IAIG(val));
1438 }
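
/*
 * Worked example (illustrative only): a page-selective (PSI) flush of 16
 * pages starting at IOVA 0x200000 is issued with size_order == 4, so
 * val_iva == 0x200004; the hardware treats size_order as "invalidate
 * 2^size_order pages, naturally aligned to that size".
 */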
1439 
1440 static struct device_domain_info *
1441 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1442 			 u8 bus, u8 devfn)
1443 {
1444 	struct device_domain_info *info;
1445 
1446 	assert_spin_locked(&device_domain_lock);
1447 
1448 	if (!iommu->qi)
1449 		return NULL;
1450 
1451 	list_for_each_entry(info, &domain->devices, link)
1452 		if (info->iommu == iommu && info->bus == bus &&
1453 		    info->devfn == devfn) {
1454 			if (info->ats_supported && info->dev)
1455 				return info;
1456 			break;
1457 		}
1458 
1459 	return NULL;
1460 }
1461 
1462 static void domain_update_iotlb(struct dmar_domain *domain)
1463 {
1464 	struct device_domain_info *info;
1465 	bool has_iotlb_device = false;
1466 
1467 	assert_spin_locked(&device_domain_lock);
1468 
1469 	list_for_each_entry(info, &domain->devices, link) {
1470 		struct pci_dev *pdev;
1471 
1472 		if (!info->dev || !dev_is_pci(info->dev))
1473 			continue;
1474 
1475 		pdev = to_pci_dev(info->dev);
1476 		if (pdev->ats_enabled) {
1477 			has_iotlb_device = true;
1478 			break;
1479 		}
1480 	}
1481 
1482 	domain->has_iotlb_device = has_iotlb_device;
1483 }
1484 
1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1486 {
1487 	struct pci_dev *pdev;
1488 
1489 	assert_spin_locked(&device_domain_lock);
1490 
1491 	if (!info || !dev_is_pci(info->dev))
1492 		return;
1493 
1494 	pdev = to_pci_dev(info->dev);
1495 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1496 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1497 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1498 	 * reserved, which should be set to 0.
1499 	 */
1500 	if (!ecap_dit(info->iommu->ecap))
1501 		info->pfsid = 0;
1502 	else {
1503 		struct pci_dev *pf_pdev;
1504 
1505 		/* pdev will be returned if device is not a vf */
1506 		pf_pdev = pci_physfn(pdev);
1507 		info->pfsid = pci_dev_id(pf_pdev);
1508 	}
1509 
1510 #ifdef CONFIG_INTEL_IOMMU_SVM
1511 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1512 	   the device if you enable PASID support after ATS support is
1513 	   undefined. So always enable PASID support on devices which
1514 	   have it, even if we can't yet know if we're ever going to
1515 	   use it. */
1516 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1517 		info->pasid_enabled = 1;
1518 
1519 	if (info->pri_supported &&
1520 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1521 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1522 		info->pri_enabled = 1;
1523 #endif
1524 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1525 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1526 		info->ats_enabled = 1;
1527 		domain_update_iotlb(info->domain);
1528 		info->ats_qdep = pci_ats_queue_depth(pdev);
1529 	}
1530 }
1531 
1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1533 {
1534 	struct pci_dev *pdev;
1535 
1536 	assert_spin_locked(&device_domain_lock);
1537 
1538 	if (!dev_is_pci(info->dev))
1539 		return;
1540 
1541 	pdev = to_pci_dev(info->dev);
1542 
1543 	if (info->ats_enabled) {
1544 		pci_disable_ats(pdev);
1545 		info->ats_enabled = 0;
1546 		domain_update_iotlb(info->domain);
1547 	}
1548 #ifdef CONFIG_INTEL_IOMMU_SVM
1549 	if (info->pri_enabled) {
1550 		pci_disable_pri(pdev);
1551 		info->pri_enabled = 0;
1552 	}
1553 	if (info->pasid_enabled) {
1554 		pci_disable_pasid(pdev);
1555 		info->pasid_enabled = 0;
1556 	}
1557 #endif
1558 }
1559 
1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1561 				  u64 addr, unsigned mask)
1562 {
1563 	u16 sid, qdep;
1564 	unsigned long flags;
1565 	struct device_domain_info *info;
1566 
1567 	if (!domain->has_iotlb_device)
1568 		return;
1569 
1570 	spin_lock_irqsave(&device_domain_lock, flags);
1571 	list_for_each_entry(info, &domain->devices, link) {
1572 		if (!info->ats_enabled)
1573 			continue;
1574 
1575 		sid = info->bus << 8 | info->devfn;
1576 		qdep = info->ats_qdep;
1577 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578 				qdep, addr, mask);
1579 	}
1580 	spin_unlock_irqrestore(&device_domain_lock, flags);
1581 }
1582 
1583 static void domain_flush_piotlb(struct intel_iommu *iommu,
1584 				struct dmar_domain *domain,
1585 				u64 addr, unsigned long npages, bool ih)
1586 {
1587 	u16 did = domain->iommu_did[iommu->seq_id];
1588 
1589 	if (domain->default_pasid)
1590 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1591 				addr, npages, ih);
1592 
1593 	if (!list_empty(&domain->devices))
1594 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 }
1596 
1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1598 				  struct dmar_domain *domain,
1599 				  unsigned long pfn, unsigned int pages,
1600 				  int ih, int map)
1601 {
1602 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1603 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1604 	u16 did = domain->iommu_did[iommu->seq_id];
1605 
1606 	BUG_ON(pages == 0);
1607 
1608 	if (ih)
1609 		ih = 1 << 6;
1610 
1611 	if (domain_use_first_level(domain)) {
1612 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613 	} else {
1614 		/*
1615 		 * Fallback to domain selective flush if no PSI support or
1616 		 * the size is too big. PSI requires page size to be 2 ^ x,
1617 		 * and the base address is naturally aligned to the size.
1618 		 */
1619 		if (!cap_pgsel_inv(iommu->cap) ||
1620 		    mask > cap_max_amask_val(iommu->cap))
1621 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622 							DMA_TLB_DSI_FLUSH);
1623 		else
1624 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1625 							DMA_TLB_PSI_FLUSH);
1626 	}
1627 
1628 	/*
1629 	 * In caching mode, changes of pages from non-present to present require
1630 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1631 	 */
1632 	if (!cap_caching_mode(iommu->cap) || !map)
1633 		iommu_flush_dev_iotlb(domain, addr, mask);
1634 }
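
/*
 * Worked example (illustrative only): for pages == 10 the mask above is
 * ilog2(__roundup_pow_of_two(10)) == 4, so a naturally aligned 16-page
 * window is invalidated. If page-selective invalidation is unsupported
 * or mask exceeds cap_max_amask_val(), the code falls back to a
 * domain-selective flush instead.
 */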
1635 
1636 /* Notification for newly created mappings */
1637 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1638 					struct dmar_domain *domain,
1639 					unsigned long pfn, unsigned int pages)
1640 {
1641 	/*
1642 	 * It's a non-present to present mapping. Only flush if caching mode
1643 	 * is in use and the domain uses second-level translation.
1644 	 */
1645 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1646 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1647 	else
1648 		iommu_flush_write_buffer(iommu);
1649 }
1650 
1651 static void iommu_flush_iova(struct iova_domain *iovad)
1652 {
1653 	struct dmar_domain *domain;
1654 	int idx;
1655 
1656 	domain = container_of(iovad, struct dmar_domain, iovad);
1657 
1658 	for_each_domain_iommu(idx, domain) {
1659 		struct intel_iommu *iommu = g_iommus[idx];
1660 		u16 did = domain->iommu_did[iommu->seq_id];
1661 
1662 		if (domain_use_first_level(domain))
1663 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1664 		else
1665 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1666 						 DMA_TLB_DSI_FLUSH);
1667 
1668 		if (!cap_caching_mode(iommu->cap))
1669 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1670 					      0, MAX_AGAW_PFN_WIDTH);
1671 	}
1672 }
1673 
1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1675 {
1676 	u32 pmen;
1677 	unsigned long flags;
1678 
1679 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1680 		return;
1681 
1682 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1683 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1684 	pmen &= ~DMA_PMEN_EPM;
1685 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1686 
1687 	/* wait for the protected region status bit to clear */
1688 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1689 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1690 
1691 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1692 }
1693 
1694 static void iommu_enable_translation(struct intel_iommu *iommu)
1695 {
1696 	u32 sts;
1697 	unsigned long flags;
1698 
1699 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1700 	iommu->gcmd |= DMA_GCMD_TE;
1701 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1702 
1703 	/* Make sure hardware complete it */
1704 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1705 		      readl, (sts & DMA_GSTS_TES), sts);
1706 
1707 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709 
1710 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 {
1712 	u32 sts;
1713 	unsigned long flag;
1714 
1715 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1716 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1717 		return;
1718 
1719 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1720 	iommu->gcmd &= ~DMA_GCMD_TE;
1721 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1722 
1723 	/* Make sure hardware complete it */
1724 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1725 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1726 
1727 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1728 }
1729 
1730 static int iommu_init_domains(struct intel_iommu *iommu)
1731 {
1732 	u32 ndomains, nlongs;
1733 	size_t size;
1734 
1735 	ndomains = cap_ndoms(iommu->cap);
1736 	pr_debug("%s: Number of Domains supported <%d>\n",
1737 		 iommu->name, ndomains);
1738 	nlongs = BITS_TO_LONGS(ndomains);
1739 
1740 	spin_lock_init(&iommu->lock);
1741 
1742 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1743 	if (!iommu->domain_ids) {
1744 		pr_err("%s: Allocating domain id array failed\n",
1745 		       iommu->name);
1746 		return -ENOMEM;
1747 	}
1748 
1749 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1750 	iommu->domains = kzalloc(size, GFP_KERNEL);
1751 
1752 	if (iommu->domains) {
1753 		size = 256 * sizeof(struct dmar_domain *);
1754 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1755 	}
1756 
1757 	if (!iommu->domains || !iommu->domains[0]) {
1758 		pr_err("%s: Allocating domain array failed\n",
1759 		       iommu->name);
1760 		kfree(iommu->domain_ids);
1761 		kfree(iommu->domains);
1762 		iommu->domain_ids = NULL;
1763 		iommu->domains    = NULL;
1764 		return -ENOMEM;
1765 	}
1766 
1767 	/*
1768 	 * If Caching mode is set, then invalid translations are tagged
1769 	 * with domain-id 0, hence we need to pre-allocate it. We also
1770 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1771 	 * make sure it is not used for a real domain.
1772 	 */
1773 	set_bit(0, iommu->domain_ids);
1774 
1775 	/*
1776 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1777 	 * entry for first-level or pass-through translation modes should
1778 	 * be programmed with a domain id different from those used for
1779 	 * second-level or nested translation. We reserve a domain id for
1780 	 * this purpose.
1781 	 */
1782 	if (sm_supported(iommu))
1783 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784 
1785 	return 0;
1786 }
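
/*
 * Worked example (illustrative only): with cap_ndoms() == 256, the code
 * above allocates nlongs == 4 bitmap words (on 64-bit) and a single
 * 256-pointer chunk in iommu->domains. Domain id 0, and FLPT_DEFAULT_DID
 * in scalable mode, are reserved, leaving the remaining ids for real
 * domains.
 */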
1787 
1788 static void disable_dmar_iommu(struct intel_iommu *iommu)
1789 {
1790 	struct device_domain_info *info, *tmp;
1791 	unsigned long flags;
1792 
1793 	if (!iommu->domains || !iommu->domain_ids)
1794 		return;
1795 
1796 	spin_lock_irqsave(&device_domain_lock, flags);
1797 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1798 		if (info->iommu != iommu)
1799 			continue;
1800 
1801 		if (!info->dev || !info->domain)
1802 			continue;
1803 
1804 		__dmar_remove_one_dev_info(info);
1805 	}
1806 	spin_unlock_irqrestore(&device_domain_lock, flags);
1807 
1808 	if (iommu->gcmd & DMA_GCMD_TE)
1809 		iommu_disable_translation(iommu);
1810 }
1811 
1812 static void free_dmar_iommu(struct intel_iommu *iommu)
1813 {
1814 	if ((iommu->domains) && (iommu->domain_ids)) {
1815 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1816 		int i;
1817 
1818 		for (i = 0; i < elems; i++)
1819 			kfree(iommu->domains[i]);
1820 		kfree(iommu->domains);
1821 		kfree(iommu->domain_ids);
1822 		iommu->domains = NULL;
1823 		iommu->domain_ids = NULL;
1824 	}
1825 
1826 	g_iommus[iommu->seq_id] = NULL;
1827 
1828 	/* free context mapping */
1829 	free_context_table(iommu);
1830 
1831 #ifdef CONFIG_INTEL_IOMMU_SVM
1832 	if (pasid_supported(iommu)) {
1833 		if (ecap_prs(iommu->ecap))
1834 			intel_svm_finish_prq(iommu);
1835 	}
1836 	if (vccap_pasid(iommu->vccap))
1837 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1838 
1839 #endif
1840 }
1841 
1842 /*
1843  * Check and return whether first level is used by default for
1844  * DMA translation.
1845  */
1846 static bool first_level_by_default(void)
1847 {
1848 	struct dmar_drhd_unit *drhd;
1849 	struct intel_iommu *iommu;
1850 	static int first_level_support = -1;
1851 
1852 	if (likely(first_level_support != -1))
1853 		return first_level_support;
1854 
1855 	first_level_support = 1;
1856 
1857 	rcu_read_lock();
1858 	for_each_active_iommu(iommu, drhd) {
1859 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1860 			first_level_support = 0;
1861 			break;
1862 		}
1863 	}
1864 	rcu_read_unlock();
1865 
1866 	return first_level_support;
1867 }
1868 
1869 static struct dmar_domain *alloc_domain(int flags)
1870 {
1871 	struct dmar_domain *domain;
1872 
1873 	domain = alloc_domain_mem();
1874 	if (!domain)
1875 		return NULL;
1876 
1877 	memset(domain, 0, sizeof(*domain));
1878 	domain->nid = NUMA_NO_NODE;
1879 	domain->flags = flags;
1880 	if (first_level_by_default())
1881 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1882 	domain->has_iotlb_device = false;
1883 	INIT_LIST_HEAD(&domain->devices);
1884 
1885 	return domain;
1886 }
1887 
1888 /* Must be called with iommu->lock */
1889 static int domain_attach_iommu(struct dmar_domain *domain,
1890 			       struct intel_iommu *iommu)
1891 {
1892 	unsigned long ndomains;
1893 	int num;
1894 
1895 	assert_spin_locked(&device_domain_lock);
1896 	assert_spin_locked(&iommu->lock);
1897 
1898 	domain->iommu_refcnt[iommu->seq_id] += 1;
1899 	domain->iommu_count += 1;
1900 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1901 		ndomains = cap_ndoms(iommu->cap);
1902 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1903 
1904 		if (num >= ndomains) {
1905 			pr_err("%s: No free domain ids\n", iommu->name);
1906 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1907 			domain->iommu_count -= 1;
1908 			return -ENOSPC;
1909 		}
1910 
1911 		set_bit(num, iommu->domain_ids);
1912 		set_iommu_domain(iommu, num, domain);
1913 
1914 		domain->iommu_did[iommu->seq_id] = num;
1915 		domain->nid			 = iommu->node;
1916 
1917 		domain_update_iommu_cap(domain);
1918 	}
1919 
1920 	return 0;
1921 }
1922 
1923 static int domain_detach_iommu(struct dmar_domain *domain,
1924 			       struct intel_iommu *iommu)
1925 {
1926 	int num, count;
1927 
1928 	assert_spin_locked(&device_domain_lock);
1929 	assert_spin_locked(&iommu->lock);
1930 
1931 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1932 	count = --domain->iommu_count;
1933 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1934 		num = domain->iommu_did[iommu->seq_id];
1935 		clear_bit(num, iommu->domain_ids);
1936 		set_iommu_domain(iommu, num, NULL);
1937 
1938 		domain_update_iommu_cap(domain);
1939 		domain->iommu_did[iommu->seq_id] = 0;
1940 	}
1941 
1942 	return count;
1943 }
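
/*
 * Note on the attach/detach pairing above: the first domain_attach_iommu()
 * against a given IOMMU (refcnt 0 -> 1) allocates a domain id from
 * iommu->domain_ids and stores it in domain->iommu_did[]; the matching last
 * domain_detach_iommu() (refcnt 1 -> 0) releases that id again. The value
 * returned by domain_detach_iommu() is the remaining domain->iommu_count,
 * so 0 means the domain has no attachments left.
 */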
1944 
1945 static struct iova_domain reserved_iova_list;
1946 static struct lock_class_key reserved_rbtree_key;
1947 
1948 static int dmar_init_reserved_ranges(void)
1949 {
1950 	struct pci_dev *pdev = NULL;
1951 	struct iova *iova;
1952 	int i;
1953 
1954 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1955 
1956 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1957 		&reserved_rbtree_key);
1958 
1959 	/* IOAPIC ranges shouldn't be accessed by DMA */
1960 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1961 		IOVA_PFN(IOAPIC_RANGE_END));
1962 	if (!iova) {
1963 		pr_err("Reserve IOAPIC range failed\n");
1964 		return -ENODEV;
1965 	}
1966 
1967 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1968 	for_each_pci_dev(pdev) {
1969 		struct resource *r;
1970 
1971 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1972 			r = &pdev->resource[i];
1973 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1974 				continue;
1975 			iova = reserve_iova(&reserved_iova_list,
1976 					    IOVA_PFN(r->start),
1977 					    IOVA_PFN(r->end));
1978 			if (!iova) {
1979 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1980 				return -ENODEV;
1981 			}
1982 		}
1983 	}
1984 	return 0;
1985 }
1986 
1987 static inline int guestwidth_to_adjustwidth(int gaw)
1988 {
1989 	int agaw;
1990 	int r = (gaw - 12) % 9;
1991 
1992 	if (r == 0)
1993 		agaw = gaw;
1994 	else
1995 		agaw = gaw + 9 - r;
1996 	if (agaw > 64)
1997 		agaw = 64;
1998 	return agaw;
1999 }
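
/*
 * Rough worked examples for guestwidth_to_adjustwidth() above (illustrative
 * only): a guest address width that already lands on a page-table level
 * boundary is kept as-is, e.g. gaw = 48 -> (48 - 12) % 9 == 0 -> agaw = 48;
 * anything in between is rounded up to the next level boundary, e.g.
 * gaw = 40 -> r = 1 -> agaw = 40 + 9 - 1 = 48, and the result is capped
 * at 64.
 */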
2000 
2001 static void domain_exit(struct dmar_domain *domain)
2002 {
2003 
2004 	/* Remove associated devices and clear attached or cached domains */
2005 	domain_remove_dev_info(domain);
2006 
2007 	/* destroy iovas */
2008 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
2009 		put_iova_domain(&domain->iovad);
2010 
2011 	if (domain->pgd) {
2012 		struct page *freelist;
2013 
2014 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2015 		dma_free_pagelist(freelist);
2016 	}
2017 
2018 	free_domain_mem(domain);
2019 }
2020 
2021 /*
2022  * Get the PASID directory size for a scalable mode context entry.
2023  * A value of X in the PDTS field of a scalable mode context entry
2024  * indicates a PASID directory with 2^(X + 7) entries.
2025  */
2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2027 {
2028 	int pds, max_pde;
2029 
2030 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2031 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2032 	if (pds < 7)
2033 		return 0;
2034 
2035 	return pds - 7;
2036 }
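
/*
 * Worked example for context_get_sm_pds() above (illustrative, assuming
 * PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per directory entry): for
 * table->max_pasid == 0x100000 (2^20 PASIDs), max_pde == 2^14, the first
 * set bit is 14, so pds == 14 - 7 == 7 and the PDTS field advertises a
 * PASID directory of 2^(7 + 7) == 16384 entries.
 */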
2037 
2038 /*
2039  * Set the RID_PASID field of a scalable mode context entry. The
2040  * IOMMU hardware will use the PASID value set in this field for
2041  * DMA translations of DMA requests without PASID.
2042  */
2043 static inline void
2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2045 {
2046 	context->hi |= pasid & ((1 << 20) - 1);
2047 }
2048 
2049 /*
2050  * Set the DTE (Device-TLB Enable) field of a scalable mode context
2051  * entry.
2052  */
2053 static inline void context_set_sm_dte(struct context_entry *context)
2054 {
2055 	context->lo |= (1 << 2);
2056 }
2057 
2058 /*
2059  * Set the PRE (Page Request Enable) field of a scalable mode context
2060  * entry.
2061  */
2062 static inline void context_set_sm_pre(struct context_entry *context)
2063 {
2064 	context->lo |= (1 << 4);
2065 }
2066 
2067 /* Convert value to context PASID directory size field coding. */
2068 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2069 
2070 static int domain_context_mapping_one(struct dmar_domain *domain,
2071 				      struct intel_iommu *iommu,
2072 				      struct pasid_table *table,
2073 				      u8 bus, u8 devfn)
2074 {
2075 	u16 did = domain->iommu_did[iommu->seq_id];
2076 	int translation = CONTEXT_TT_MULTI_LEVEL;
2077 	struct device_domain_info *info = NULL;
2078 	struct context_entry *context;
2079 	unsigned long flags;
2080 	int ret;
2081 
2082 	WARN_ON(did == 0);
2083 
2084 	if (hw_pass_through && domain_type_is_si(domain))
2085 		translation = CONTEXT_TT_PASS_THROUGH;
2086 
2087 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2088 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2089 
2090 	BUG_ON(!domain->pgd);
2091 
2092 	spin_lock_irqsave(&device_domain_lock, flags);
2093 	spin_lock(&iommu->lock);
2094 
2095 	ret = -ENOMEM;
2096 	context = iommu_context_addr(iommu, bus, devfn, 1);
2097 	if (!context)
2098 		goto out_unlock;
2099 
2100 	ret = 0;
2101 	if (context_present(context))
2102 		goto out_unlock;
2103 
2104 	/*
2105 	 * For kdump cases, old valid entries may be cached due to
2106 	 * in-flight DMA and the copied pgtable, but there is no unmapping
2107 	 * behaviour for them, so we need an explicit cache flush for the
2108 	 * newly-mapped device. At this point in a kdump kernel the device
2109 	 * is expected to have finished its reset during driver probe, so
2110 	 * no in-flight DMA exists and no further flushing is needed
2111 	 * hereafter.
2112 	 */
2113 	if (context_copied(context)) {
2114 		u16 did_old = context_domain_id(context);
2115 
2116 		if (did_old < cap_ndoms(iommu->cap)) {
2117 			iommu->flush.flush_context(iommu, did_old,
2118 						   (((u16)bus) << 8) | devfn,
2119 						   DMA_CCMD_MASK_NOBIT,
2120 						   DMA_CCMD_DEVICE_INVL);
2121 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2122 						 DMA_TLB_DSI_FLUSH);
2123 		}
2124 	}
2125 
2126 	context_clear_entry(context);
2127 
2128 	if (sm_supported(iommu)) {
2129 		unsigned long pds;
2130 
2131 		WARN_ON(!table);
2132 
2133 		/* Setup the PASID DIR pointer: */
2134 		pds = context_get_sm_pds(table);
2135 		context->lo = (u64)virt_to_phys(table->table) |
2136 				context_pdts(pds);
2137 
2138 		/* Setup the RID_PASID field: */
2139 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2140 
2141 		/*
2142 		 * Set up the Device-TLB Enable bit and the Page
2143 		 * Request Enable bit:
2144 		 */
2145 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 		if (info && info->ats_supported)
2147 			context_set_sm_dte(context);
2148 		if (info && info->pri_supported)
2149 			context_set_sm_pre(context);
2150 	} else {
2151 		struct dma_pte *pgd = domain->pgd;
2152 		int agaw;
2153 
2154 		context_set_domain_id(context, did);
2155 
2156 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2157 			/*
2158 			 * Skip top levels of page tables for an IOMMU whose
2159 			 * agaw is smaller than the domain's. Unnecessary for PT mode.
2160 			 */
2161 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2162 				ret = -ENOMEM;
2163 				pgd = phys_to_virt(dma_pte_addr(pgd));
2164 				if (!dma_pte_present(pgd))
2165 					goto out_unlock;
2166 			}
2167 
2168 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2169 			if (info && info->ats_supported)
2170 				translation = CONTEXT_TT_DEV_IOTLB;
2171 			else
2172 				translation = CONTEXT_TT_MULTI_LEVEL;
2173 
2174 			context_set_address_root(context, virt_to_phys(pgd));
2175 			context_set_address_width(context, agaw);
2176 		} else {
2177 			/*
2178 			 * In pass-through mode, AW must be programmed to
2179 			 * indicate the largest AGAW value supported by
2180 			 * hardware, and ASR is ignored by hardware.
2181 			 */
2182 			context_set_address_width(context, iommu->msagaw);
2183 		}
2184 
2185 		context_set_translation_type(context, translation);
2186 	}
2187 
2188 	context_set_fault_enable(context);
2189 	context_set_present(context);
2190 	if (!ecap_coherent(iommu->ecap))
2191 		clflush_cache_range(context, sizeof(*context));
2192 
2193 	/*
2194 	 * It's a non-present to present mapping. If hardware doesn't cache
2195 	 * non-present entries, we only need to flush the write-buffer. If it
2196 	 * _does_ cache non-present entries, then it does so in the special
2197 	 * domain #0, which we have to flush:
2198 	 */
2199 	if (cap_caching_mode(iommu->cap)) {
2200 		iommu->flush.flush_context(iommu, 0,
2201 					   (((u16)bus) << 8) | devfn,
2202 					   DMA_CCMD_MASK_NOBIT,
2203 					   DMA_CCMD_DEVICE_INVL);
2204 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2205 	} else {
2206 		iommu_flush_write_buffer(iommu);
2207 	}
2208 	iommu_enable_dev_iotlb(info);
2209 
2210 	ret = 0;
2211 
2212 out_unlock:
2213 	spin_unlock(&iommu->lock);
2214 	spin_unlock_irqrestore(&device_domain_lock, flags);
2215 
2216 	return ret;
2217 }
2218 
2219 struct domain_context_mapping_data {
2220 	struct dmar_domain *domain;
2221 	struct intel_iommu *iommu;
2222 	struct pasid_table *table;
2223 };
2224 
2225 static int domain_context_mapping_cb(struct pci_dev *pdev,
2226 				     u16 alias, void *opaque)
2227 {
2228 	struct domain_context_mapping_data *data = opaque;
2229 
2230 	return domain_context_mapping_one(data->domain, data->iommu,
2231 					  data->table, PCI_BUS_NUM(alias),
2232 					  alias & 0xff);
2233 }
2234 
2235 static int
2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2237 {
2238 	struct domain_context_mapping_data data;
2239 	struct pasid_table *table;
2240 	struct intel_iommu *iommu;
2241 	u8 bus, devfn;
2242 
2243 	iommu = device_to_iommu(dev, &bus, &devfn);
2244 	if (!iommu)
2245 		return -ENODEV;
2246 
2247 	table = intel_pasid_get_table(dev);
2248 
2249 	if (!dev_is_pci(dev))
2250 		return domain_context_mapping_one(domain, iommu, table,
2251 						  bus, devfn);
2252 
2253 	data.domain = domain;
2254 	data.iommu = iommu;
2255 	data.table = table;
2256 
2257 	return pci_for_each_dma_alias(to_pci_dev(dev),
2258 				      &domain_context_mapping_cb, &data);
2259 }
2260 
2261 static int domain_context_mapped_cb(struct pci_dev *pdev,
2262 				    u16 alias, void *opaque)
2263 {
2264 	struct intel_iommu *iommu = opaque;
2265 
2266 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2267 }
2268 
2269 static int domain_context_mapped(struct device *dev)
2270 {
2271 	struct intel_iommu *iommu;
2272 	u8 bus, devfn;
2273 
2274 	iommu = device_to_iommu(dev, &bus, &devfn);
2275 	if (!iommu)
2276 		return -ENODEV;
2277 
2278 	if (!dev_is_pci(dev))
2279 		return device_context_mapped(iommu, bus, devfn);
2280 
2281 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2282 				       domain_context_mapped_cb, iommu);
2283 }
2284 
2285 /* Returns a number of VTD pages, but aligned to MM page size */
2286 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2287 					    size_t size)
2288 {
2289 	host_addr &= ~PAGE_MASK;
2290 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2291 }
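
/*
 * Worked example for aligned_nrpages() above (illustrative, assuming 4KiB
 * MM pages): host_addr = 0x1234, size = 0x2000 -> the in-page offset 0x234
 * is added back, PAGE_ALIGN(0x2234) == 0x3000, so the mapping needs three
 * VT-d pages even though the size alone would only cover two.
 */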
2292 
2293 /* Return largest possible superpage level for a given mapping */
2294 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2295 					  unsigned long iov_pfn,
2296 					  unsigned long phy_pfn,
2297 					  unsigned long pages)
2298 {
2299 	int support, level = 1;
2300 	unsigned long pfnmerge;
2301 
2302 	support = domain->iommu_superpage;
2303 
2304 	/* To use a large page, the virtual *and* physical addresses
2305 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2306 	   of them will mean we have to use smaller pages. So just
2307 	   merge them and check both at once. */
2308 	pfnmerge = iov_pfn | phy_pfn;
2309 
2310 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2311 		pages >>= VTD_STRIDE_SHIFT;
2312 		if (!pages)
2313 			break;
2314 		pfnmerge >>= VTD_STRIDE_SHIFT;
2315 		level++;
2316 		support--;
2317 	}
2318 	return level;
2319 }
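
/*
 * Worked example for hardware_largepage_caps() above (illustrative,
 * assuming VTD_STRIDE_SHIFT is 9): with iov_pfn and phy_pfn both
 * 2MiB-aligned (low 9 bits clear), at least 512 pages left to map and
 * domain->iommu_superpage >= 1, the loop runs once and returns level 2,
 * i.e. a 2MiB superpage can be used for the head of the mapping.
 */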
2320 
2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322 			    struct scatterlist *sg, unsigned long phys_pfn,
2323 			    unsigned long nr_pages, int prot)
2324 {
2325 	struct dma_pte *first_pte = NULL, *pte = NULL;
2326 	phys_addr_t pteval;
2327 	unsigned long sg_res = 0;
2328 	unsigned int largepage_lvl = 0;
2329 	unsigned long lvl_pages = 0;
2330 	u64 attr;
2331 
2332 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2333 
2334 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2335 		return -EINVAL;
2336 
2337 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2338 	if (domain_use_first_level(domain))
2339 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2340 
2341 	if (!sg) {
2342 		sg_res = nr_pages;
2343 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344 	}
2345 
2346 	while (nr_pages > 0) {
2347 		uint64_t tmp;
2348 
2349 		if (!sg_res) {
2350 			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2351 
2352 			sg_res = aligned_nrpages(sg->offset, sg->length);
2353 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2354 			sg->dma_length = sg->length;
2355 			pteval = (sg_phys(sg) - pgoff) | attr;
2356 			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2357 		}
2358 
2359 		if (!pte) {
2360 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2361 
2362 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2363 			if (!pte)
2364 				return -ENOMEM;
2365 			/* It is a large page */
2366 			if (largepage_lvl > 1) {
2367 				unsigned long nr_superpages, end_pfn;
2368 
2369 				pteval |= DMA_PTE_LARGE_PAGE;
2370 				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2371 
2372 				nr_superpages = sg_res / lvl_pages;
2373 				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2374 
2375 				/*
2376 				 * Ensure that old small page tables are
2377 				 * removed to make room for superpage(s).
2378 				 * We're adding new large pages, so make sure
2379 				 * we don't remove their parent tables.
2380 				 */
2381 				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2382 						       largepage_lvl + 1);
2383 			} else {
2384 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2385 			}
2386 
2387 		}
2388 		/* We don't need a lock here; nobody else
2389 		 * touches the iova range.
2390 		 */
2391 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2392 		if (tmp) {
2393 			static int dumps = 5;
2394 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2395 				iov_pfn, tmp, (unsigned long long)pteval);
2396 			if (dumps) {
2397 				dumps--;
2398 				debug_dma_dump_mappings(NULL);
2399 			}
2400 			WARN_ON(1);
2401 		}
2402 
2403 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2404 
2405 		BUG_ON(nr_pages < lvl_pages);
2406 		BUG_ON(sg_res < lvl_pages);
2407 
2408 		nr_pages -= lvl_pages;
2409 		iov_pfn += lvl_pages;
2410 		phys_pfn += lvl_pages;
2411 		pteval += lvl_pages * VTD_PAGE_SIZE;
2412 		sg_res -= lvl_pages;
2413 
2414 		/* If the next PTE would be the first in a new page, then we
2415 		   need to flush the cache on the entries we've just written.
2416 		   And then we'll need to recalculate 'pte', so clear it and
2417 		   let it get set again in the if (!pte) block above.
2418 
2419 		   If we're done (!nr_pages) we need to flush the cache too.
2420 
2421 		   Also if we've been setting superpages, we may need to
2422 		   recalculate 'pte' and switch back to smaller pages for the
2423 		   end of the mapping, if the trailing size is not enough to
2424 		   use another superpage (i.e. sg_res < lvl_pages). */
2425 		pte++;
2426 		if (!nr_pages || first_pte_in_page(pte) ||
2427 		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2428 			domain_flush_cache(domain, first_pte,
2429 					   (void *)pte - (void *)first_pte);
2430 			pte = NULL;
2431 		}
2432 
2433 		if (!sg_res && nr_pages)
2434 			sg = sg_next(sg);
2435 	}
2436 	return 0;
2437 }
2438 
2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2440 			  struct scatterlist *sg, unsigned long phys_pfn,
2441 			  unsigned long nr_pages, int prot)
2442 {
2443 	int iommu_id, ret;
2444 	struct intel_iommu *iommu;
2445 
2446 	/* Do the real mapping first */
2447 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2448 	if (ret)
2449 		return ret;
2450 
2451 	for_each_domain_iommu(iommu_id, domain) {
2452 		iommu = g_iommus[iommu_id];
2453 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2454 	}
2455 
2456 	return 0;
2457 }
2458 
2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2460 				    struct scatterlist *sg, unsigned long nr_pages,
2461 				    int prot)
2462 {
2463 	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2464 }
2465 
2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2467 				     unsigned long phys_pfn, unsigned long nr_pages,
2468 				     int prot)
2469 {
2470 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2471 }
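
/*
 * Usage sketch for the two wrappers above (illustrative only, not called
 * anywhere): mapping a single 4KiB page at IOVA 'iova' to physical address
 * 'paddr' for read/write would look roughly like
 *
 *	ret = domain_pfn_mapping(domain, iova >> VTD_PAGE_SHIFT,
 *				 paddr >> VTD_PAGE_SHIFT, 1,
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * which is essentially what __intel_map_single() below does after carving
 * the IOVA out of the domain's iova allocator.
 */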
2472 
2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2474 {
2475 	unsigned long flags;
2476 	struct context_entry *context;
2477 	u16 did_old;
2478 
2479 	if (!iommu)
2480 		return;
2481 
2482 	spin_lock_irqsave(&iommu->lock, flags);
2483 	context = iommu_context_addr(iommu, bus, devfn, 0);
2484 	if (!context) {
2485 		spin_unlock_irqrestore(&iommu->lock, flags);
2486 		return;
2487 	}
2488 	did_old = context_domain_id(context);
2489 	context_clear_entry(context);
2490 	__iommu_flush_cache(iommu, context, sizeof(*context));
2491 	spin_unlock_irqrestore(&iommu->lock, flags);
2492 	iommu->flush.flush_context(iommu,
2493 				   did_old,
2494 				   (((u16)bus) << 8) | devfn,
2495 				   DMA_CCMD_MASK_NOBIT,
2496 				   DMA_CCMD_DEVICE_INVL);
2497 	iommu->flush.flush_iotlb(iommu,
2498 				 did_old,
2499 				 0,
2500 				 0,
2501 				 DMA_TLB_DSI_FLUSH);
2502 }
2503 
2504 static inline void unlink_domain_info(struct device_domain_info *info)
2505 {
2506 	assert_spin_locked(&device_domain_lock);
2507 	list_del(&info->link);
2508 	list_del(&info->global);
2509 	if (info->dev)
2510 		dev_iommu_priv_set(info->dev, NULL);
2511 }
2512 
2513 static void domain_remove_dev_info(struct dmar_domain *domain)
2514 {
2515 	struct device_domain_info *info, *tmp;
2516 	unsigned long flags;
2517 
2518 	spin_lock_irqsave(&device_domain_lock, flags);
2519 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2520 		__dmar_remove_one_dev_info(info);
2521 	spin_unlock_irqrestore(&device_domain_lock, flags);
2522 }
2523 
2524 struct dmar_domain *find_domain(struct device *dev)
2525 {
2526 	struct device_domain_info *info;
2527 
2528 	if (unlikely(!dev || !dev->iommu))
2529 		return NULL;
2530 
2531 	if (unlikely(attach_deferred(dev)))
2532 		return NULL;
2533 
2534 	/* No lock here, assumes no domain exit in normal case */
2535 	info = get_domain_info(dev);
2536 	if (likely(info))
2537 		return info->domain;
2538 
2539 	return NULL;
2540 }
2541 
2542 static void do_deferred_attach(struct device *dev)
2543 {
2544 	struct iommu_domain *domain;
2545 
2546 	dev_iommu_priv_set(dev, NULL);
2547 	domain = iommu_get_domain_for_dev(dev);
2548 	if (domain)
2549 		intel_iommu_attach_device(domain, dev);
2550 }
2551 
2552 static inline struct device_domain_info *
2553 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2554 {
2555 	struct device_domain_info *info;
2556 
2557 	list_for_each_entry(info, &device_domain_list, global)
2558 		if (info->segment == segment && info->bus == bus &&
2559 		    info->devfn == devfn)
2560 			return info;
2561 
2562 	return NULL;
2563 }
2564 
2565 static int domain_setup_first_level(struct intel_iommu *iommu,
2566 				    struct dmar_domain *domain,
2567 				    struct device *dev,
2568 				    u32 pasid)
2569 {
2570 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2571 	struct dma_pte *pgd = domain->pgd;
2572 	int agaw, level;
2573 
2574 	/*
2575 	 * Skip top levels of page tables for an IOMMU whose
2576 	 * agaw is smaller than the domain's. Unnecessary for PT mode.
2577 	 */
2578 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2579 		pgd = phys_to_virt(dma_pte_addr(pgd));
2580 		if (!dma_pte_present(pgd))
2581 			return -ENOMEM;
2582 	}
2583 
2584 	level = agaw_to_level(agaw);
2585 	if (level != 4 && level != 5)
2586 		return -EINVAL;
2587 
2588 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2589 
2590 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2591 					     domain->iommu_did[iommu->seq_id],
2592 					     flags);
2593 }
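
/*
 * Worked example for the level check above (illustrative only): a 48-bit
 * adjusted guest address width maps to a 4-level table, so agaw_to_level()
 * returns 4 and PASID_FLAG_FL5LP stays clear; a 57-bit width maps to
 * 5 levels and sets PASID_FLAG_FL5LP. Any other level is rejected because
 * first-level translation only supports 4- or 5-level paging.
 */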
2594 
2595 static bool dev_is_real_dma_subdevice(struct device *dev)
2596 {
2597 	return dev && dev_is_pci(dev) &&
2598 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2599 }
2600 
2601 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2602 						    int bus, int devfn,
2603 						    struct device *dev,
2604 						    struct dmar_domain *domain)
2605 {
2606 	struct dmar_domain *found = NULL;
2607 	struct device_domain_info *info;
2608 	unsigned long flags;
2609 	int ret;
2610 
2611 	info = alloc_devinfo_mem();
2612 	if (!info)
2613 		return NULL;
2614 
2615 	if (!dev_is_real_dma_subdevice(dev)) {
2616 		info->bus = bus;
2617 		info->devfn = devfn;
2618 		info->segment = iommu->segment;
2619 	} else {
2620 		struct pci_dev *pdev = to_pci_dev(dev);
2621 
2622 		info->bus = pdev->bus->number;
2623 		info->devfn = pdev->devfn;
2624 		info->segment = pci_domain_nr(pdev->bus);
2625 	}
2626 
2627 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2628 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2629 	info->ats_qdep = 0;
2630 	info->dev = dev;
2631 	info->domain = domain;
2632 	info->iommu = iommu;
2633 	info->pasid_table = NULL;
2634 	info->auxd_enabled = 0;
2635 	INIT_LIST_HEAD(&info->auxiliary_domains);
2636 
2637 	if (dev && dev_is_pci(dev)) {
2638 		struct pci_dev *pdev = to_pci_dev(info->dev);
2639 
2640 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2641 		    pci_ats_supported(pdev) &&
2642 		    dmar_find_matched_atsr_unit(pdev))
2643 			info->ats_supported = 1;
2644 
2645 		if (sm_supported(iommu)) {
2646 			if (pasid_supported(iommu)) {
2647 				int features = pci_pasid_features(pdev);
2648 				if (features >= 0)
2649 					info->pasid_supported = features | 1;
2650 			}
2651 
2652 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2653 			    pci_pri_supported(pdev))
2654 				info->pri_supported = 1;
2655 		}
2656 	}
2657 
2658 	spin_lock_irqsave(&device_domain_lock, flags);
2659 	if (dev)
2660 		found = find_domain(dev);
2661 
2662 	if (!found) {
2663 		struct device_domain_info *info2;
2664 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2665 						       info->devfn);
2666 		if (info2) {
2667 			found      = info2->domain;
2668 			info2->dev = dev;
2669 		}
2670 	}
2671 
2672 	if (found) {
2673 		spin_unlock_irqrestore(&device_domain_lock, flags);
2674 		free_devinfo_mem(info);
2675 		/* Caller must free the original domain */
2676 		return found;
2677 	}
2678 
2679 	spin_lock(&iommu->lock);
2680 	ret = domain_attach_iommu(domain, iommu);
2681 	spin_unlock(&iommu->lock);
2682 
2683 	if (ret) {
2684 		spin_unlock_irqrestore(&device_domain_lock, flags);
2685 		free_devinfo_mem(info);
2686 		return NULL;
2687 	}
2688 
2689 	list_add(&info->link, &domain->devices);
2690 	list_add(&info->global, &device_domain_list);
2691 	if (dev)
2692 		dev_iommu_priv_set(dev, info);
2693 	spin_unlock_irqrestore(&device_domain_lock, flags);
2694 
2695 	/* PASID table is mandatory for a PCI device in scalable mode. */
2696 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2697 		ret = intel_pasid_alloc_table(dev);
2698 		if (ret) {
2699 			dev_err(dev, "PASID table allocation failed\n");
2700 			dmar_remove_one_dev_info(dev);
2701 			return NULL;
2702 		}
2703 
2704 		/* Setup the PASID entry for requests without PASID: */
2705 		spin_lock_irqsave(&iommu->lock, flags);
2706 		if (hw_pass_through && domain_type_is_si(domain))
2707 			ret = intel_pasid_setup_pass_through(iommu, domain,
2708 					dev, PASID_RID2PASID);
2709 		else if (domain_use_first_level(domain))
2710 			ret = domain_setup_first_level(iommu, domain, dev,
2711 					PASID_RID2PASID);
2712 		else
2713 			ret = intel_pasid_setup_second_level(iommu, domain,
2714 					dev, PASID_RID2PASID);
2715 		spin_unlock_irqrestore(&iommu->lock, flags);
2716 		if (ret) {
2717 			dev_err(dev, "Setup RID2PASID failed\n");
2718 			dmar_remove_one_dev_info(dev);
2719 			return NULL;
2720 		}
2721 	}
2722 
2723 	if (dev && domain_context_mapping(domain, dev)) {
2724 		dev_err(dev, "Domain context map failed\n");
2725 		dmar_remove_one_dev_info(dev);
2726 		return NULL;
2727 	}
2728 
2729 	return domain;
2730 }
2731 
2732 static int iommu_domain_identity_map(struct dmar_domain *domain,
2733 				     unsigned long first_vpfn,
2734 				     unsigned long last_vpfn)
2735 {
2736 	/*
2737 	 * The RMRR range might overlap with a physical memory range,
2738 	 * so clear it first.
2739 	 */
2740 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2741 
2742 	return __domain_mapping(domain, first_vpfn, NULL,
2743 				first_vpfn, last_vpfn - first_vpfn + 1,
2744 				DMA_PTE_READ|DMA_PTE_WRITE);
2745 }
2746 
2747 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2748 
2749 static int __init si_domain_init(int hw)
2750 {
2751 	struct dmar_rmrr_unit *rmrr;
2752 	struct device *dev;
2753 	int i, nid, ret;
2754 
2755 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2756 	if (!si_domain)
2757 		return -EFAULT;
2758 
2759 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2760 		domain_exit(si_domain);
2761 		return -EFAULT;
2762 	}
2763 
2764 	if (hw)
2765 		return 0;
2766 
2767 	for_each_online_node(nid) {
2768 		unsigned long start_pfn, end_pfn;
2769 		int i;
2770 
2771 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2772 			ret = iommu_domain_identity_map(si_domain,
2773 					mm_to_dma_pfn(start_pfn),
2774 					mm_to_dma_pfn(end_pfn));
2775 			if (ret)
2776 				return ret;
2777 		}
2778 	}
2779 
2780 	/*
2781 	 * Identity map the RMRRs so that devices with RMRRs can also use
2782 	 * the si_domain.
2783 	 */
2784 	for_each_rmrr_units(rmrr) {
2785 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2786 					  i, dev) {
2787 			unsigned long long start = rmrr->base_address;
2788 			unsigned long long end = rmrr->end_address;
2789 
2790 			if (WARN_ON(end < start ||
2791 				    end >> agaw_to_width(si_domain->agaw)))
2792 				continue;
2793 
2794 			ret = iommu_domain_identity_map(si_domain,
2795 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2796 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2797 			if (ret)
2798 				return ret;
2799 		}
2800 	}
2801 
2802 	return 0;
2803 }
2804 
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2806 {
2807 	struct dmar_domain *ndomain;
2808 	struct intel_iommu *iommu;
2809 	u8 bus, devfn;
2810 
2811 	iommu = device_to_iommu(dev, &bus, &devfn);
2812 	if (!iommu)
2813 		return -ENODEV;
2814 
2815 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816 	if (ndomain != domain)
2817 		return -EBUSY;
2818 
2819 	return 0;
2820 }
2821 
2822 static bool device_has_rmrr(struct device *dev)
2823 {
2824 	struct dmar_rmrr_unit *rmrr;
2825 	struct device *tmp;
2826 	int i;
2827 
2828 	rcu_read_lock();
2829 	for_each_rmrr_units(rmrr) {
2830 		/*
2831 		 * Return TRUE if this RMRR contains the device that
2832 		 * is passed in.
2833 		 */
2834 		for_each_active_dev_scope(rmrr->devices,
2835 					  rmrr->devices_cnt, i, tmp)
2836 			if (tmp == dev ||
2837 			    is_downstream_to_pci_bridge(dev, tmp)) {
2838 				rcu_read_unlock();
2839 				return true;
2840 			}
2841 	}
2842 	rcu_read_unlock();
2843 	return false;
2844 }
2845 
2846 /**
2847  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2849  * @dev: device handle
2850  *
2851  * We assume that PCI USB devices with RMRRs have them largely
2852  * for historical reasons and that the RMRR space is not actively used post
2853  * boot.  This exclusion may change if vendors begin to abuse it.
2854  *
2855  * The same exception is made for graphics devices, with the requirement that
2856  * any use of the RMRR regions will be torn down before assigning the device
2857  * to a guest.
2858  *
2859  * Return: true if the RMRR is relaxable, false otherwise
2860  */
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2862 {
2863 	struct pci_dev *pdev;
2864 
2865 	if (!dev_is_pci(dev))
2866 		return false;
2867 
2868 	pdev = to_pci_dev(dev);
2869 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870 		return true;
2871 	else
2872 		return false;
2873 }
2874 
2875 /*
2876  * There are a couple of cases where we need to restrict the functionality of
2877  * devices associated with RMRRs.  The first is when evaluating a device for
2878  * identity mapping because problems exist when devices are moved in and out
2879  * of domains and their respective RMRR information is lost.  This means that
2880  * a device with associated RMRRs will never be in a "passthrough" domain.
2881  * The second is use of the device through the IOMMU API.  This interface
2882  * expects to have full control of the IOVA space for the device.  We cannot
2883  * satisfy both the requirement that RMRR access is maintained and have an
2884  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2885  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886  * We therefore prevent devices associated with an RMRR from participating in
2887  * the IOMMU API, which eliminates them from device assignment.
2888  *
2889  * In both cases, devices which have relaxable RMRRs are not concerned by this
2890  * restriction. See device_rmrr_is_relaxable comment.
2891  */
2892 static bool device_is_rmrr_locked(struct device *dev)
2893 {
2894 	if (!device_has_rmrr(dev))
2895 		return false;
2896 
2897 	if (device_rmrr_is_relaxable(dev))
2898 		return false;
2899 
2900 	return true;
2901 }
2902 
2903 /*
2904  * Return the required default domain type for a specific device.
2905  *
2906  * @dev: the device in query
2908  *
2909  * Returns:
2910  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2912  *  - 0: both identity and dynamic domains work for this device
2913  */
2914 static int device_def_domain_type(struct device *dev)
2915 {
2916 	if (dev_is_pci(dev)) {
2917 		struct pci_dev *pdev = to_pci_dev(dev);
2918 
2919 		/*
2920 		 * Prevent any device marked as untrusted from getting
2921 		 * placed into the static identity mapping domain.
2922 		 */
2923 		if (pdev->untrusted)
2924 			return IOMMU_DOMAIN_DMA;
2925 
2926 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2927 			return IOMMU_DOMAIN_IDENTITY;
2928 
2929 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2930 			return IOMMU_DOMAIN_IDENTITY;
2931 	}
2932 
2933 	return 0;
2934 }
2935 
2936 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2937 {
2938 	/*
2939 	 * Start from a sane iommu hardware state.
2940 	 * If queued invalidation was already initialized by us
2941 	 * (for example, while enabling interrupt-remapping), then
2942 	 * things are already rolling from a sane state.
2943 	 */
2944 	if (!iommu->qi) {
2945 		/*
2946 		 * Clear any previous faults.
2947 		 */
2948 		dmar_fault(-1, iommu);
2949 		/*
2950 		 * Disable queued invalidation if supported and already enabled
2951 		 * before OS handover.
2952 		 */
2953 		dmar_disable_qi(iommu);
2954 	}
2955 
2956 	if (dmar_enable_qi(iommu)) {
2957 		/*
2958 		 * Queued invalidation is not enabled, use register-based invalidation
2959 		 */
2960 		iommu->flush.flush_context = __iommu_flush_context;
2961 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2962 		pr_info("%s: Using Register based invalidation\n",
2963 			iommu->name);
2964 	} else {
2965 		iommu->flush.flush_context = qi_flush_context;
2966 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2967 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2968 	}
2969 }
2970 
2971 static int copy_context_table(struct intel_iommu *iommu,
2972 			      struct root_entry *old_re,
2973 			      struct context_entry **tbl,
2974 			      int bus, bool ext)
2975 {
2976 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2977 	struct context_entry *new_ce = NULL, ce;
2978 	struct context_entry *old_ce = NULL;
2979 	struct root_entry re;
2980 	phys_addr_t old_ce_phys;
2981 
2982 	tbl_idx = ext ? bus * 2 : bus;
2983 	memcpy(&re, old_re, sizeof(re));
2984 
2985 	for (devfn = 0; devfn < 256; devfn++) {
2986 		/* First calculate the correct index */
2987 		idx = (ext ? devfn * 2 : devfn) % 256;
2988 
2989 		if (idx == 0) {
2990 			/* First save what we may have and clean up */
2991 			if (new_ce) {
2992 				tbl[tbl_idx] = new_ce;
2993 				__iommu_flush_cache(iommu, new_ce,
2994 						    VTD_PAGE_SIZE);
2995 				pos = 1;
2996 			}
2997 
2998 			if (old_ce)
2999 				memunmap(old_ce);
3000 
3001 			ret = 0;
3002 			if (devfn < 0x80)
3003 				old_ce_phys = root_entry_lctp(&re);
3004 			else
3005 				old_ce_phys = root_entry_uctp(&re);
3006 
3007 			if (!old_ce_phys) {
3008 				if (ext && devfn == 0) {
3009 					/* No LCTP, try UCTP */
3010 					devfn = 0x7f;
3011 					continue;
3012 				} else {
3013 					goto out;
3014 				}
3015 			}
3016 
3017 			ret = -ENOMEM;
3018 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3019 					MEMREMAP_WB);
3020 			if (!old_ce)
3021 				goto out;
3022 
3023 			new_ce = alloc_pgtable_page(iommu->node);
3024 			if (!new_ce)
3025 				goto out_unmap;
3026 
3027 			ret = 0;
3028 		}
3029 
3030 		/* Now copy the context entry */
3031 		memcpy(&ce, old_ce + idx, sizeof(ce));
3032 
3033 		if (!__context_present(&ce))
3034 			continue;
3035 
3036 		did = context_domain_id(&ce);
3037 		if (did >= 0 && did < cap_ndoms(iommu->cap))
3038 			set_bit(did, iommu->domain_ids);
3039 
3040 		/*
3041 		 * We need a marker for copied context entries. This
3042 		 * marker needs to work for the old format as well as
3043 		 * for extended context entries.
3044 		 *
3045 		 * Bit 67 of the context entry is used. In the old
3046 		 * format this bit is available to software, in the
3047 		 * extended format it is the PGE bit, but PGE is ignored
3048 		 * by HW if PASIDs are disabled (and thus still
3049 		 * available).
3050 		 *
3051 		 * So disable PASIDs first and then mark the entry
3052 		 * copied. This means that we don't copy PASID
3053 		 * translations from the old kernel, but this is fine as
3054 		 * faults there are not fatal.
3055 		 */
3056 		context_clear_pasid_enable(&ce);
3057 		context_set_copied(&ce);
3058 
3059 		new_ce[idx] = ce;
3060 	}
3061 
3062 	tbl[tbl_idx + pos] = new_ce;
3063 
3064 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3065 
3066 out_unmap:
3067 	memunmap(old_ce);
3068 
3069 out:
3070 	return ret;
3071 }
3072 
3073 static int copy_translation_tables(struct intel_iommu *iommu)
3074 {
3075 	struct context_entry **ctxt_tbls;
3076 	struct root_entry *old_rt;
3077 	phys_addr_t old_rt_phys;
3078 	int ctxt_table_entries;
3079 	unsigned long flags;
3080 	u64 rtaddr_reg;
3081 	int bus, ret;
3082 	bool new_ext, ext;
3083 
3084 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3085 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3086 	new_ext    = !!ecap_ecs(iommu->ecap);
3087 
3088 	/*
3089 	 * The RTT bit can only be changed when translation is disabled,
3090 	 * but disabling translation means opening a window for data
3091 	 * corruption. So bail out and don't copy anything if we would
3092 	 * have to change the bit.
3093 	 */
3094 	if (new_ext != ext)
3095 		return -EINVAL;
3096 
3097 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3098 	if (!old_rt_phys)
3099 		return -EINVAL;
3100 
3101 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3102 	if (!old_rt)
3103 		return -ENOMEM;
3104 
3105 	/* This is too big for the stack - allocate it from slab */
3106 	ctxt_table_entries = ext ? 512 : 256;
3107 	ret = -ENOMEM;
3108 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3109 	if (!ctxt_tbls)
3110 		goto out_unmap;
3111 
3112 	for (bus = 0; bus < 256; bus++) {
3113 		ret = copy_context_table(iommu, &old_rt[bus],
3114 					 ctxt_tbls, bus, ext);
3115 		if (ret) {
3116 			pr_err("%s: Failed to copy context table for bus %d\n",
3117 				iommu->name, bus);
3118 			continue;
3119 		}
3120 	}
3121 
3122 	spin_lock_irqsave(&iommu->lock, flags);
3123 
3124 	/* Context tables are copied, now write them to the root_entry table */
3125 	for (bus = 0; bus < 256; bus++) {
3126 		int idx = ext ? bus * 2 : bus;
3127 		u64 val;
3128 
3129 		if (ctxt_tbls[idx]) {
3130 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3131 			iommu->root_entry[bus].lo = val;
3132 		}
3133 
3134 		if (!ext || !ctxt_tbls[idx + 1])
3135 			continue;
3136 
3137 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3138 		iommu->root_entry[bus].hi = val;
3139 	}
3140 
3141 	spin_unlock_irqrestore(&iommu->lock, flags);
3142 
3143 	kfree(ctxt_tbls);
3144 
3145 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3146 
3147 	ret = 0;
3148 
3149 out_unmap:
3150 	memunmap(old_rt);
3151 
3152 	return ret;
3153 }
3154 
3155 #ifdef CONFIG_INTEL_IOMMU_SVM
3156 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3157 {
3158 	struct intel_iommu *iommu = data;
3159 	ioasid_t ioasid;
3160 
3161 	if (!iommu)
3162 		return INVALID_IOASID;
3163 	/*
3164 	 * The VT-d virtual command interface always uses the full 20-bit
3165 	 * PASID range. The host can partition the guest PASID range based
3166 	 * on policies, but that is out of the guest's control.
3167 	 */
3168 	if (min < PASID_MIN || max > intel_pasid_max_id)
3169 		return INVALID_IOASID;
3170 
3171 	if (vcmd_alloc_pasid(iommu, &ioasid))
3172 		return INVALID_IOASID;
3173 
3174 	return ioasid;
3175 }
3176 
3177 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3178 {
3179 	struct intel_iommu *iommu = data;
3180 
3181 	if (!iommu)
3182 		return;
3183 	/*
3184 	 * The sanity check of the ioasid owner is done at the upper layer,
3185 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3186 	 */
3187 	if (ioasid_find(NULL, ioasid, NULL)) {
3188 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3189 		return;
3190 	}
3191 	vcmd_free_pasid(iommu, ioasid);
3192 }
3193 
3194 static void register_pasid_allocator(struct intel_iommu *iommu)
3195 {
3196 	/*
3197 	 * If we are running in the host, there is no need for a custom
3198 	 * allocator, as PASIDs are allocated by the host system-wide.
3199 	 */
3200 	if (!cap_caching_mode(iommu->cap))
3201 		return;
3202 
3203 	if (!sm_supported(iommu)) {
3204 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3205 		return;
3206 	}
3207 
3208 	/*
3209 	 * Register a custom PASID allocator if we are running in a guest;
3210 	 * guest PASIDs must be obtained via the virtual command interface.
3211 	 * There can be multiple vIOMMUs in each guest, but only one allocator
3212 	 * is active. All vIOMMU allocators will eventually be calling the same
3213 	 * host allocator.
3214 	 */
3215 	if (!vccap_pasid(iommu->vccap))
3216 		return;
3217 
3218 	pr_info("Register custom PASID allocator\n");
3219 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3220 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3221 	iommu->pasid_allocator.pdata = (void *)iommu;
3222 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3223 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3224 		/*
3225 		 * Disable scalable mode on this IOMMU if there
3226 		 * is no custom allocator. Mixing SM-capable and
3227 		 * non-SM vIOMMUs is not supported.
3228 		 */
3229 		intel_iommu_sm = 0;
3230 	}
3231 }
3232 #endif
3233 
3234 static int __init init_dmars(void)
3235 {
3236 	struct dmar_drhd_unit *drhd;
3237 	struct intel_iommu *iommu;
3238 	int ret;
3239 
3240 	/*
3241 	 * for each drhd
3242 	 *    allocate root
3243 	 *    initialize and program root entry to not present
3244 	 * endfor
3245 	 */
3246 	for_each_drhd_unit(drhd) {
3247 		/*
3248 		 * Lock not needed, as this is only incremented in the
3249 		 * single-threaded kernel __init code path; all other
3250 		 * accesses are read-only.
3251 		 */
3252 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3253 			g_num_of_iommus++;
3254 			continue;
3255 		}
3256 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3257 	}
3258 
3259 	/* Preallocate enough resources for IOMMU hot-addition */
3260 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3261 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3262 
3263 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3264 			GFP_KERNEL);
3265 	if (!g_iommus) {
3266 		pr_err("Allocating global iommu array failed\n");
3267 		ret = -ENOMEM;
3268 		goto error;
3269 	}
3270 
3271 	for_each_iommu(iommu, drhd) {
3272 		if (drhd->ignored) {
3273 			iommu_disable_translation(iommu);
3274 			continue;
3275 		}
3276 
3277 		/*
3278 		 * Find the max PASID size of all IOMMUs in the system.
3279 		 * We need to ensure the system PASID table is no bigger
3280 		 * than the smallest supported size.
3281 		 */
3282 		if (pasid_supported(iommu)) {
3283 			u32 temp = 2 << ecap_pss(iommu->ecap);
3284 
3285 			intel_pasid_max_id = min_t(u32, temp,
3286 						   intel_pasid_max_id);
3287 		}
3288 
3289 		g_iommus[iommu->seq_id] = iommu;
3290 
3291 		intel_iommu_init_qi(iommu);
3292 
3293 		ret = iommu_init_domains(iommu);
3294 		if (ret)
3295 			goto free_iommu;
3296 
3297 		init_translation_status(iommu);
3298 
3299 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3300 			iommu_disable_translation(iommu);
3301 			clear_translation_pre_enabled(iommu);
3302 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3303 				iommu->name);
3304 		}
3305 
3306 		/*
3307 		 * TBD:
3308 		 * we could share the same root & context tables
3309 		 * among all IOMMUs; need to split this out later.
3310 		 */
3311 		ret = iommu_alloc_root_entry(iommu);
3312 		if (ret)
3313 			goto free_iommu;
3314 
3315 		if (translation_pre_enabled(iommu)) {
3316 			pr_info("Translation already enabled - trying to copy translation structures\n");
3317 
3318 			ret = copy_translation_tables(iommu);
3319 			if (ret) {
3320 				/*
3321 				 * We found the IOMMU with translation
3322 				 * enabled - but failed to copy over the
3323 				 * old root-entry table. Try to proceed
3324 				 * by disabling translation now and
3325 				 * allocating a clean root-entry table.
3326 				 * This might cause DMAR faults, but
3327 				 * probably the dump will still succeed.
3328 				 */
3329 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3330 				       iommu->name);
3331 				iommu_disable_translation(iommu);
3332 				clear_translation_pre_enabled(iommu);
3333 			} else {
3334 				pr_info("Copied translation tables from previous kernel for %s\n",
3335 					iommu->name);
3336 			}
3337 		}
3338 
3339 		if (!ecap_pass_through(iommu->ecap))
3340 			hw_pass_through = 0;
3341 		intel_svm_check(iommu);
3342 	}
3343 
3344 	/*
3345 	 * Now that qi is enabled on all iommus, set the root entry and flush
3346 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3347 	 * flush_context function will loop forever and the boot hangs.
3348 	 */
3349 	for_each_active_iommu(iommu, drhd) {
3350 		iommu_flush_write_buffer(iommu);
3351 #ifdef CONFIG_INTEL_IOMMU_SVM
3352 		register_pasid_allocator(iommu);
3353 #endif
3354 		iommu_set_root_entry(iommu);
3355 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3356 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3357 	}
3358 
3359 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3360 	dmar_map_gfx = 0;
3361 #endif
3362 
3363 	if (!dmar_map_gfx)
3364 		iommu_identity_mapping |= IDENTMAP_GFX;
3365 
3366 	check_tylersburg_isoch();
3367 
3368 	ret = si_domain_init(hw_pass_through);
3369 	if (ret)
3370 		goto free_iommu;
3371 
3372 	/*
3373 	 * for each drhd
3374 	 *   enable fault log
3375 	 *   global invalidate context cache
3376 	 *   global invalidate iotlb
3377 	 *   enable translation
3378 	 */
3379 	for_each_iommu(iommu, drhd) {
3380 		if (drhd->ignored) {
3381 			/*
3382 			 * we always have to disable PMRs or DMA may fail on
3383 			 * this device
3384 			 */
3385 			if (force_on)
3386 				iommu_disable_protect_mem_regions(iommu);
3387 			continue;
3388 		}
3389 
3390 		iommu_flush_write_buffer(iommu);
3391 
3392 #ifdef CONFIG_INTEL_IOMMU_SVM
3393 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3394 			/*
3395 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3396 			 * could cause a lock race condition.
3397 			 */
3398 			up_write(&dmar_global_lock);
3399 			ret = intel_svm_enable_prq(iommu);
3400 			down_write(&dmar_global_lock);
3401 			if (ret)
3402 				goto free_iommu;
3403 		}
3404 #endif
3405 		ret = dmar_set_interrupt(iommu);
3406 		if (ret)
3407 			goto free_iommu;
3408 	}
3409 
3410 	return 0;
3411 
3412 free_iommu:
3413 	for_each_active_iommu(iommu, drhd) {
3414 		disable_dmar_iommu(iommu);
3415 		free_dmar_iommu(iommu);
3416 	}
3417 
3418 	kfree(g_iommus);
3419 
3420 error:
3421 	return ret;
3422 }
3423 
3424 /* This takes a number of _MM_ pages, not VTD pages */
3425 static unsigned long intel_alloc_iova(struct device *dev,
3426 				     struct dmar_domain *domain,
3427 				     unsigned long nrpages, uint64_t dma_mask)
3428 {
3429 	unsigned long iova_pfn;
3430 
3431 	/*
3432 	 * Restrict dma_mask to the width that the iommu can handle.
3433 	 * First-level translation restricts the input-address to a
3434 	 * canonical address (i.e., address bits 63:N have the same
3435 	 * value as address bit [N-1], where N is 48-bits with 4-level
3436 	 * paging and 57-bits with 5-level paging). Hence, skip bit
3437 	 * [N-1].
3438 	 */
3439 	if (domain_use_first_level(domain))
3440 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3441 				 dma_mask);
3442 	else
3443 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3444 				 dma_mask);
3445 
3446 	/* Ensure we reserve the whole size-aligned region */
3447 	nrpages = __roundup_pow_of_two(nrpages);
3448 
3449 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3450 		/*
3451 		 * First try to allocate an I/O virtual address within
3452 		 * DMA_BIT_MASK(32); if that fails, then try allocating
3453 		 * from the higher range.
3454 		 */
3455 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3456 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3457 		if (iova_pfn)
3458 			return iova_pfn;
3459 	}
3460 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3461 				   IOVA_PFN(dma_mask), true);
3462 	if (unlikely(!iova_pfn)) {
3463 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3464 			     nrpages);
3465 		return 0;
3466 	}
3467 
3468 	return iova_pfn;
3469 }
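
/*
 * Worked example for the dma_mask clamping above (illustrative only): with
 * domain->gaw == 48 and first-level translation in use, the limit becomes
 * DOMAIN_MAX_ADDR(47), i.e. bit 47 is skipped so IOVAs stay canonical;
 * with second-level translation the full DOMAIN_MAX_ADDR(48) range is
 * allowed, further limited by the device's dma_mask.
 */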
3470 
3471 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3472 				     size_t size, int dir, u64 dma_mask)
3473 {
3474 	struct dmar_domain *domain;
3475 	phys_addr_t start_paddr;
3476 	unsigned long iova_pfn;
3477 	int prot = 0;
3478 	int ret;
3479 	struct intel_iommu *iommu;
3480 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3481 
3482 	BUG_ON(dir == DMA_NONE);
3483 
3484 	if (unlikely(attach_deferred(dev)))
3485 		do_deferred_attach(dev);
3486 
3487 	domain = find_domain(dev);
3488 	if (!domain)
3489 		return DMA_MAPPING_ERROR;
3490 
3491 	iommu = domain_get_iommu(domain);
3492 	size = aligned_nrpages(paddr, size);
3493 
3494 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3495 	if (!iova_pfn)
3496 		goto error;
3497 
3498 	/*
3499 	 * Check if DMAR supports zero-length reads on write-only
3500 	 * mappings.
3501 	 */
3502 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3503 			!cap_zlr(iommu->cap))
3504 		prot |= DMA_PTE_READ;
3505 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3506 		prot |= DMA_PTE_WRITE;
3507 	/*
3508 	 * paddr ~ (paddr + size) might span a partial page, so we map the whole
3509 	 * page.  Note: if two parts of one page are separately mapped, we
3510 	 * might have two guest addresses mapping to the same host paddr, but
3511 	 * this is not a big problem.
3512 	 */
3513 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3514 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3515 	if (ret)
3516 		goto error;
3517 
3518 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3519 	start_paddr += paddr & ~PAGE_MASK;
3520 
3521 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3522 
3523 	return start_paddr;
3524 
3525 error:
3526 	if (iova_pfn)
3527 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3528 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3529 		size, (unsigned long long)paddr, dir);
3530 	return DMA_MAPPING_ERROR;
3531 }
3532 
3533 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3534 				 unsigned long offset, size_t size,
3535 				 enum dma_data_direction dir,
3536 				 unsigned long attrs)
3537 {
3538 	return __intel_map_single(dev, page_to_phys(page) + offset,
3539 				  size, dir, *dev->dma_mask);
3540 }
3541 
3542 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3543 				     size_t size, enum dma_data_direction dir,
3544 				     unsigned long attrs)
3545 {
3546 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3547 }
3548 
3549 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3550 {
3551 	struct dmar_domain *domain;
3552 	unsigned long start_pfn, last_pfn;
3553 	unsigned long nrpages;
3554 	unsigned long iova_pfn;
3555 	struct intel_iommu *iommu;
3556 	struct page *freelist;
3557 	struct pci_dev *pdev = NULL;
3558 
3559 	domain = find_domain(dev);
3560 	BUG_ON(!domain);
3561 
3562 	iommu = domain_get_iommu(domain);
3563 
3564 	iova_pfn = IOVA_PFN(dev_addr);
3565 
3566 	nrpages = aligned_nrpages(dev_addr, size);
3567 	start_pfn = mm_to_dma_pfn(iova_pfn);
3568 	last_pfn = start_pfn + nrpages - 1;
3569 
3570 	if (dev_is_pci(dev))
3571 		pdev = to_pci_dev(dev);
3572 
3573 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3574 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3575 			!has_iova_flush_queue(&domain->iovad)) {
3576 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3577 				      nrpages, !freelist, 0);
3578 		/* free iova */
3579 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3580 		dma_free_pagelist(freelist);
3581 	} else {
3582 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3583 			   (unsigned long)freelist);
3584 		/*
3585 		 * Queue up the release of the unmap to save roughly 1/6th of
3586 		 * the CPU time used up by the iotlb flush operation...
3587 		 */
3588 	}
3589 
3590 	trace_unmap_single(dev, dev_addr, size);
3591 }
3592 
3593 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3594 			     size_t size, enum dma_data_direction dir,
3595 			     unsigned long attrs)
3596 {
3597 	intel_unmap(dev, dev_addr, size);
3598 }
3599 
3600 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3601 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3602 {
3603 	intel_unmap(dev, dev_addr, size);
3604 }
3605 
3606 static void *intel_alloc_coherent(struct device *dev, size_t size,
3607 				  dma_addr_t *dma_handle, gfp_t flags,
3608 				  unsigned long attrs)
3609 {
3610 	struct page *page = NULL;
3611 	int order;
3612 
3613 	if (unlikely(attach_deferred(dev)))
3614 		do_deferred_attach(dev);
3615 
3616 	size = PAGE_ALIGN(size);
3617 	order = get_order(size);
3618 
3619 	if (gfpflags_allow_blocking(flags)) {
3620 		unsigned int count = size >> PAGE_SHIFT;
3621 
3622 		page = dma_alloc_from_contiguous(dev, count, order,
3623 						 flags & __GFP_NOWARN);
3624 	}
3625 
3626 	if (!page)
3627 		page = alloc_pages(flags, order);
3628 	if (!page)
3629 		return NULL;
3630 	memset(page_address(page), 0, size);
3631 
3632 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3633 					 DMA_BIDIRECTIONAL,
3634 					 dev->coherent_dma_mask);
3635 	if (*dma_handle != DMA_MAPPING_ERROR)
3636 		return page_address(page);
3637 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3638 		__free_pages(page, order);
3639 
3640 	return NULL;
3641 }
3642 
3643 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3644 				dma_addr_t dma_handle, unsigned long attrs)
3645 {
3646 	int order;
3647 	struct page *page = virt_to_page(vaddr);
3648 
3649 	size = PAGE_ALIGN(size);
3650 	order = get_order(size);
3651 
3652 	intel_unmap(dev, dma_handle, size);
3653 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3654 		__free_pages(page, order);
3655 }
3656 
3657 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3658 			   int nelems, enum dma_data_direction dir,
3659 			   unsigned long attrs)
3660 {
3661 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3662 	unsigned long nrpages = 0;
3663 	struct scatterlist *sg;
3664 	int i;
3665 
3666 	for_each_sg(sglist, sg, nelems, i) {
3667 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3668 	}
3669 
3670 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3671 
3672 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3673 }
3674 
3675 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3676 			enum dma_data_direction dir, unsigned long attrs)
3677 {
3678 	int i;
3679 	struct dmar_domain *domain;
3680 	size_t size = 0;
3681 	int prot = 0;
3682 	unsigned long iova_pfn;
3683 	int ret;
3684 	struct scatterlist *sg;
3685 	unsigned long start_vpfn;
3686 	struct intel_iommu *iommu;
3687 
3688 	BUG_ON(dir == DMA_NONE);
3689 
3690 	if (unlikely(attach_deferred(dev)))
3691 		do_deferred_attach(dev);
3692 
3693 	domain = find_domain(dev);
3694 	if (!domain)
3695 		return 0;
3696 
3697 	iommu = domain_get_iommu(domain);
3698 
3699 	for_each_sg(sglist, sg, nelems, i)
3700 		size += aligned_nrpages(sg->offset, sg->length);
3701 
3702 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3703 				*dev->dma_mask);
3704 	if (!iova_pfn) {
3705 		sglist->dma_length = 0;
3706 		return 0;
3707 	}
3708 
3709 	/*
3710 	 * Check if DMAR supports zero-length reads on write-only
3711 	 * mappings.
3712 	 */
3713 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3714 			!cap_zlr(iommu->cap))
3715 		prot |= DMA_PTE_READ;
3716 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3717 		prot |= DMA_PTE_WRITE;
3718 
3719 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3720 
3721 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3722 	if (unlikely(ret)) {
3723 		dma_pte_free_pagetable(domain, start_vpfn,
3724 				       start_vpfn + size - 1,
3725 				       agaw_to_level(domain->agaw) + 1);
3726 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3727 		return 0;
3728 	}
3729 
3730 	for_each_sg(sglist, sg, nelems, i)
3731 		trace_map_sg(dev, i + 1, nelems, sg);
3732 
3733 	return nelems;
3734 }
3735 
3736 static u64 intel_get_required_mask(struct device *dev)
3737 {
3738 	return DMA_BIT_MASK(32);
3739 }
3740 
3741 static const struct dma_map_ops intel_dma_ops = {
3742 	.alloc = intel_alloc_coherent,
3743 	.free = intel_free_coherent,
3744 	.map_sg = intel_map_sg,
3745 	.unmap_sg = intel_unmap_sg,
3746 	.map_page = intel_map_page,
3747 	.unmap_page = intel_unmap_page,
3748 	.map_resource = intel_map_resource,
3749 	.unmap_resource = intel_unmap_resource,
3750 	.dma_supported = dma_direct_supported,
3751 	.mmap = dma_common_mmap,
3752 	.get_sgtable = dma_common_get_sgtable,
3753 	.alloc_pages = dma_common_alloc_pages,
3754 	.free_pages = dma_common_free_pages,
3755 	.get_required_mask = intel_get_required_mask,
3756 };
3757 
3758 static void
3759 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3760 		   enum dma_data_direction dir, enum dma_sync_target target)
3761 {
3762 	struct dmar_domain *domain;
3763 	phys_addr_t tlb_addr;
3764 
3765 	domain = find_domain(dev);
3766 	if (WARN_ON(!domain))
3767 		return;
3768 
3769 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3770 	if (is_swiotlb_buffer(tlb_addr))
3771 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3772 }
3773 
3774 static dma_addr_t
3775 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3776 		  enum dma_data_direction dir, unsigned long attrs,
3777 		  u64 dma_mask)
3778 {
3779 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3780 	struct dmar_domain *domain;
3781 	struct intel_iommu *iommu;
3782 	unsigned long iova_pfn;
3783 	unsigned long nrpages;
3784 	phys_addr_t tlb_addr;
3785 	int prot = 0;
3786 	int ret;
3787 
3788 	if (unlikely(attach_deferred(dev)))
3789 		do_deferred_attach(dev);
3790 
3791 	domain = find_domain(dev);
3792 
3793 	if (WARN_ON(dir == DMA_NONE || !domain))
3794 		return DMA_MAPPING_ERROR;
3795 
3796 	iommu = domain_get_iommu(domain);
3797 	if (WARN_ON(!iommu))
3798 		return DMA_MAPPING_ERROR;
3799 
3800 	nrpages = aligned_nrpages(0, size);
3801 	iova_pfn = intel_alloc_iova(dev, domain,
3802 				    dma_to_mm_pfn(nrpages), dma_mask);
3803 	if (!iova_pfn)
3804 		return DMA_MAPPING_ERROR;
3805 
3806 	/*
3807 	 * Check if DMAR supports zero-length reads on write-only
3808 	 * mappings.
3809 	 */
3810 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3811 			!cap_zlr(iommu->cap))
3812 		prot |= DMA_PTE_READ;
3813 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3814 		prot |= DMA_PTE_WRITE;
3815 
3816 	/*
3817 	 * If both the physical buffer start address and size are
3818 	 * page aligned, we don't need to use a bounce page.
3819 	 */
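	/* e.g. a buffer that starts or ends mid-page is bounced; whole aligned pages are mapped in place. */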
3820 	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3821 		tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3822 				aligned_size, dir, attrs);
3823 		if (tlb_addr == DMA_MAPPING_ERROR) {
3824 			goto swiotlb_error;
3825 		} else {
3826 			/* Cleanup the padding area. */
3827 			void *padding_start = phys_to_virt(tlb_addr);
3828 			size_t padding_size = aligned_size;
3829 
3830 			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3831 			    (dir == DMA_TO_DEVICE ||
3832 			     dir == DMA_BIDIRECTIONAL)) {
3833 				padding_start += size;
3834 				padding_size -= size;
3835 			}
3836 
3837 			memset(padding_start, 0, padding_size);
3838 		}
3839 	} else {
3840 		tlb_addr = paddr;
3841 	}
3842 
3843 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3844 				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3845 	if (ret)
3846 		goto mapping_error;
3847 
3848 	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3849 
3850 	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3851 
3852 mapping_error:
3853 	if (is_swiotlb_buffer(tlb_addr))
3854 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3855 					 aligned_size, dir, attrs);
3856 swiotlb_error:
3857 	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3858 	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3859 		size, (unsigned long long)paddr, dir);
3860 
3861 	return DMA_MAPPING_ERROR;
3862 }
3863 
3864 static void
3865 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3866 		    enum dma_data_direction dir, unsigned long attrs)
3867 {
3868 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3869 	struct dmar_domain *domain;
3870 	phys_addr_t tlb_addr;
3871 
3872 	domain = find_domain(dev);
3873 	if (WARN_ON(!domain))
3874 		return;
3875 
3876 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3877 	if (WARN_ON(!tlb_addr))
3878 		return;
3879 
3880 	intel_unmap(dev, dev_addr, size);
3881 	if (is_swiotlb_buffer(tlb_addr))
3882 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3883 					 aligned_size, dir, attrs);
3884 
3885 	trace_bounce_unmap_single(dev, dev_addr, size);
3886 }
3887 
3888 static dma_addr_t
3889 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3890 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3891 {
3892 	return bounce_map_single(dev, page_to_phys(page) + offset,
3893 				 size, dir, attrs, *dev->dma_mask);
3894 }
3895 
3896 static dma_addr_t
3897 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3898 		    enum dma_data_direction dir, unsigned long attrs)
3899 {
3900 	return bounce_map_single(dev, phys_addr, size,
3901 				 dir, attrs, *dev->dma_mask);
3902 }
3903 
3904 static void
3905 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3906 		  enum dma_data_direction dir, unsigned long attrs)
3907 {
3908 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3909 }
3910 
3911 static void
3912 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3913 		      enum dma_data_direction dir, unsigned long attrs)
3914 {
3915 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3916 }
3917 
3918 static void
3919 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3920 		enum dma_data_direction dir, unsigned long attrs)
3921 {
3922 	struct scatterlist *sg;
3923 	int i;
3924 
3925 	for_each_sg(sglist, sg, nelems, i)
3926 		bounce_unmap_page(dev, sg->dma_address,
3927 				  sg_dma_len(sg), dir, attrs);
3928 }
3929 
3930 static int
3931 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3932 	      enum dma_data_direction dir, unsigned long attrs)
3933 {
3934 	int i;
3935 	struct scatterlist *sg;
3936 
3937 	for_each_sg(sglist, sg, nelems, i) {
3938 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3939 						  sg->offset, sg->length,
3940 						  dir, attrs);
3941 		if (sg->dma_address == DMA_MAPPING_ERROR)
3942 			goto out_unmap;
3943 		sg_dma_len(sg) = sg->length;
3944 	}
3945 
3946 	for_each_sg(sglist, sg, nelems, i)
3947 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3948 
3949 	return nelems;
3950 
3951 out_unmap:
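	/* No DMA has happened yet, so don't sync bounce-buffer contents back to the original pages. */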
3952 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3953 	return 0;
3954 }
3955 
3956 static void
3957 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3958 			   size_t size, enum dma_data_direction dir)
3959 {
3960 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3961 }
3962 
3963 static void
3964 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3965 			      size_t size, enum dma_data_direction dir)
3966 {
3967 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3968 }
3969 
3970 static void
3971 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3972 		       int nelems, enum dma_data_direction dir)
3973 {
3974 	struct scatterlist *sg;
3975 	int i;
3976 
3977 	for_each_sg(sglist, sg, nelems, i)
3978 		bounce_sync_single(dev, sg_dma_address(sg),
3979 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3980 }
3981 
3982 static void
3983 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3984 			  int nelems, enum dma_data_direction dir)
3985 {
3986 	struct scatterlist *sg;
3987 	int i;
3988 
3989 	for_each_sg(sglist, sg, nelems, i)
3990 		bounce_sync_single(dev, sg_dma_address(sg),
3991 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3992 }
3993 
3994 static const struct dma_map_ops bounce_dma_ops = {
3995 	.alloc			= intel_alloc_coherent,
3996 	.free			= intel_free_coherent,
3997 	.map_sg			= bounce_map_sg,
3998 	.unmap_sg		= bounce_unmap_sg,
3999 	.map_page		= bounce_map_page,
4000 	.unmap_page		= bounce_unmap_page,
4001 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
4002 	.sync_single_for_device	= bounce_sync_single_for_device,
4003 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
4004 	.sync_sg_for_device	= bounce_sync_sg_for_device,
4005 	.map_resource		= bounce_map_resource,
4006 	.unmap_resource		= bounce_unmap_resource,
4007 	.alloc_pages		= dma_common_alloc_pages,
4008 	.free_pages		= dma_common_free_pages,
4009 	.dma_supported		= dma_direct_supported,
4010 };
4011 
4012 static inline int iommu_domain_cache_init(void)
4013 {
4014 	int ret = 0;
4015 
4016 	iommu_domain_cache = kmem_cache_create("iommu_domain",
4017 					 sizeof(struct dmar_domain),
4018 					 0,
4019 					 SLAB_HWCACHE_ALIGN,
4021 					 NULL);
4022 	if (!iommu_domain_cache) {
4023 		pr_err("Couldn't create iommu_domain cache\n");
4024 		ret = -ENOMEM;
4025 	}
4026 
4027 	return ret;
4028 }
4029 
4030 static inline int iommu_devinfo_cache_init(void)
4031 {
4032 	int ret = 0;
4033 
4034 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4035 					 sizeof(struct device_domain_info),
4036 					 0,
4037 					 SLAB_HWCACHE_ALIGN,
4038 					 NULL);
4039 	if (!iommu_devinfo_cache) {
4040 		pr_err("Couldn't create devinfo cache\n");
4041 		ret = -ENOMEM;
4042 	}
4043 
4044 	return ret;
4045 }
4046 
4047 static int __init iommu_init_mempool(void)
4048 {
4049 	int ret;
4050 	ret = iova_cache_get();
4051 	if (ret)
4052 		return ret;
4053 
4054 	ret = iommu_domain_cache_init();
4055 	if (ret)
4056 		goto domain_error;
4057 
4058 	ret = iommu_devinfo_cache_init();
4059 	if (!ret)
4060 		return ret;
4061 
4062 	kmem_cache_destroy(iommu_domain_cache);
4063 domain_error:
4064 	iova_cache_put();
4065 
4066 	return -ENOMEM;
4067 }
4068 
4069 static void __init iommu_exit_mempool(void)
4070 {
4071 	kmem_cache_destroy(iommu_devinfo_cache);
4072 	kmem_cache_destroy(iommu_domain_cache);
4073 	iova_cache_put();
4074 }
4075 
4076 static void __init init_no_remapping_devices(void)
4077 {
4078 	struct dmar_drhd_unit *drhd;
4079 	struct device *dev;
4080 	int i;
4081 
4082 	for_each_drhd_unit(drhd) {
4083 		if (!drhd->include_all) {
4084 			for_each_active_dev_scope(drhd->devices,
4085 						  drhd->devices_cnt, i, dev)
4086 				break;
4087 			/* ignore DMAR unit if no devices exist */
4088 			if (i == drhd->devices_cnt)
4089 				drhd->ignored = 1;
4090 		}
4091 	}
4092 
4093 	for_each_active_drhd_unit(drhd) {
4094 		if (drhd->include_all)
4095 			continue;
4096 
4097 		for_each_active_dev_scope(drhd->devices,
4098 					  drhd->devices_cnt, i, dev)
4099 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4100 				break;
4101 		if (i < drhd->devices_cnt)
4102 			continue;
4103 
4104 		/* This IOMMU has *only* gfx devices. Mark it as gfx-dedicated,
4105 		   and bypass it entirely if gfx mapping is disabled. */
4106 		drhd->gfx_dedicated = 1;
4107 		if (!dmar_map_gfx)
4108 			drhd->ignored = 1;
4109 	}
4110 }
4111 
4112 #ifdef CONFIG_SUSPEND
4113 static int init_iommu_hw(void)
4114 {
4115 	struct dmar_drhd_unit *drhd;
4116 	struct intel_iommu *iommu = NULL;
4117 
4118 	for_each_active_iommu(iommu, drhd)
4119 		if (iommu->qi)
4120 			dmar_reenable_qi(iommu);
4121 
4122 	for_each_iommu(iommu, drhd) {
4123 		if (drhd->ignored) {
4124 			/*
4125 			 * we always have to disable PMRs or DMA may fail on
4126 			 * this device
4127 			 */
4128 			if (force_on)
4129 				iommu_disable_protect_mem_regions(iommu);
4130 			continue;
4131 		}
4132 
4133 		iommu_flush_write_buffer(iommu);
4134 
4135 		iommu_set_root_entry(iommu);
4136 
4137 		iommu->flush.flush_context(iommu, 0, 0, 0,
4138 					   DMA_CCMD_GLOBAL_INVL);
4139 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4140 		iommu_enable_translation(iommu);
4141 		iommu_disable_protect_mem_regions(iommu);
4142 	}
4143 
4144 	return 0;
4145 }
4146 
4147 static void iommu_flush_all(void)
4148 {
4149 	struct dmar_drhd_unit *drhd;
4150 	struct intel_iommu *iommu;
4151 
4152 	for_each_active_iommu(iommu, drhd) {
4153 		iommu->flush.flush_context(iommu, 0, 0, 0,
4154 					   DMA_CCMD_GLOBAL_INVL);
4155 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4156 					 DMA_TLB_GLOBAL_FLUSH);
4157 	}
4158 }
4159 
4160 static int iommu_suspend(void)
4161 {
4162 	struct dmar_drhd_unit *drhd;
4163 	struct intel_iommu *iommu = NULL;
4164 	unsigned long flag;
4165 
4166 	for_each_active_iommu(iommu, drhd) {
4167 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4168 						 GFP_ATOMIC);
4169 		if (!iommu->iommu_state)
4170 			goto nomem;
4171 	}
4172 
4173 	iommu_flush_all();
4174 
4175 	for_each_active_iommu(iommu, drhd) {
4176 		iommu_disable_translation(iommu);
4177 
4178 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4179 
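		/* Save the fault event control, data, address and upper address registers. */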
4180 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4181 			readl(iommu->reg + DMAR_FECTL_REG);
4182 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4183 			readl(iommu->reg + DMAR_FEDATA_REG);
4184 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4185 			readl(iommu->reg + DMAR_FEADDR_REG);
4186 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4187 			readl(iommu->reg + DMAR_FEUADDR_REG);
4188 
4189 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4190 	}
4191 	return 0;
4192 
4193 nomem:
4194 	for_each_active_iommu(iommu, drhd)
4195 		kfree(iommu->iommu_state);
4196 
4197 	return -ENOMEM;
4198 }
4199 
4200 static void iommu_resume(void)
4201 {
4202 	struct dmar_drhd_unit *drhd;
4203 	struct intel_iommu *iommu = NULL;
4204 	unsigned long flag;
4205 
4206 	if (init_iommu_hw()) {
4207 		if (force_on)
4208 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4209 		else
4210 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4211 		return;
4212 	}
4213 
4214 	for_each_active_iommu(iommu, drhd) {
4215 
4216 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4217 
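		/* Restore the fault event registers saved in iommu_suspend(). */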
4218 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4219 			iommu->reg + DMAR_FECTL_REG);
4220 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4221 			iommu->reg + DMAR_FEDATA_REG);
4222 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4223 			iommu->reg + DMAR_FEADDR_REG);
4224 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4225 			iommu->reg + DMAR_FEUADDR_REG);
4226 
4227 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4228 	}
4229 
4230 	for_each_active_iommu(iommu, drhd)
4231 		kfree(iommu->iommu_state);
4232 }
4233 
4234 static struct syscore_ops iommu_syscore_ops = {
4235 	.resume		= iommu_resume,
4236 	.suspend	= iommu_suspend,
4237 };
4238 
4239 static void __init init_iommu_pm_ops(void)
4240 {
4241 	register_syscore_ops(&iommu_syscore_ops);
4242 }
4243 
4244 #else
4245 static inline void init_iommu_pm_ops(void) {}
4246 #endif	/* CONFIG_SUSPEND */
4247 
4248 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4249 {
4250 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4251 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4252 	    rmrr->end_address <= rmrr->base_address ||
4253 	    arch_rmrr_sanity_check(rmrr))
4254 		return -EINVAL;
4255 
4256 	return 0;
4257 }
4258 
4259 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4260 {
4261 	struct acpi_dmar_reserved_memory *rmrr;
4262 	struct dmar_rmrr_unit *rmrru;
4263 
4264 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4265 	if (rmrr_sanity_check(rmrr)) {
4266 		pr_warn(FW_BUG
4267 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4268 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4269 			   rmrr->base_address, rmrr->end_address,
4270 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4271 			   dmi_get_system_info(DMI_BIOS_VERSION),
4272 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4273 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4274 	}
4275 
4276 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4277 	if (!rmrru)
4278 		goto out;
4279 
4280 	rmrru->hdr = header;
4281 
4282 	rmrru->base_address = rmrr->base_address;
4283 	rmrru->end_address = rmrr->end_address;
4284 
4285 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4286 				((void *)rmrr) + rmrr->header.length,
4287 				&rmrru->devices_cnt);
4288 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4289 		goto free_rmrru;
4290 
4291 	list_add(&rmrru->list, &dmar_rmrr_units);
4292 
4293 	return 0;
4294 free_rmrru:
4295 	kfree(rmrru);
4296 out:
4297 	return -ENOMEM;
4298 }
4299 
4300 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4301 {
4302 	struct dmar_atsr_unit *atsru;
4303 	struct acpi_dmar_atsr *tmp;
4304 
4305 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4306 				dmar_rcu_check()) {
4307 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4308 		if (atsr->segment != tmp->segment)
4309 			continue;
4310 		if (atsr->header.length != tmp->header.length)
4311 			continue;
4312 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4313 			return atsru;
4314 	}
4315 
4316 	return NULL;
4317 }
4318 
4319 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4320 {
4321 	struct acpi_dmar_atsr *atsr;
4322 	struct dmar_atsr_unit *atsru;
4323 
4324 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4325 		return 0;
4326 
4327 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4328 	atsru = dmar_find_atsr(atsr);
4329 	if (atsru)
4330 		return 0;
4331 
4332 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4333 	if (!atsru)
4334 		return -ENOMEM;
4335 
4336 	/*
4337 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4338 	 * copy the memory content because the memory buffer will be freed
4339 	 * on return.
4340 	 */
4341 	atsru->hdr = (void *)(atsru + 1);
4342 	memcpy(atsru->hdr, hdr, hdr->length);
4343 	atsru->include_all = atsr->flags & 0x1;
4344 	if (!atsru->include_all) {
4345 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4346 				(void *)atsr + atsr->header.length,
4347 				&atsru->devices_cnt);
4348 		if (atsru->devices_cnt && atsru->devices == NULL) {
4349 			kfree(atsru);
4350 			return -ENOMEM;
4351 		}
4352 	}
4353 
4354 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4355 
4356 	return 0;
4357 }
4358 
4359 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4360 {
4361 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4362 	kfree(atsru);
4363 }
4364 
4365 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4366 {
4367 	struct acpi_dmar_atsr *atsr;
4368 	struct dmar_atsr_unit *atsru;
4369 
4370 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4371 	atsru = dmar_find_atsr(atsr);
4372 	if (atsru) {
4373 		list_del_rcu(&atsru->list);
4374 		synchronize_rcu();
4375 		intel_iommu_free_atsr(atsru);
4376 	}
4377 
4378 	return 0;
4379 }
4380 
4381 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4382 {
4383 	int i;
4384 	struct device *dev;
4385 	struct acpi_dmar_atsr *atsr;
4386 	struct dmar_atsr_unit *atsru;
4387 
4388 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4389 	atsru = dmar_find_atsr(atsr);
4390 	if (!atsru)
4391 		return 0;
4392 
4393 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4394 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4395 					  i, dev)
4396 			return -EBUSY;
4397 	}
4398 
4399 	return 0;
4400 }
4401 
4402 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4403 {
4404 	int sp, ret;
4405 	struct intel_iommu *iommu = dmaru->iommu;
4406 
4407 	if (g_iommus[iommu->seq_id])
4408 		return 0;
4409 
4410 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4411 		pr_warn("%s: Doesn't support hardware pass through.\n",
4412 			iommu->name);
4413 		return -ENXIO;
4414 	}
4415 	if (!ecap_sc_support(iommu->ecap) &&
4416 	    domain_update_iommu_snooping(iommu)) {
4417 		pr_warn("%s: Doesn't support snooping.\n",
4418 			iommu->name);
4419 		return -ENXIO;
4420 	}
4421 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
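	/* cap_super_page_val(): bit 0 set means 2MiB pages are supported, bit 1 means 1GiB pages. */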
4422 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4423 		pr_warn("%s: Doesn't support large page.\n",
4424 			iommu->name);
4425 		return -ENXIO;
4426 	}
4427 
4428 	/*
4429 	 * Disable translation if already enabled prior to OS handover.
4430 	 */
4431 	if (iommu->gcmd & DMA_GCMD_TE)
4432 		iommu_disable_translation(iommu);
4433 
4434 	g_iommus[iommu->seq_id] = iommu;
4435 	ret = iommu_init_domains(iommu);
4436 	if (ret == 0)
4437 		ret = iommu_alloc_root_entry(iommu);
4438 	if (ret)
4439 		goto out;
4440 
4441 	intel_svm_check(iommu);
4442 
4443 	if (dmaru->ignored) {
4444 		/*
4445 		 * we always have to disable PMRs or DMA may fail on this device
4446 		 */
4447 		if (force_on)
4448 			iommu_disable_protect_mem_regions(iommu);
4449 		return 0;
4450 	}
4451 
4452 	intel_iommu_init_qi(iommu);
4453 	iommu_flush_write_buffer(iommu);
4454 
4455 #ifdef CONFIG_INTEL_IOMMU_SVM
4456 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4457 		ret = intel_svm_enable_prq(iommu);
4458 		if (ret)
4459 			goto disable_iommu;
4460 	}
4461 #endif
4462 	ret = dmar_set_interrupt(iommu);
4463 	if (ret)
4464 		goto disable_iommu;
4465 
4466 	iommu_set_root_entry(iommu);
4467 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4468 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4469 	iommu_enable_translation(iommu);
4470 
4471 	iommu_disable_protect_mem_regions(iommu);
4472 	return 0;
4473 
4474 disable_iommu:
4475 	disable_dmar_iommu(iommu);
4476 out:
4477 	free_dmar_iommu(iommu);
4478 	return ret;
4479 }
4480 
4481 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4482 {
4483 	int ret = 0;
4484 	struct intel_iommu *iommu = dmaru->iommu;
4485 
4486 	if (!intel_iommu_enabled)
4487 		return 0;
4488 	if (iommu == NULL)
4489 		return -EINVAL;
4490 
4491 	if (insert) {
4492 		ret = intel_iommu_add(dmaru);
4493 	} else {
4494 		disable_dmar_iommu(iommu);
4495 		free_dmar_iommu(iommu);
4496 	}
4497 
4498 	return ret;
4499 }
4500 
4501 static void intel_iommu_free_dmars(void)
4502 {
4503 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4504 	struct dmar_atsr_unit *atsru, *atsr_n;
4505 
4506 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4507 		list_del(&rmrru->list);
4508 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4509 		kfree(rmrru);
4510 	}
4511 
4512 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4513 		list_del(&atsru->list);
4514 		intel_iommu_free_atsr(atsru);
4515 	}
4516 }
4517 
4518 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4519 {
4520 	int i, ret = 1;
4521 	struct pci_bus *bus;
4522 	struct pci_dev *bridge = NULL;
4523 	struct device *tmp;
4524 	struct acpi_dmar_atsr *atsr;
4525 	struct dmar_atsr_unit *atsru;
4526 
4527 	dev = pci_physfn(dev);
4528 	for (bus = dev->bus; bus; bus = bus->parent) {
4529 		bridge = bus->self;
4530 		/* If it's an integrated device, allow ATS */
4531 		if (!bridge)
4532 			return 1;
4533 		/* Connected via non-PCIe: no ATS */
4534 		if (!pci_is_pcie(bridge) ||
4535 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4536 			return 0;
4537 		/* If we found the root port, look it up in the ATSR */
4538 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4539 			break;
4540 	}
4541 
4542 	rcu_read_lock();
4543 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4544 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4545 		if (atsr->segment != pci_domain_nr(dev->bus))
4546 			continue;
4547 
4548 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4549 			if (tmp == &bridge->dev)
4550 				goto out;
4551 
4552 		if (atsru->include_all)
4553 			goto out;
4554 	}
4555 	ret = 0;
4556 out:
4557 	rcu_read_unlock();
4558 
4559 	return ret;
4560 }
4561 
4562 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4563 {
4564 	int ret;
4565 	struct dmar_rmrr_unit *rmrru;
4566 	struct dmar_atsr_unit *atsru;
4567 	struct acpi_dmar_atsr *atsr;
4568 	struct acpi_dmar_reserved_memory *rmrr;
4569 
4570 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4571 		return 0;
4572 
4573 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4574 		rmrr = container_of(rmrru->hdr,
4575 				    struct acpi_dmar_reserved_memory, header);
4576 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4577 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4578 				((void *)rmrr) + rmrr->header.length,
4579 				rmrr->segment, rmrru->devices,
4580 				rmrru->devices_cnt);
4581 			if (ret < 0)
4582 				return ret;
4583 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4584 			dmar_remove_dev_scope(info, rmrr->segment,
4585 				rmrru->devices, rmrru->devices_cnt);
4586 		}
4587 	}
4588 
4589 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4590 		if (atsru->include_all)
4591 			continue;
4592 
4593 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4594 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4595 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4596 					(void *)atsr + atsr->header.length,
4597 					atsr->segment, atsru->devices,
4598 					atsru->devices_cnt);
4599 			if (ret > 0)
4600 				break;
4601 			else if (ret < 0)
4602 				return ret;
4603 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4604 			if (dmar_remove_dev_scope(info, atsr->segment,
4605 					atsru->devices, atsru->devices_cnt))
4606 				break;
4607 		}
4608 	}
4609 
4610 	return 0;
4611 }
4612 
4613 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4614 				       unsigned long val, void *v)
4615 {
4616 	struct memory_notify *mhp = v;
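	/* Convert host PFNs from the hotplug notification to VT-d (4KiB) page frame numbers. */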
4617 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4618 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4619 			mhp->nr_pages - 1);
4620 
4621 	switch (val) {
4622 	case MEM_GOING_ONLINE:
4623 		if (iommu_domain_identity_map(si_domain,
4624 					      start_vpfn, last_vpfn)) {
4625 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4626 				start_vpfn, last_vpfn);
4627 			return NOTIFY_BAD;
4628 		}
4629 		break;
4630 
4631 	case MEM_OFFLINE:
4632 	case MEM_CANCEL_ONLINE:
4633 		{
4634 			struct dmar_drhd_unit *drhd;
4635 			struct intel_iommu *iommu;
4636 			struct page *freelist;
4637 
4638 			freelist = domain_unmap(si_domain,
4639 						start_vpfn, last_vpfn);
4640 
4641 			rcu_read_lock();
4642 			for_each_active_iommu(iommu, drhd)
4643 				iommu_flush_iotlb_psi(iommu, si_domain,
4644 					start_vpfn, mhp->nr_pages,
4645 					!freelist, 0);
4646 			rcu_read_unlock();
4647 			dma_free_pagelist(freelist);
4648 		}
4649 		break;
4650 	}
4651 
4652 	return NOTIFY_OK;
4653 }
4654 
4655 static struct notifier_block intel_iommu_memory_nb = {
4656 	.notifier_call = intel_iommu_memory_notifier,
4657 	.priority = 0
4658 };
4659 
4660 static void free_all_cpu_cached_iovas(unsigned int cpu)
4661 {
4662 	int i;
4663 
4664 	for (i = 0; i < g_num_of_iommus; i++) {
4665 		struct intel_iommu *iommu = g_iommus[i];
4666 		struct dmar_domain *domain;
4667 		int did;
4668 
4669 		if (!iommu)
4670 			continue;
4671 
4672 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4673 			domain = get_iommu_domain(iommu, (u16)did);
4674 
4675 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4676 				continue;
4677 
4678 			free_cpu_cached_iovas(cpu, &domain->iovad);
4679 		}
4680 	}
4681 }
4682 
4683 static int intel_iommu_cpu_dead(unsigned int cpu)
4684 {
4685 	free_all_cpu_cached_iovas(cpu);
4686 	return 0;
4687 }
4688 
4689 static void intel_disable_iommus(void)
4690 {
4691 	struct intel_iommu *iommu = NULL;
4692 	struct dmar_drhd_unit *drhd;
4693 
4694 	for_each_iommu(iommu, drhd)
4695 		iommu_disable_translation(iommu);
4696 }
4697 
4698 void intel_iommu_shutdown(void)
4699 {
4700 	struct dmar_drhd_unit *drhd;
4701 	struct intel_iommu *iommu = NULL;
4702 
4703 	if (no_iommu || dmar_disabled)
4704 		return;
4705 
4706 	down_write(&dmar_global_lock);
4707 
4708 	/* Disable PMRs explicitly here. */
4709 	for_each_iommu(iommu, drhd)
4710 		iommu_disable_protect_mem_regions(iommu);
4711 
4712 	/* Make sure the IOMMUs are switched off */
4713 	intel_disable_iommus();
4714 
4715 	up_write(&dmar_global_lock);
4716 }
4717 
4718 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4719 {
4720 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4721 
4722 	return container_of(iommu_dev, struct intel_iommu, iommu);
4723 }
4724 
4725 static ssize_t intel_iommu_show_version(struct device *dev,
4726 					struct device_attribute *attr,
4727 					char *buf)
4728 {
4729 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4730 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4731 	return sprintf(buf, "%d:%d\n",
4732 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4733 }
4734 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4735 
4736 static ssize_t intel_iommu_show_address(struct device *dev,
4737 					struct device_attribute *attr,
4738 					char *buf)
4739 {
4740 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4741 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4742 }
4743 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4744 
4745 static ssize_t intel_iommu_show_cap(struct device *dev,
4746 				    struct device_attribute *attr,
4747 				    char *buf)
4748 {
4749 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4750 	return sprintf(buf, "%llx\n", iommu->cap);
4751 }
4752 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4753 
4754 static ssize_t intel_iommu_show_ecap(struct device *dev,
4755 				    struct device_attribute *attr,
4756 				    char *buf)
4757 {
4758 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4759 	return sprintf(buf, "%llx\n", iommu->ecap);
4760 }
4761 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4762 
4763 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4764 				      struct device_attribute *attr,
4765 				      char *buf)
4766 {
4767 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4768 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4769 }
4770 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4771 
4772 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4773 					   struct device_attribute *attr,
4774 					   char *buf)
4775 {
4776 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4777 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4778 						  cap_ndoms(iommu->cap)));
4779 }
4780 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4781 
4782 static struct attribute *intel_iommu_attrs[] = {
4783 	&dev_attr_version.attr,
4784 	&dev_attr_address.attr,
4785 	&dev_attr_cap.attr,
4786 	&dev_attr_ecap.attr,
4787 	&dev_attr_domains_supported.attr,
4788 	&dev_attr_domains_used.attr,
4789 	NULL,
4790 };
4791 
4792 static struct attribute_group intel_iommu_group = {
4793 	.name = "intel-iommu",
4794 	.attrs = intel_iommu_attrs,
4795 };
4796 
4797 const struct attribute_group *intel_iommu_groups[] = {
4798 	&intel_iommu_group,
4799 	NULL,
4800 };
4801 
4802 static inline bool has_external_pci(void)
4803 {
4804 	struct pci_dev *pdev = NULL;
4805 
4806 	for_each_pci_dev(pdev)
4807 		if (pdev->external_facing)
4808 			return true;
4809 
4810 	return false;
4811 }
4812 
4813 static int __init platform_optin_force_iommu(void)
4814 {
4815 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4816 		return 0;
4817 
4818 	if (no_iommu || dmar_disabled)
4819 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4820 
4821 	/*
4822 	 * If Intel-IOMMU is disabled by default, we will apply identity
4823 	 * map for all devices except those marked as being untrusted.
4824 	 */
4825 	if (dmar_disabled)
4826 		iommu_set_default_passthrough(false);
4827 
4828 	dmar_disabled = 0;
4829 	no_iommu = 0;
4830 
4831 	return 1;
4832 }
4833 
4834 static int __init probe_acpi_namespace_devices(void)
4835 {
4836 	struct dmar_drhd_unit *drhd;
4837 	/* To avoid a -Wunused-but-set-variable warning. */
4838 	struct intel_iommu *iommu __maybe_unused;
4839 	struct device *dev;
4840 	int i, ret = 0;
4841 
4842 	for_each_active_iommu(iommu, drhd) {
4843 		for_each_active_dev_scope(drhd->devices,
4844 					  drhd->devices_cnt, i, dev) {
4845 			struct acpi_device_physical_node *pn;
4846 			struct iommu_group *group;
4847 			struct acpi_device *adev;
4848 
4849 			if (dev->bus != &acpi_bus_type)
4850 				continue;
4851 
4852 			adev = to_acpi_device(dev);
4853 			mutex_lock(&adev->physical_node_lock);
4854 			list_for_each_entry(pn,
4855 					    &adev->physical_node_list, node) {
4856 				group = iommu_group_get(pn->dev);
4857 				if (group) {
4858 					iommu_group_put(group);
4859 					continue;
4860 				}
4861 
4862 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4863 				ret = iommu_probe_device(pn->dev);
4864 				if (ret)
4865 					break;
4866 			}
4867 			mutex_unlock(&adev->physical_node_lock);
4868 
4869 			if (ret)
4870 				return ret;
4871 		}
4872 	}
4873 
4874 	return 0;
4875 }
4876 
4877 int __init intel_iommu_init(void)
4878 {
4879 	int ret = -ENODEV;
4880 	struct dmar_drhd_unit *drhd;
4881 	struct intel_iommu *iommu;
4882 
4883 	/*
4884 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4885 	 * opt in, so enforce that.
4886 	 */
4887 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4888 		    platform_optin_force_iommu();
4889 
4890 	if (iommu_init_mempool()) {
4891 		if (force_on)
4892 			panic("tboot: Failed to initialize iommu memory\n");
4893 		return -ENOMEM;
4894 	}
4895 
4896 	down_write(&dmar_global_lock);
4897 	if (dmar_table_init()) {
4898 		if (force_on)
4899 			panic("tboot: Failed to initialize DMAR table\n");
4900 		goto out_free_dmar;
4901 	}
4902 
4903 	if (dmar_dev_scope_init() < 0) {
4904 		if (force_on)
4905 			panic("tboot: Failed to initialize DMAR device scope\n");
4906 		goto out_free_dmar;
4907 	}
4908 
4909 	up_write(&dmar_global_lock);
4910 
4911 	/*
4912 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4913 	 * complain later when we register it under the lock.
4914 	 */
4915 	dmar_register_bus_notifier();
4916 
4917 	down_write(&dmar_global_lock);
4918 
4919 	if (!no_iommu)
4920 		intel_iommu_debugfs_init();
4921 
4922 	if (no_iommu || dmar_disabled) {
4923 		/*
4924 		 * We exit the function here to ensure IOMMU's remapping and
4925 		 * mempool aren't setup, which means that the IOMMU's PMRs
4926 		 * won't be disabled via the call to init_dmars(). So disable
4927 		 * it explicitly here. The PMRs were setup by tboot prior to
4928 		 * calling SENTER, but the kernel is expected to reset/tear
4929 		 * down the PMRs.
4930 		 */
4931 		if (intel_iommu_tboot_noforce) {
4932 			for_each_iommu(iommu, drhd)
4933 				iommu_disable_protect_mem_regions(iommu);
4934 		}
4935 
4936 		/*
4937 		 * Make sure the IOMMUs are switched off, even when we
4938 		 * boot into a kexec kernel and the previous kernel left
4939 		 * them enabled
4940 		 */
4941 		intel_disable_iommus();
4942 		goto out_free_dmar;
4943 	}
4944 
4945 	if (list_empty(&dmar_rmrr_units))
4946 		pr_info("No RMRR found\n");
4947 
4948 	if (list_empty(&dmar_atsr_units))
4949 		pr_info("No ATSR found\n");
4950 
4951 	if (dmar_init_reserved_ranges()) {
4952 		if (force_on)
4953 			panic("tboot: Failed to reserve iommu ranges\n");
4954 		goto out_free_reserved_range;
4955 	}
4956 
4957 	if (dmar_map_gfx)
4958 		intel_iommu_gfx_mapped = 1;
4959 
4960 	init_no_remapping_devices();
4961 
4962 	ret = init_dmars();
4963 	if (ret) {
4964 		if (force_on)
4965 			panic("tboot: Failed to initialize DMARs\n");
4966 		pr_err("Initialization failed\n");
4967 		goto out_free_reserved_range;
4968 	}
4969 	up_write(&dmar_global_lock);
4970 
4971 	init_iommu_pm_ops();
4972 
4973 	down_read(&dmar_global_lock);
4974 	for_each_active_iommu(iommu, drhd) {
4975 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4976 				       intel_iommu_groups,
4977 				       "%s", iommu->name);
4978 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4979 		iommu_device_register(&iommu->iommu);
4980 	}
4981 	up_read(&dmar_global_lock);
4982 
4983 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4984 	if (si_domain && !hw_pass_through)
4985 		register_memory_notifier(&intel_iommu_memory_nb);
4986 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4987 			  intel_iommu_cpu_dead);
4988 
4989 	down_read(&dmar_global_lock);
4990 	if (probe_acpi_namespace_devices())
4991 		pr_warn("ACPI name space devices didn't probe correctly\n");
4992 
4993 	/* Finally, we enable the DMA remapping hardware. */
4994 	for_each_iommu(iommu, drhd) {
4995 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4996 			iommu_enable_translation(iommu);
4997 
4998 		iommu_disable_protect_mem_regions(iommu);
4999 	}
5000 	up_read(&dmar_global_lock);
5001 
5002 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5003 
5004 	intel_iommu_enabled = 1;
5005 
5006 	return 0;
5007 
5008 out_free_reserved_range:
5009 	put_iova_domain(&reserved_iova_list);
5010 out_free_dmar:
5011 	intel_iommu_free_dmars();
5012 	up_write(&dmar_global_lock);
5013 	iommu_exit_mempool();
5014 	return ret;
5015 }
5016 
5017 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5018 {
5019 	struct intel_iommu *iommu = opaque;
5020 
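	/* 'alias' is a 16-bit requester ID: bus number in the high byte, devfn in the low byte. */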
5021 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5022 	return 0;
5023 }
5024 
5025 /*
5026  * NB - intel-iommu lacks any sort of reference counting for the users of
5027  * dependent devices.  If multiple endpoints have intersecting dependent
5028  * devices, unbinding the driver from any one of them will possibly leave
5029  * the others unable to operate.
5030  */
5031 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5032 {
5033 	if (!iommu || !dev || !dev_is_pci(dev))
5034 		return;
5035 
5036 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5037 }
5038 
5039 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5040 {
5041 	struct dmar_domain *domain;
5042 	struct intel_iommu *iommu;
5043 	unsigned long flags;
5044 
5045 	assert_spin_locked(&device_domain_lock);
5046 
5047 	if (WARN_ON(!info))
5048 		return;
5049 
5050 	iommu = info->iommu;
5051 	domain = info->domain;
5052 
5053 	if (info->dev) {
5054 		if (dev_is_pci(info->dev) && sm_supported(iommu))
5055 			intel_pasid_tear_down_entry(iommu, info->dev,
5056 					PASID_RID2PASID, false);
5057 
5058 		iommu_disable_dev_iotlb(info);
5059 		if (!dev_is_real_dma_subdevice(info->dev))
5060 			domain_context_clear(iommu, info->dev);
5061 		intel_pasid_free_table(info->dev);
5062 	}
5063 
5064 	unlink_domain_info(info);
5065 
5066 	spin_lock_irqsave(&iommu->lock, flags);
5067 	domain_detach_iommu(domain, iommu);
5068 	spin_unlock_irqrestore(&iommu->lock, flags);
5069 
5070 	free_devinfo_mem(info);
5071 }
5072 
5073 static void dmar_remove_one_dev_info(struct device *dev)
5074 {
5075 	struct device_domain_info *info;
5076 	unsigned long flags;
5077 
5078 	spin_lock_irqsave(&device_domain_lock, flags);
5079 	info = get_domain_info(dev);
5080 	if (info)
5081 		__dmar_remove_one_dev_info(info);
5082 	spin_unlock_irqrestore(&device_domain_lock, flags);
5083 }
5084 
5085 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5086 {
5087 	int adjust_width;
5088 
5089 	/* calculate AGAW */
5090 	domain->gaw = guest_width;
5091 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5092 	domain->agaw = width_to_agaw(adjust_width);
5093 
5094 	domain->iommu_coherency = 0;
5095 	domain->iommu_snooping = 0;
5096 	domain->iommu_superpage = 0;
5097 	domain->max_addr = 0;
5098 
5099 	/* always allocate the top pgd */
5100 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5101 	if (!domain->pgd)
5102 		return -ENOMEM;
5103 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5104 	return 0;
5105 }
5106 
5107 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5108 {
5109 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5110 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5111 
5112 	if (!intel_iommu_strict &&
5113 	    init_iova_flush_queue(&dmar_domain->iovad,
5114 				  iommu_flush_iova, iova_entry_free))
5115 		pr_warn("iova flush queue initialization failed\n");
5116 }
5117 
5118 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5119 {
5120 	struct dmar_domain *dmar_domain;
5121 	struct iommu_domain *domain;
5122 
5123 	switch (type) {
5124 	case IOMMU_DOMAIN_DMA:
5125 	case IOMMU_DOMAIN_UNMANAGED:
5126 		dmar_domain = alloc_domain(0);
5127 		if (!dmar_domain) {
5128 			pr_err("Can't allocate dmar_domain\n");
5129 			return NULL;
5130 		}
5131 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5132 			pr_err("Domain initialization failed\n");
5133 			domain_exit(dmar_domain);
5134 			return NULL;
5135 		}
5136 
5137 		if (type == IOMMU_DOMAIN_DMA)
5138 			intel_init_iova_domain(dmar_domain);
5139 
5140 		domain = &dmar_domain->domain;
5141 		domain->geometry.aperture_start = 0;
5142 		domain->geometry.aperture_end   =
5143 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5144 		domain->geometry.force_aperture = true;
5145 
5146 		return domain;
5147 	case IOMMU_DOMAIN_IDENTITY:
5148 		return &si_domain->domain;
5149 	default:
5150 		return NULL;
5151 	}
5152 
5153 	return NULL;
5154 }
5155 
5156 static void intel_iommu_domain_free(struct iommu_domain *domain)
5157 {
5158 	if (domain != &si_domain->domain)
5159 		domain_exit(to_dmar_domain(domain));
5160 }
5161 
5162 /*
5163  * Check whether a @domain could be attached to the @dev through the
5164  * aux-domain attach/detach APIs.
5165  */
5166 static inline bool
5167 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5168 {
5169 	struct device_domain_info *info = get_domain_info(dev);
5170 
5171 	return info && info->auxd_enabled &&
5172 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5173 }
5174 
5175 static void auxiliary_link_device(struct dmar_domain *domain,
5176 				  struct device *dev)
5177 {
5178 	struct device_domain_info *info = get_domain_info(dev);
5179 
5180 	assert_spin_locked(&device_domain_lock);
5181 	if (WARN_ON(!info))
5182 		return;
5183 
5184 	domain->auxd_refcnt++;
5185 	list_add(&domain->auxd, &info->auxiliary_domains);
5186 }
5187 
5188 static void auxiliary_unlink_device(struct dmar_domain *domain,
5189 				    struct device *dev)
5190 {
5191 	struct device_domain_info *info = get_domain_info(dev);
5192 
5193 	assert_spin_locked(&device_domain_lock);
5194 	if (WARN_ON(!info))
5195 		return;
5196 
5197 	list_del(&domain->auxd);
5198 	domain->auxd_refcnt--;
5199 
5200 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5201 		ioasid_free(domain->default_pasid);
5202 }
5203 
5204 static int aux_domain_add_dev(struct dmar_domain *domain,
5205 			      struct device *dev)
5206 {
5207 	int ret;
5208 	unsigned long flags;
5209 	struct intel_iommu *iommu;
5210 
5211 	iommu = device_to_iommu(dev, NULL, NULL);
5212 	if (!iommu)
5213 		return -ENODEV;
5214 
5215 	if (domain->default_pasid <= 0) {
5216 		u32 pasid;
5217 
5218 		/* No private data needed for the default pasid */
5219 		pasid = ioasid_alloc(NULL, PASID_MIN,
5220 				     pci_max_pasids(to_pci_dev(dev)) - 1,
5221 				     NULL);
5222 		if (pasid == INVALID_IOASID) {
5223 			pr_err("Can't allocate default pasid\n");
5224 			return -ENODEV;
5225 		}
5226 		domain->default_pasid = pasid;
5227 	}
5228 
5229 	spin_lock_irqsave(&device_domain_lock, flags);
5230 	/*
5231 	 * iommu->lock must be held to attach domain to iommu and setup the
5232 	 * pasid entry for second level translation.
5233 	 */
5234 	spin_lock(&iommu->lock);
5235 	ret = domain_attach_iommu(domain, iommu);
5236 	if (ret)
5237 		goto attach_failed;
5238 
5239 	/* Setup the PASID entry for mediated devices: */
5240 	if (domain_use_first_level(domain))
5241 		ret = domain_setup_first_level(iommu, domain, dev,
5242 					       domain->default_pasid);
5243 	else
5244 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5245 						     domain->default_pasid);
5246 	if (ret)
5247 		goto table_failed;
5248 	spin_unlock(&iommu->lock);
5249 
5250 	auxiliary_link_device(domain, dev);
5251 
5252 	spin_unlock_irqrestore(&device_domain_lock, flags);
5253 
5254 	return 0;
5255 
5256 table_failed:
5257 	domain_detach_iommu(domain, iommu);
5258 attach_failed:
5259 	spin_unlock(&iommu->lock);
5260 	spin_unlock_irqrestore(&device_domain_lock, flags);
5261 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5262 		ioasid_free(domain->default_pasid);
5263 
5264 	return ret;
5265 }
5266 
5267 static void aux_domain_remove_dev(struct dmar_domain *domain,
5268 				  struct device *dev)
5269 {
5270 	struct device_domain_info *info;
5271 	struct intel_iommu *iommu;
5272 	unsigned long flags;
5273 
5274 	if (!is_aux_domain(dev, &domain->domain))
5275 		return;
5276 
5277 	spin_lock_irqsave(&device_domain_lock, flags);
5278 	info = get_domain_info(dev);
5279 	iommu = info->iommu;
5280 
5281 	auxiliary_unlink_device(domain, dev);
5282 
5283 	spin_lock(&iommu->lock);
5284 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5285 	domain_detach_iommu(domain, iommu);
5286 	spin_unlock(&iommu->lock);
5287 
5288 	spin_unlock_irqrestore(&device_domain_lock, flags);
5289 }
5290 
5291 static int prepare_domain_attach_device(struct iommu_domain *domain,
5292 					struct device *dev)
5293 {
5294 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5295 	struct intel_iommu *iommu;
5296 	int addr_width;
5297 
5298 	iommu = device_to_iommu(dev, NULL, NULL);
5299 	if (!iommu)
5300 		return -ENODEV;
5301 
5302 	/* check if this iommu agaw is sufficient for max mapped address */
5303 	addr_width = agaw_to_width(iommu->agaw);
5304 	if (addr_width > cap_mgaw(iommu->cap))
5305 		addr_width = cap_mgaw(iommu->cap);
5306 
5307 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5308 		dev_err(dev, "%s: iommu width (%d) is not "
5309 		        "sufficient for the mapped address (%llx)\n",
5310 		        __func__, addr_width, dmar_domain->max_addr);
5311 		return -EFAULT;
5312 	}
5313 	dmar_domain->gaw = addr_width;
5314 
5315 	/*
5316 	 * Knock out extra levels of page tables if necessary
5317 	 */
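	/*
	 * After the gaw clamp above, only the lowest slot of each extra top
	 * level can be in use, so a wider domain (e.g. 4-level) attached to a
	 * narrower IOMMU (e.g. 3-level) just descends into that slot and frees
	 * the old top table.
	 */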
5318 	while (iommu->agaw < dmar_domain->agaw) {
5319 		struct dma_pte *pte;
5320 
5321 		pte = dmar_domain->pgd;
5322 		if (dma_pte_present(pte)) {
5323 			dmar_domain->pgd = (struct dma_pte *)
5324 				phys_to_virt(dma_pte_addr(pte));
5325 			free_pgtable_page(pte);
5326 		}
5327 		dmar_domain->agaw--;
5328 	}
5329 
5330 	return 0;
5331 }
5332 
5333 static int intel_iommu_attach_device(struct iommu_domain *domain,
5334 				     struct device *dev)
5335 {
5336 	int ret;
5337 
5338 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5339 	    device_is_rmrr_locked(dev)) {
5340 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5341 		return -EPERM;
5342 	}
5343 
5344 	if (is_aux_domain(dev, domain))
5345 		return -EPERM;
5346 
5347 	/* normally dev is not mapped */
5348 	if (unlikely(domain_context_mapped(dev))) {
5349 		struct dmar_domain *old_domain;
5350 
5351 		old_domain = find_domain(dev);
5352 		if (old_domain)
5353 			dmar_remove_one_dev_info(dev);
5354 	}
5355 
5356 	ret = prepare_domain_attach_device(domain, dev);
5357 	if (ret)
5358 		return ret;
5359 
5360 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5361 }
5362 
5363 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5364 					 struct device *dev)
5365 {
5366 	int ret;
5367 
5368 	if (!is_aux_domain(dev, domain))
5369 		return -EPERM;
5370 
5371 	ret = prepare_domain_attach_device(domain, dev);
5372 	if (ret)
5373 		return ret;
5374 
5375 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5376 }
5377 
5378 static void intel_iommu_detach_device(struct iommu_domain *domain,
5379 				      struct device *dev)
5380 {
5381 	dmar_remove_one_dev_info(dev);
5382 }
5383 
5384 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5385 					  struct device *dev)
5386 {
5387 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5388 }
5389 
5390 /*
5391  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5392  * VT-d granularity. Invalidation is typically included in the unmap operation
5393  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5394  * owns the first level page tables. Invalidations of translation caches in the
5395  * guest are trapped and passed down to the host.
5396  *
5397  * vIOMMU in the guest will only expose first level page tables, therefore
5398  * we do not support IOTLB granularity for requests without PASID (second level).
5399  *
5400  * For example, to find the VT-d granularity encoding for IOTLB
5401  * type and page selective granularity within PASID:
5402  * X: indexed by iommu cache type
5403  * Y: indexed by enum iommu_inv_granularity
5404  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5405  */
5406 
5407 static const int
5408 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5409 	/*
5410 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5411 	 * page selective (address granularity)
5412 	 */
5413 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5414 	/* PASID based dev TLBs */
5415 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5416 	/* PASID cache */
5417 	{-EINVAL, -EINVAL, -EINVAL}
5418 };
5419 
5420 static inline int to_vtd_granularity(int type, int granu)
5421 {
5422 	return inv_type_granu_table[type][granu];
5423 }
5424 
5425 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5426 {
5427 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5428 
5429 	/* VT-d size is encoded as 2^size of 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
5430 	 * The IOMMU cache invalidate API passes granu_size in bytes and the number
5431 	 * of granules that are contiguous in memory.
5432 	 */
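	/* e.g. granu_size = 4KiB and nr_granules = 512 gives nr_pages = 512, order 9 (a 2MiB range). */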
5433 	return order_base_2(nr_pages);
5434 }
5435 
5436 #ifdef CONFIG_INTEL_IOMMU_SVM
5437 static int
5438 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5439 			   struct iommu_cache_invalidate_info *inv_info)
5440 {
5441 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5442 	struct device_domain_info *info;
5443 	struct intel_iommu *iommu;
5444 	unsigned long flags;
5445 	int cache_type;
5446 	u8 bus, devfn;
5447 	u16 did, sid;
5448 	int ret = 0;
5449 	u64 size = 0;
5450 
5451 	if (!inv_info || !dmar_domain)
5452 		return -EINVAL;
5453 
5454 	if (!dev || !dev_is_pci(dev))
5455 		return -ENODEV;
5456 
5457 	iommu = device_to_iommu(dev, &bus, &devfn);
5458 	if (!iommu)
5459 		return -ENODEV;
5460 
5461 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5462 		return -EINVAL;
5463 
5464 	spin_lock_irqsave(&device_domain_lock, flags);
5465 	spin_lock(&iommu->lock);
5466 	info = get_domain_info(dev);
5467 	if (!info) {
5468 		ret = -EINVAL;
5469 		goto out_unlock;
5470 	}
5471 	did = dmar_domain->iommu_did[iommu->seq_id];
5472 	sid = PCI_DEVID(bus, devfn);
5473 
5474 	/* Size is only valid in address selective invalidation */
5475 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5476 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5477 				   inv_info->granu.addr_info.nb_granules);
5478 
5479 	for_each_set_bit(cache_type,
5480 			 (unsigned long *)&inv_info->cache,
5481 			 IOMMU_CACHE_INV_TYPE_NR) {
5482 		int granu = 0;
5483 		u64 pasid = 0;
5484 		u64 addr = 0;
5485 
5486 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5487 		if (granu == -EINVAL) {
5488 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5489 					   cache_type, inv_info->granularity);
5490 			break;
5491 		}
5492 
5493 		/*
5494 		 * PASID is stored in different locations based on the
5495 		 * granularity.
5496 		 */
5497 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5498 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5499 			pasid = inv_info->granu.pasid_info.pasid;
5500 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5501 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5502 			pasid = inv_info->granu.addr_info.pasid;
5503 
5504 		switch (BIT(cache_type)) {
5505 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5506 			/* HW will ignore LSB bits based on address mask */
5507 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5508 			    size &&
5509 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5510 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5511 						   inv_info->granu.addr_info.addr, size);
5512 			}
5513 
5514 			/*
5515 			 * If granu is PASID-selective, address is ignored.
5516 			 * We use npages = -1 to indicate that.
5517 			 */
5518 			qi_flush_piotlb(iommu, did, pasid,
5519 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5520 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5521 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5522 
5523 			if (!info->ats_enabled)
5524 				break;
5525 			/*
5526 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5527 			 * in the guest may assume IOTLB flush is inclusive,
5528 			 * which is more efficient.
5529 			 */
5530 			fallthrough;
5531 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5532 			/*
5533 			 * PASID based device TLB invalidation does not support
5534 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5535 			 * IOMMU_INV_GRANU_ADDR.
5536 			 * The equivalent of that is we set the size to be the
5537 			 * entire 64-bit address range. The user only provides
5538 			 * PASID info without address info, so we set addr to 0.
5539 			 */
5540 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5541 				size = 64 - VTD_PAGE_SHIFT;
5542 				addr = 0;
5543 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5544 				addr = inv_info->granu.addr_info.addr;
5545 			}
5546 
5547 			if (info->ats_enabled)
5548 				qi_flush_dev_iotlb_pasid(iommu, sid,
5549 						info->pfsid, pasid,
5550 						info->ats_qdep, addr,
5551 						size);
5552 			else
5553 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5554 			break;
5555 		default:
5556 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5557 					    cache_type);
5558 			ret = -EINVAL;
5559 		}
5560 	}
5561 out_unlock:
5562 	spin_unlock(&iommu->lock);
5563 	spin_unlock_irqrestore(&device_domain_lock, flags);
5564 
5565 	return ret;
5566 }
5567 #endif
5568 
5569 static int intel_iommu_map(struct iommu_domain *domain,
5570 			   unsigned long iova, phys_addr_t hpa,
5571 			   size_t size, int iommu_prot, gfp_t gfp)
5572 {
5573 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5574 	u64 max_addr;
5575 	int prot = 0;
5576 	int ret;
5577 
5578 	if (iommu_prot & IOMMU_READ)
5579 		prot |= DMA_PTE_READ;
5580 	if (iommu_prot & IOMMU_WRITE)
5581 		prot |= DMA_PTE_WRITE;
5582 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5583 		prot |= DMA_PTE_SNP;
5584 
5585 	max_addr = iova + size;
5586 	if (dmar_domain->max_addr < max_addr) {
5587 		u64 end;
5588 
5589 		/* check if minimum agaw is sufficient for mapped address */
5590 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5591 		if (end < max_addr) {
5592 			pr_err("%s: iommu width (%d) is not "
5593 			       "sufficient for the mapped address (%llx)\n",
5594 			       __func__, dmar_domain->gaw, max_addr);
5595 			return -EFAULT;
5596 		}
5597 		dmar_domain->max_addr = max_addr;
5598 	}
5599 	/* Round up so that the low bits of hpa plus the size don't take us onto
5600 	   the next VTD_PAGE; from here on 'size' is a count of VT-d pages, not bytes. */
5601 	size = aligned_nrpages(hpa, size);
5602 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5603 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5604 	return ret;
5605 }
5606 
5607 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5608 				unsigned long iova, size_t size,
5609 				struct iommu_iotlb_gather *gather)
5610 {
5611 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5612 	struct page *freelist = NULL;
5613 	unsigned long start_pfn, last_pfn;
5614 	unsigned int npages;
5615 	int iommu_id, level = 0;
5616 
5617 	/* Cope with horrid API which requires us to unmap more than the
5618 	   size argument if it happens to be a large-page mapping. */
5619 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5620 
5621 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5622 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5623 
5624 	start_pfn = iova >> VTD_PAGE_SHIFT;
5625 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5626 
5627 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5628 
5629 	npages = last_pfn - start_pfn + 1;
5630 
5631 	for_each_domain_iommu(iommu_id, dmar_domain)
5632 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5633 				      start_pfn, npages, !freelist, 0);
5634 
5635 	dma_free_pagelist(freelist);
5636 
5637 	if (dmar_domain->max_addr == iova + size)
5638 		dmar_domain->max_addr = iova;
5639 
5640 	return size;
5641 }
5642 
5643 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5644 					    dma_addr_t iova)
5645 {
5646 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5647 	struct dma_pte *pte;
5648 	int level = 0;
5649 	u64 phys = 0;
5650 
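	/*
	 * pfn_to_dma_pte() returns the leaf PTE and its level; the low
	 * level_to_offset_bits(level) + VTD_PAGE_SHIFT bits of the IOVA are
	 * the offset into the (possibly super)page that the PTE maps.
	 */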
5651 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5652 	if (pte && dma_pte_present(pte))
5653 		phys = dma_pte_addr(pte) +
5654 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5655 						VTD_PAGE_SHIFT) - 1));
5656 
5657 	return phys;
5658 }
5659 
5660 static inline bool scalable_mode_support(void)
5661 {
5662 	struct dmar_drhd_unit *drhd;
5663 	struct intel_iommu *iommu;
5664 	bool ret = true;
5665 
5666 	rcu_read_lock();
5667 	for_each_active_iommu(iommu, drhd) {
5668 		if (!sm_supported(iommu)) {
5669 			ret = false;
5670 			break;
5671 		}
5672 	}
5673 	rcu_read_unlock();
5674 
5675 	return ret;
5676 }
5677 
5678 static inline bool iommu_pasid_support(void)
5679 {
5680 	struct dmar_drhd_unit *drhd;
5681 	struct intel_iommu *iommu;
5682 	bool ret = true;
5683 
5684 	rcu_read_lock();
5685 	for_each_active_iommu(iommu, drhd) {
5686 		if (!pasid_supported(iommu)) {
5687 			ret = false;
5688 			break;
5689 		}
5690 	}
5691 	rcu_read_unlock();
5692 
5693 	return ret;
5694 }
5695 
5696 static inline bool nested_mode_support(void)
5697 {
5698 	struct dmar_drhd_unit *drhd;
5699 	struct intel_iommu *iommu;
5700 	bool ret = true;
5701 
5702 	rcu_read_lock();
5703 	for_each_active_iommu(iommu, drhd) {
5704 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5705 			ret = false;
5706 			break;
5707 		}
5708 	}
5709 	rcu_read_unlock();
5710 
5711 	return ret;
5712 }
5713 
5714 static bool intel_iommu_capable(enum iommu_cap cap)
5715 {
5716 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5717 		return domain_update_iommu_snooping(NULL) == 1;
5718 	if (cap == IOMMU_CAP_INTR_REMAP)
5719 		return irq_remapping_enabled == 1;
5720 
5721 	return false;
5722 }
5723 
5724 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5725 {
5726 	struct intel_iommu *iommu;
5727 
5728 	iommu = device_to_iommu(dev, NULL, NULL);
5729 	if (!iommu)
5730 		return ERR_PTR(-ENODEV);
5731 
5732 	if (translation_pre_enabled(iommu))
5733 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5734 
5735 	return &iommu->iommu;
5736 }
5737 
5738 static void intel_iommu_release_device(struct device *dev)
5739 {
5740 	struct intel_iommu *iommu;
5741 
5742 	iommu = device_to_iommu(dev, NULL, NULL);
5743 	if (!iommu)
5744 		return;
5745 
5746 	dmar_remove_one_dev_info(dev);
5747 
5748 	set_dma_ops(dev, NULL);
5749 }
5750 
5751 static void intel_iommu_probe_finalize(struct device *dev)
5752 {
5753 	struct iommu_domain *domain;
5754 
5755 	domain = iommu_get_domain_for_dev(dev);
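	/*
	 * Untrusted devices get the bounce-buffered DMA ops, devices in a
	 * DMA-API domain get the Intel IOMMU DMA ops, and anything else
	 * (e.g. identity-mapped devices) falls back to direct DMA.
	 */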
5756 	if (device_needs_bounce(dev))
5757 		set_dma_ops(dev, &bounce_dma_ops);
5758 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5759 		set_dma_ops(dev, &intel_dma_ops);
5760 	else
5761 		set_dma_ops(dev, NULL);
5762 }
5763 
5764 static void intel_iommu_get_resv_regions(struct device *device,
5765 					 struct list_head *head)
5766 {
5767 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5768 	struct iommu_resv_region *reg;
5769 	struct dmar_rmrr_unit *rmrr;
5770 	struct device *i_dev;
5771 	int i;
5772 
5773 	down_read(&dmar_global_lock);
5774 	for_each_rmrr_units(rmrr) {
5775 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5776 					  i, i_dev) {
5777 			struct iommu_resv_region *resv;
5778 			enum iommu_resv_type type;
5779 			size_t length;
5780 
5781 			if (i_dev != device &&
5782 			    !is_downstream_to_pci_bridge(device, i_dev))
5783 				continue;
5784 
5785 			length = rmrr->end_address - rmrr->base_address + 1;
5786 
5787 			type = device_rmrr_is_relaxable(device) ?
5788 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5789 
5790 			resv = iommu_alloc_resv_region(rmrr->base_address,
5791 						       length, prot, type);
5792 			if (!resv)
5793 				break;
5794 
5795 			list_add_tail(&resv->list, head);
5796 		}
5797 	}
5798 	up_read(&dmar_global_lock);
5799 
5800 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5801 	if (dev_is_pci(device)) {
5802 		struct pci_dev *pdev = to_pci_dev(device);
5803 
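		/*
		 * The floppy workaround direct-maps the first 16MiB for
		 * devices behind an ISA bridge, since legacy floppy drivers
		 * are known to bypass the DMA API.
		 */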
5804 		if (IS_ISA_DEVICE(pdev)) {
5805 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5806 						   IOMMU_RESV_DIRECT_RELAXABLE);
5807 			if (reg)
5808 				list_add_tail(&reg->list, head);
5809 		}
5810 	}
5811 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5812 
5813 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5814 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5815 				      0, IOMMU_RESV_MSI);
5816 	if (!reg)
5817 		return;
5818 	list_add_tail(&reg->list, head);
5819 }
5820 
5821 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5822 {
5823 	struct device_domain_info *info;
5824 	struct context_entry *context;
5825 	struct dmar_domain *domain;
5826 	unsigned long flags;
5827 	u64 ctx_lo;
5828 	int ret;
5829 
5830 	domain = find_domain(dev);
5831 	if (!domain)
5832 		return -EINVAL;
5833 
5834 	spin_lock_irqsave(&device_domain_lock, flags);
5835 	spin_lock(&iommu->lock);
5836 
5837 	ret = -EINVAL;
5838 	info = get_domain_info(dev);
5839 	if (!info || !info->pasid_supported)
5840 		goto out;
5841 
5842 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5843 	if (WARN_ON(!context))
5844 		goto out;
5845 
5846 	ctx_lo = context[0].lo;
5847 
5848 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5849 		ctx_lo |= CONTEXT_PASIDE;
5850 		context[0].lo = ctx_lo;
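		/*
		 * Order the context-entry update before the context-cache
		 * invalidation below so that hardware re-fetches the
		 * updated entry.
		 */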
5851 		wmb();
5852 		iommu->flush.flush_context(iommu,
5853 					   domain->iommu_did[iommu->seq_id],
5854 					   PCI_DEVID(info->bus, info->devfn),
5855 					   DMA_CCMD_MASK_NOBIT,
5856 					   DMA_CCMD_DEVICE_INVL);
5857 	}
5858 
5859 	/* Enable PASID support in the device, if it wasn't already */
5860 	if (!info->pasid_enabled)
5861 		iommu_enable_dev_iotlb(info);
5862 
5863 	ret = 0;
5864 
5865  out:
5866 	spin_unlock(&iommu->lock);
5867 	spin_unlock_irqrestore(&device_domain_lock, flags);
5868 
5869 	return ret;
5870 }
5871 
5872 static void intel_iommu_apply_resv_region(struct device *dev,
5873 					  struct iommu_domain *domain,
5874 					  struct iommu_resv_region *region)
5875 {
5876 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5877 	unsigned long start, end;
5878 
5879 	start = IOVA_PFN(region->start);
5880 	end   = IOVA_PFN(region->start + region->length - 1);
5881 
5882 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5883 }
5884 
5885 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5886 {
5887 	if (dev_is_pci(dev))
5888 		return pci_device_group(dev);
5889 	return generic_device_group(dev);
5890 }
5891 
5892 static int intel_iommu_enable_auxd(struct device *dev)
5893 {
5894 	struct device_domain_info *info;
5895 	struct intel_iommu *iommu;
5896 	unsigned long flags;
5897 	int ret;
5898 
5899 	iommu = device_to_iommu(dev, NULL, NULL);
5900 	if (!iommu || dmar_disabled)
5901 		return -EINVAL;
5902 
5903 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5904 		return -EINVAL;
5905 
5906 	ret = intel_iommu_enable_pasid(iommu, dev);
5907 	if (ret)
5908 		return -ENODEV;
5909 
5910 	spin_lock_irqsave(&device_domain_lock, flags);
5911 	info = get_domain_info(dev);
5912 	info->auxd_enabled = 1;
5913 	spin_unlock_irqrestore(&device_domain_lock, flags);
5914 
5915 	return 0;
5916 }
5917 
5918 static int intel_iommu_disable_auxd(struct device *dev)
5919 {
5920 	struct device_domain_info *info;
5921 	unsigned long flags;
5922 
5923 	spin_lock_irqsave(&device_domain_lock, flags);
5924 	info = get_domain_info(dev);
5925 	if (!WARN_ON(!info))
5926 		info->auxd_enabled = 0;
5927 	spin_unlock_irqrestore(&device_domain_lock, flags);
5928 
5929 	return 0;
5930 }
5931 
5932 /*
5933  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5934  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5935  * spec so that system software and tools can detect endpoint devices that
5936  * support Intel Scalable I/O Virtualization without any host driver
5937  * dependency.
5938  *
5939  * Returns the config space offset of the matching extended capability
5940  * structure, or 0 if the device does not support it.
5941  */
5942 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5943 {
5944 	int pos;
5945 	u16 vendor, id;
5946 
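	/*
	 * 0x23 is the PCI Express DVSEC extended capability ID. Within the
	 * capability, the word at offset 4 is the DVSEC vendor ID and the
	 * word at offset 8 is the DVSEC ID; this code treats DVSEC ID 5
	 * under the Intel vendor ID as the SIOV capability.
	 */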
5947 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5948 	while (pos) {
5949 		pci_read_config_word(pdev, pos + 4, &vendor);
5950 		pci_read_config_word(pdev, pos + 8, &id);
5951 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5952 			return pos;
5953 
5954 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5955 	}
5956 
5957 	return 0;
5958 }
5959 
5960 static bool
5961 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5962 {
5963 	if (feat == IOMMU_DEV_FEAT_AUX) {
5964 		int ret;
5965 
5966 		if (!dev_is_pci(dev) || dmar_disabled ||
5967 		    !scalable_mode_support() || !iommu_pasid_support())
5968 			return false;
5969 
5970 		ret = pci_pasid_features(to_pci_dev(dev));
5971 		if (ret < 0)
5972 			return false;
5973 
5974 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5975 	}
5976 
5977 	if (feat == IOMMU_DEV_FEAT_SVA) {
5978 		struct device_domain_info *info = get_domain_info(dev);
5979 
5980 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5981 			info->pasid_supported && info->pri_supported &&
5982 			info->ats_supported;
5983 	}
5984 
5985 	return false;
5986 }
5987 
5988 static int
5989 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5990 {
5991 	if (feat == IOMMU_DEV_FEAT_AUX)
5992 		return intel_iommu_enable_auxd(dev);
5993 
5994 	if (feat == IOMMU_DEV_FEAT_SVA) {
5995 		struct device_domain_info *info = get_domain_info(dev);
5996 
5997 		if (!info)
5998 			return -EINVAL;
5999 
6000 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6001 			return 0;
6002 	}
6003 
6004 	return -ENODEV;
6005 }
6006 
6007 static int
6008 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6009 {
6010 	if (feat == IOMMU_DEV_FEAT_AUX)
6011 		return intel_iommu_disable_auxd(dev);
6012 
6013 	return -ENODEV;
6014 }
6015 
6016 static bool
6017 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6018 {
6019 	struct device_domain_info *info = get_domain_info(dev);
6020 
6021 	if (feat == IOMMU_DEV_FEAT_AUX)
6022 		return scalable_mode_support() && info && info->auxd_enabled;
6023 
6024 	return false;
6025 }
6026 
6027 static int
6028 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6029 {
6030 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6031 
6032 	return dmar_domain->default_pasid > 0 ?
6033 			dmar_domain->default_pasid : -EINVAL;
6034 }
6035 
6036 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6037 					   struct device *dev)
6038 {
6039 	return attach_deferred(dev);
6040 }
6041 
6042 static int
6043 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6044 			    enum iommu_attr attr, void *data)
6045 {
6046 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6047 	unsigned long flags;
6048 	int ret = 0;
6049 
6050 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6051 		return -EINVAL;
6052 
6053 	switch (attr) {
6054 	case DOMAIN_ATTR_NESTING:
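		/*
		 * Nesting can only be enabled while no devices are attached
		 * to the domain; clearing the first-level flag makes the
		 * domain use second-level translation so that the first
		 * level can be owned by a guest.
		 */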
6055 		spin_lock_irqsave(&device_domain_lock, flags);
6056 		if (nested_mode_support() &&
6057 		    list_empty(&dmar_domain->devices)) {
6058 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6059 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6060 		} else {
6061 			ret = -ENODEV;
6062 		}
6063 		spin_unlock_irqrestore(&device_domain_lock, flags);
6064 		break;
6065 	default:
6066 		ret = -EINVAL;
6067 		break;
6068 	}
6069 
6070 	return ret;
6071 }
6072 
6073 /*
6074  * Check that the device does not live on an external-facing PCI port that is
6075  * marked as untrusted. Such devices should not be allowed to apply quirks and
6076  * thus bypass the IOMMU restrictions.
6077  */
6078 static bool risky_device(struct pci_dev *pdev)
6079 {
6080 	if (pdev->untrusted) {
6081 		pci_info(pdev,
6082 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6083 			 pdev->vendor, pdev->device);
6084 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6085 		return true;
6086 	}
6087 	return false;
6088 }
6089 
6090 const struct iommu_ops intel_iommu_ops = {
6091 	.capable		= intel_iommu_capable,
6092 	.domain_alloc		= intel_iommu_domain_alloc,
6093 	.domain_free		= intel_iommu_domain_free,
6094 	.domain_set_attr	= intel_iommu_domain_set_attr,
6095 	.attach_dev		= intel_iommu_attach_device,
6096 	.detach_dev		= intel_iommu_detach_device,
6097 	.aux_attach_dev		= intel_iommu_aux_attach_device,
6098 	.aux_detach_dev		= intel_iommu_aux_detach_device,
6099 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6100 	.map			= intel_iommu_map,
6101 	.unmap			= intel_iommu_unmap,
6102 	.iova_to_phys		= intel_iommu_iova_to_phys,
6103 	.probe_device		= intel_iommu_probe_device,
6104 	.probe_finalize		= intel_iommu_probe_finalize,
6105 	.release_device		= intel_iommu_release_device,
6106 	.get_resv_regions	= intel_iommu_get_resv_regions,
6107 	.put_resv_regions	= generic_iommu_put_resv_regions,
6108 	.apply_resv_region	= intel_iommu_apply_resv_region,
6109 	.device_group		= intel_iommu_device_group,
6110 	.dev_has_feat		= intel_iommu_dev_has_feat,
6111 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6112 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6113 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6114 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6115 	.def_domain_type	= device_def_domain_type,
6116 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6117 #ifdef CONFIG_INTEL_IOMMU_SVM
6118 	.cache_invalidate	= intel_iommu_sva_invalidate,
6119 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6120 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6121 	.sva_bind		= intel_svm_bind,
6122 	.sva_unbind		= intel_svm_unbind,
6123 	.sva_get_pasid		= intel_svm_get_pasid,
6124 	.page_response		= intel_svm_page_response,
6125 #endif
6126 };
6127 
6128 static void quirk_iommu_igfx(struct pci_dev *dev)
6129 {
6130 	if (risky_device(dev))
6131 		return;
6132 
6133 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6134 	dmar_map_gfx = 0;
6135 }
6136 
6137 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6138 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6139 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6145 
6146 /* Broadwell igfx malfunctions with dmar */
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6154 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6155 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6156 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6157 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6170 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6171 
6172 static void quirk_iommu_rwbf(struct pci_dev *dev)
6173 {
6174 	if (risky_device(dev))
6175 		return;
6176 
6177 	/*
6178 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6179 	 * but needs it. Same seems to hold for the desktop versions.
6180 	 */
6181 	pci_info(dev, "Forcing write-buffer flush capability\n");
6182 	rwbf_quirk = 1;
6183 }
6184 
6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6187 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6188 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6189 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6190 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6191 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6192 
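/*
 * GGC is the Graphics Control register read by the quirk below from the
 * affected host bridges; the *_VT encodings indicate that the BIOS
 * allocated shadow GTT space for the IOMMU.
 */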
6193 #define GGC 0x52
6194 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6195 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6196 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6197 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6198 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6199 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6200 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6201 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6202 
6203 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6204 {
6205 	unsigned short ggc;
6206 
6207 	if (risky_device(dev))
6208 		return;
6209 
6210 	if (pci_read_config_word(dev, GGC, &ggc))
6211 		return;
6212 
6213 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6214 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6215 		dmar_map_gfx = 0;
6216 	} else if (dmar_map_gfx) {
6217 		/* we have to ensure the gfx device is idle before we flush */
6218 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6219 		intel_iommu_strict = 1;
6220 	}
6221 }
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6226 
6227 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6228 {
6229 	unsigned short ver;
6230 
6231 	if (!IS_GFX_DEVICE(dev))
6232 		return;
6233 
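	/*
	 * The quirk keys off the high byte of the PCI device ID, which
	 * distinguishes the integrated graphics generations that need it.
	 */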
6234 	ver = (dev->device >> 8) & 0xff;
6235 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6236 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6237 	    ver != 0x9a)
6238 		return;
6239 
6240 	if (risky_device(dev))
6241 		return;
6242 
6243 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6244 	iommu_skip_te_disable = 1;
6245 }
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6247 
6248 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6249    ISOCH DMAR unit for the Azalia sound device, but not give it any
6250    TLB entries, which causes it to deadlock. Check for that.  We do
6251    this in a function called from init_dmars(), instead of in a PCI
6252    quirk, because we don't want to print the obnoxious "BIOS broken"
6253    message if VT-d is actually disabled.
6254 */
6255 static void __init check_tylersburg_isoch(void)
6256 {
6257 	struct pci_dev *pdev;
6258 	uint32_t vtisochctrl;
6259 
6260 	/* If there's no Azalia in the system anyway, forget it. */
6261 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6262 	if (!pdev)
6263 		return;
6264 
6265 	if (risky_device(pdev)) {
6266 		pci_dev_put(pdev);
6267 		return;
6268 	}
6269 
6270 	pci_dev_put(pdev);
6271 
6272 	/* System Management Registers. Might be hidden, in which case
6273 	   we can't do the sanity check. But that's OK, because the
6274 	   known-broken BIOSes _don't_ actually hide it, so far. */
6275 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6276 	if (!pdev)
6277 		return;
6278 
6279 	if (risky_device(pdev)) {
6280 		pci_dev_put(pdev);
6281 		return;
6282 	}
6283 
6284 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6285 		pci_dev_put(pdev);
6286 		return;
6287 	}
6288 
6289 	pci_dev_put(pdev);
6290 
6291 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6292 	if (vtisochctrl & 1)
6293 		return;
6294 
6295 	/* Drop all bits other than the number of TLB entries */
6296 	vtisochctrl &= 0x1c;
6297 
6298 	/* If we have the recommended number of TLB entries (16), fine. */
6299 	if (vtisochctrl == 0x10)
6300 		return;
6301 
6302 	/* Zero TLB entries? You get to ride the short bus to school. */
6303 	if (!vtisochctrl) {
6304 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6305 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6306 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6307 		     dmi_get_system_info(DMI_BIOS_VERSION),
6308 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6309 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6310 		return;
6311 	}
6312 
6313 	pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6314 	       vtisochctrl);
6315 }
6316