xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 2b1b1267080fe822789d0845a58ebb452724736b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/*
 * Message-format helpers: pr_fmt() prepends the literal "DMAR: " to a
 * format string, and dev_fmt() reuses the same prefix.
 */
#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "intel-pasid.h"
52 
/* Sizes of the root table and of a context table: one VT-d page each. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification predicates.  Every use of the macro argument
 * is parenthesized so that any pointer-valued expression can be passed.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
/* Matches vendor 0x8086, device 0x3a3e. */
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/*
 * Highest page frame number / highest address representable with a guest
 * address width of @gaw bits.  The argument is parenthesized so that
 * expressions such as "a | b" or "c ? x : y" expand correctly.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling: each level indexes LEVEL_STRIDE bits of the pfn */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB: every bit from bit 12
 * upwards is set in the mask below.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
105 
/* Convert an AGAW value into a page-table level count (agaw + 2). */
static inline int agaw_to_level(int agaw)
{
	int levels = 2 + agaw;

	return levels;
}
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
/* Mask that keeps the pfn bits at and above @level's index position. */
static inline unsigned long level_mask(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return ~0UL << shift;
}
135 
/* Number of pfns spanned by one entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return 1UL << shift;
}
140 
/* Round @pfn up to the next multiple of level_size(@level). */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long round = level_size(level) - 1;

	return (pfn + round) & level_mask(level);
}
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
/* VT-d page frame number of the memory described by @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
170 
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
/* NOTE(review): quirk flag; its users are not visible in this part of the file. */
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set by the "tboot_noforce" option of the intel_iommu= parameter. */
int intel_iommu_tboot_noforce;
/* Set by the "off" option of the intel_iommu= parameter. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
289 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
/* NOTE(review): initialised to 1; the code that clears it is not visible here. */
static int hw_pass_through = 1;
298 
/*
 * Iterate @idx over every iommu index in [0, g_num_of_iommus) for which
 * @domain holds a non-zero reference count.
 *
 * The expansion ends in a bare 'if', so the statement that follows the
 * macro is the loop body; do not attach an 'else' to it.  Both arguments
 * are parenthesized so arbitrary expressions can be passed.
 */
#define for_each_domain_iommu(idx, domain)			\
	for ((idx) = 0; (idx) < g_num_of_iommus; (idx)++)	\
		if ((domain)->iommu_refcnt[(idx)])
302 
/* One RMRR unit; instances are linked on dmar_rmrr_units through @list. */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
311 
/* One ATSR unit; the list head for these is dmar_atsr_units. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
319 
/* Global lists of ATSR and RMRR units. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

/* Walk every entry on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/*
 * Upper bound (exclusive) on iommu indices: used as the loop limit in
 * for_each_domain_iommu() and as the range check before indexing g_iommus.
 */
static int g_num_of_iommus;
328 
/* Forward declarations. */
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);
337 
/*
 * Build-time default for dmar_disabled; the "on" and "off" options of the
 * intel_iommu= parameter override it.
 */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

/* Build-time default for scalable mode; "sm_on" sets it to 1. */
#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;		/* cleared by "igfx_off" */
static int dmar_forcedac;		/* set by "forcedac" */
static int intel_iommu_strict;		/* set by "strict" */
static int intel_iommu_superpage = 1;	/* cleared by "sp_off" */
static int iommu_identity_mapping;
static int intel_no_bounce;		/* set by "nobounce" */

/* NOTE(review): presumably flag bits for iommu_identity_mapping; confirm. */
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
365 
/*
 * Sentinel values that may be found in dev->archdata.iommu instead of a
 * real device_domain_info pointer.  get_domain_info() reports both as
 * NULL; iommu_dummy() and attach_deferred() test for them individually.
 */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370 	struct device_domain_info *info;
371 
372 	if (!dev)
373 		return NULL;
374 
375 	info = dev->archdata.iommu;
376 	if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
377 		     info == DEFER_DEVICE_DOMAIN_INFO))
378 		return NULL;
379 
380 	return info;
381 }
382 
/* device_domain_lock is held while walking device_domain_list. */
DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/*
 * True for an untrusted PCI device, unless bounce buffering was turned
 * off with the "nobounce" option.
 */
#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)
388 
389 /*
390  * Iterate over elements in device_domain_list and call the specified
391  * callback @fn against each element.
392  */
393 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
394 				     void *data), void *data)
395 {
396 	int ret = 0;
397 	unsigned long flags;
398 	struct device_domain_info *info;
399 
400 	spin_lock_irqsave(&device_domain_lock, flags);
401 	list_for_each_entry(info, &device_domain_list, global) {
402 		ret = fn(info, data);
403 		if (ret) {
404 			spin_unlock_irqrestore(&device_domain_lock, flags);
405 			return ret;
406 		}
407 	}
408 	spin_unlock_irqrestore(&device_domain_lock, flags);
409 
410 	return 0;
411 }
412 
/* Declared here without an initializer; none is visible in this part of the file. */
const struct iommu_ops intel_iommu_ops;
414 
415 static bool translation_pre_enabled(struct intel_iommu *iommu)
416 {
417 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
418 }
419 
420 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
421 {
422 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
423 }
424 
425 static void init_translation_status(struct intel_iommu *iommu)
426 {
427 	u32 gsts;
428 
429 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
430 	if (gsts & DMA_GSTS_TES)
431 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
432 }
433 
434 static int __init intel_iommu_setup(char *str)
435 {
436 	if (!str)
437 		return -EINVAL;
438 	while (*str) {
439 		if (!strncmp(str, "on", 2)) {
440 			dmar_disabled = 0;
441 			pr_info("IOMMU enabled\n");
442 		} else if (!strncmp(str, "off", 3)) {
443 			dmar_disabled = 1;
444 			no_platform_optin = 1;
445 			pr_info("IOMMU disabled\n");
446 		} else if (!strncmp(str, "igfx_off", 8)) {
447 			dmar_map_gfx = 0;
448 			pr_info("Disable GFX device mapping\n");
449 		} else if (!strncmp(str, "forcedac", 8)) {
450 			pr_info("Forcing DAC for PCI devices\n");
451 			dmar_forcedac = 1;
452 		} else if (!strncmp(str, "strict", 6)) {
453 			pr_info("Disable batched IOTLB flush\n");
454 			intel_iommu_strict = 1;
455 		} else if (!strncmp(str, "sp_off", 6)) {
456 			pr_info("Disable supported super page\n");
457 			intel_iommu_superpage = 0;
458 		} else if (!strncmp(str, "sm_on", 5)) {
459 			pr_info("Intel-IOMMU: scalable mode supported\n");
460 			intel_iommu_sm = 1;
461 		} else if (!strncmp(str, "tboot_noforce", 13)) {
462 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463 			intel_iommu_tboot_noforce = 1;
464 		} else if (!strncmp(str, "nobounce", 8)) {
465 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
466 			intel_no_bounce = 1;
467 		}
468 
469 		str += strcspn(str, ",");
470 		while (*str == ',')
471 			str++;
472 	}
473 	return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
476 
/* Slab caches backing alloc_domain_mem() and alloc_devinfo_mem(). */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
479 
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482 	struct dmar_domain **domains;
483 	int idx = did >> 8;
484 
485 	domains = iommu->domains[idx];
486 	if (!domains)
487 		return NULL;
488 
489 	return domains[did & 0xff];
490 }
491 
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 			     struct dmar_domain *domain)
494 {
495 	struct dmar_domain **domains;
496 	int idx = did >> 8;
497 
498 	if (!iommu->domains[idx]) {
499 		size_t size = 256 * sizeof(struct dmar_domain *);
500 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501 	}
502 
503 	domains = iommu->domains[idx];
504 	if (WARN_ON(!domains))
505 		return;
506 	else
507 		domains[did & 0xff] = domain;
508 }
509 
510 void *alloc_pgtable_page(int node)
511 {
512 	struct page *page;
513 	void *vaddr = NULL;
514 
515 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516 	if (page)
517 		vaddr = page_address(page);
518 	return vaddr;
519 }
520 
/* Release a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
525 
526 static inline void *alloc_domain_mem(void)
527 {
528 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 }
530 
531 static void free_domain_mem(void *vaddr)
532 {
533 	kmem_cache_free(iommu_domain_cache, vaddr);
534 }
535 
536 static inline void * alloc_devinfo_mem(void)
537 {
538 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 }
540 
541 static inline void free_devinfo_mem(void *vaddr)
542 {
543 	kmem_cache_free(iommu_devinfo_cache, vaddr);
544 }
545 
546 static inline int domain_type_is_si(struct dmar_domain *domain)
547 {
548 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
549 }
550 
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 {
553 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
554 }
555 
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
557 				       unsigned long pfn)
558 {
559 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 
561 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
562 }
563 
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 {
566 	unsigned long sagaw;
567 	int agaw = -1;
568 
569 	sagaw = cap_sagaw(iommu->cap);
570 	for (agaw = width_to_agaw(max_gaw);
571 	     agaw >= 0; agaw--) {
572 		if (test_bit(agaw, &sagaw))
573 			break;
574 	}
575 
576 	return agaw;
577 }
578 
579 /*
580  * Calculate max SAGAW for each iommu.
581  */
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 {
584 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
585 }
586 
587 /*
588  * calculate agaw for each iommu.
589  * "SAGAW" may be different across iommus, use a default agaw, and
590  * get a supported less agaw for iommus that don't support the default agaw.
591  */
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 {
594 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
595 }
596 
597 /* This functionin only returns single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 {
600 	int iommu_id;
601 
602 	/* si_domain and vm domain should not get here. */
603 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
604 		return NULL;
605 
606 	for_each_domain_iommu(iommu_id, domain)
607 		break;
608 
609 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
610 		return NULL;
611 
612 	return g_iommus[iommu_id];
613 }
614 
615 static void domain_update_iommu_coherency(struct dmar_domain *domain)
616 {
617 	struct dmar_drhd_unit *drhd;
618 	struct intel_iommu *iommu;
619 	bool found = false;
620 	int i;
621 
622 	domain->iommu_coherency = 1;
623 
624 	for_each_domain_iommu(i, domain) {
625 		found = true;
626 		if (!ecap_coherent(g_iommus[i]->ecap)) {
627 			domain->iommu_coherency = 0;
628 			break;
629 		}
630 	}
631 	if (found)
632 		return;
633 
634 	/* No hardware attached; use lowest common denominator */
635 	rcu_read_lock();
636 	for_each_active_iommu(iommu, drhd) {
637 		if (!ecap_coherent(iommu->ecap)) {
638 			domain->iommu_coherency = 0;
639 			break;
640 		}
641 	}
642 	rcu_read_unlock();
643 }
644 
645 static int domain_update_iommu_snooping(struct intel_iommu *skip)
646 {
647 	struct dmar_drhd_unit *drhd;
648 	struct intel_iommu *iommu;
649 	int ret = 1;
650 
651 	rcu_read_lock();
652 	for_each_active_iommu(iommu, drhd) {
653 		if (iommu != skip) {
654 			if (!ecap_sc_support(iommu->ecap)) {
655 				ret = 0;
656 				break;
657 			}
658 		}
659 	}
660 	rcu_read_unlock();
661 
662 	return ret;
663 }
664 
665 static int domain_update_iommu_superpage(struct dmar_domain *domain,
666 					 struct intel_iommu *skip)
667 {
668 	struct dmar_drhd_unit *drhd;
669 	struct intel_iommu *iommu;
670 	int mask = 0x3;
671 
672 	if (!intel_iommu_superpage) {
673 		return 0;
674 	}
675 
676 	/* set iommu_superpage to the smallest common denominator */
677 	rcu_read_lock();
678 	for_each_active_iommu(iommu, drhd) {
679 		if (iommu != skip) {
680 			if (domain && domain_use_first_level(domain)) {
681 				if (!cap_fl1gp_support(iommu->cap))
682 					mask = 0x1;
683 			} else {
684 				mask &= cap_super_page_val(iommu->cap);
685 			}
686 
687 			if (!mask)
688 				break;
689 		}
690 	}
691 	rcu_read_unlock();
692 
693 	return fls(mask);
694 }
695 
696 /* Some capabilities may be different across iommus */
697 static void domain_update_iommu_cap(struct dmar_domain *domain)
698 {
699 	domain_update_iommu_coherency(domain);
700 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
701 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
702 }
703 
704 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
705 					 u8 devfn, int alloc)
706 {
707 	struct root_entry *root = &iommu->root_entry[bus];
708 	struct context_entry *context;
709 	u64 *entry;
710 
711 	entry = &root->lo;
712 	if (sm_supported(iommu)) {
713 		if (devfn >= 0x80) {
714 			devfn -= 0x80;
715 			entry = &root->hi;
716 		}
717 		devfn *= 2;
718 	}
719 	if (*entry & 1)
720 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
721 	else {
722 		unsigned long phy_addr;
723 		if (!alloc)
724 			return NULL;
725 
726 		context = alloc_pgtable_page(iommu->node);
727 		if (!context)
728 			return NULL;
729 
730 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
731 		phy_addr = virt_to_phys((void *)context);
732 		*entry = phy_addr | 1;
733 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
734 	}
735 	return &context[devfn];
736 }
737 
738 static int iommu_dummy(struct device *dev)
739 {
740 	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
741 }
742 
743 static bool attach_deferred(struct device *dev)
744 {
745 	return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
746 }
747 
748 /**
749  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750  *				 sub-hierarchy of a candidate PCI-PCI bridge
751  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752  * @bridge: the candidate PCI-PCI bridge
753  *
754  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
755  */
756 static bool
757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
758 {
759 	struct pci_dev *pdev, *pbridge;
760 
761 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
762 		return false;
763 
764 	pdev = to_pci_dev(dev);
765 	pbridge = to_pci_dev(bridge);
766 
767 	if (pbridge->subordinate &&
768 	    pbridge->subordinate->number <= pdev->bus->number &&
769 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
770 		return true;
771 
772 	return false;
773 }
774 
/*
 * Find the iommu responsible for @dev and report the bus/devfn to use
 * for it through @bus and @devfn.
 *
 * Returns NULL, leaving @bus and @devfn untouched, when @dev carries the
 * dummy sentinel or when no active DRHD unit matches.  The DRHD list is
 * walked under rcu_read_lock().
 */
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		/* For PCI devices only units on the same segment can match. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			/* A scope entry that is a bridge above @dev also matches. */
			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
			/* Also entered via goto from the scope loop above. */
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	/* Loop finished without a match. */
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}
837 
838 static void domain_flush_cache(struct dmar_domain *domain,
839 			       void *addr, int size)
840 {
841 	if (!domain->iommu_coherency)
842 		clflush_cache_range(addr, size);
843 }
844 
845 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
846 {
847 	struct context_entry *context;
848 	int ret = 0;
849 	unsigned long flags;
850 
851 	spin_lock_irqsave(&iommu->lock, flags);
852 	context = iommu_context_addr(iommu, bus, devfn, 0);
853 	if (context)
854 		ret = context_present(context);
855 	spin_unlock_irqrestore(&iommu->lock, flags);
856 	return ret;
857 }
858 
859 static void free_context_table(struct intel_iommu *iommu)
860 {
861 	int i;
862 	unsigned long flags;
863 	struct context_entry *context;
864 
865 	spin_lock_irqsave(&iommu->lock, flags);
866 	if (!iommu->root_entry) {
867 		goto out;
868 	}
869 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
870 		context = iommu_context_addr(iommu, i, 0, 0);
871 		if (context)
872 			free_pgtable_page(context);
873 
874 		if (!sm_supported(iommu))
875 			continue;
876 
877 		context = iommu_context_addr(iommu, i, 0x80, 0);
878 		if (context)
879 			free_pgtable_page(context);
880 
881 	}
882 	free_pgtable_page(iommu->root_entry);
883 	iommu->root_entry = NULL;
884 out:
885 	spin_unlock_irqrestore(&iommu->lock, flags);
886 }
887 
/*
 * Walk @domain's page table from the top level towards the pte for @pfn.
 *
 * @target_level is both input and output:
 *   - non-zero on entry: descend to that level, allocating any missing
 *     intermediate table on the way, and return the pte at that level;
 *   - zero on entry: nothing is allocated; stop at the first superpage
 *     or non-present pte and store the level reached in *@target_level.
 *
 * Returns NULL if @pfn is outside what the domain's address width
 * supports, or if a page-table page cannot be allocated.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup-only mode: stop at a superpage or a hole. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			/* Flush the new table before linking it into the parent. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD;
			/* Install only if the slot is still empty (value 0). */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	/* Report the level at which a lookup-only walk stopped. */
	if (!*target_level)
		*target_level = level;

	return pte;
}
943 
/*
 * Return the pte that maps @pfn at @level, without allocating anything.
 *
 * The walk starts at the top level and descends.  Outcomes:
 *   - the pte at @level is reached: it is returned and *@large_page is
 *     left unchanged;
 *   - a superpage pte is met at a higher level: that pte is returned and
 *     *@large_page is set to its level;
 *   - a non-present pte is met at a higher level: NULL is returned and
 *     *@large_page is set to the level at which the walk stopped.
 * NULL is also returned if @level is above the domain's top level.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		/* 'total' is the level currently being examined. */
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
975 
976 /* clear last level pte, a tlb flush should be followed */
977 static void dma_pte_clear_range(struct dmar_domain *domain,
978 				unsigned long start_pfn,
979 				unsigned long last_pfn)
980 {
981 	unsigned int large_page;
982 	struct dma_pte *first_pte, *pte;
983 
984 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
985 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
986 	BUG_ON(start_pfn > last_pfn);
987 
988 	/* we don't need lock here; nobody else touches the iova range */
989 	do {
990 		large_page = 1;
991 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
992 		if (!pte) {
993 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
994 			continue;
995 		}
996 		do {
997 			dma_clear_pte(pte);
998 			start_pfn += lvl_to_nr_pages(large_page);
999 			pte++;
1000 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1001 
1002 		domain_flush_cache(domain, first_pte,
1003 				   (void *)pte - (void *)first_pte);
1004 
1005 	} while (start_pfn && start_pfn <= last_pfn);
1006 }
1007 
/*
 * Recursive helper for dma_pte_free_pagetable().
 *
 * @pte points to the table at @level whose first entry corresponds to
 * @pfn.  For each present, non-superpage entry overlapping
 * [@start_pfn, @last_pfn], the helper first recurses into the table the
 * entry points to (while @level > 2), then frees that table and clears
 * the entry if @level is below @retain_level and the range covers the
 * whole table.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	/* Start at the first entry of this table that the range touches. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		/* First pfn covered by this entry, and the table it points to. */
		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1046 
1047 /*
1048  * clear last level (leaf) ptes and free page table pages below the
1049  * level we wish to keep intact.
1050  */
1051 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1052 				   unsigned long start_pfn,
1053 				   unsigned long last_pfn,
1054 				   int retain_level)
1055 {
1056 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058 	BUG_ON(start_pfn > last_pfn);
1059 
1060 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1061 
1062 	/* We don't need lock here; nobody else touches the iova range */
1063 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1064 			   domain->pgd, 0, start_pfn, last_pfn);
1065 
1066 	/* free pgd */
1067 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1068 		free_pgtable_page(domain->pgd);
1069 		domain->pgd = NULL;
1070 	}
1071 }
1072 
1073 /* When a page at a given level is being unlinked from its parent, we don't
1074    need to *modify* it at all. All we need to do is make a list of all the
1075    pages which can be freed just as soon as we've flushed the IOTLB and we
1076    know the hardware page-walk will no longer touch them.
1077    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1078    be freed. */
1079 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1080 					    int level, struct dma_pte *pte,
1081 					    struct page *freelist)
1082 {
1083 	struct page *pg;
1084 
1085 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1086 	pg->freelist = freelist;
1087 	freelist = pg;
1088 
1089 	if (level == 1)
1090 		return freelist;
1091 
1092 	pte = page_address(pg);
1093 	do {
1094 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1095 			freelist = dma_pte_list_pagetables(domain, level - 1,
1096 							   pte, freelist);
1097 		pte++;
1098 	} while (!first_pte_in_page(pte));
1099 
1100 	return freelist;
1101 }
1102 
1103 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1104 					struct dma_pte *pte, unsigned long pfn,
1105 					unsigned long start_pfn,
1106 					unsigned long last_pfn,
1107 					struct page *freelist)
1108 {
1109 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1110 
1111 	pfn = max(start_pfn, pfn);
1112 	pte = &pte[pfn_level_offset(pfn, level)];
1113 
1114 	do {
1115 		unsigned long level_pfn;
1116 
1117 		if (!dma_pte_present(pte))
1118 			goto next;
1119 
1120 		level_pfn = pfn & level_mask(level);
1121 
1122 		/* If range covers entire pagetable, free it */
1123 		if (start_pfn <= level_pfn &&
1124 		    last_pfn >= level_pfn + level_size(level) - 1) {
1125 			/* These suborbinate page tables are going away entirely. Don't
1126 			   bother to clear them; we're just going to *free* them. */
1127 			if (level > 1 && !dma_pte_superpage(pte))
1128 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1129 
1130 			dma_clear_pte(pte);
1131 			if (!first_pte)
1132 				first_pte = pte;
1133 			last_pte = pte;
1134 		} else if (level > 1) {
1135 			/* Recurse down into a level that isn't *entirely* obsolete */
1136 			freelist = dma_pte_clear_level(domain, level - 1,
1137 						       phys_to_virt(dma_pte_addr(pte)),
1138 						       level_pfn, start_pfn, last_pfn,
1139 						       freelist);
1140 		}
1141 next:
1142 		pfn += level_size(level);
1143 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144 
1145 	if (first_pte)
1146 		domain_flush_cache(domain, first_pte,
1147 				   (void *)++last_pte - (void *)first_pte);
1148 
1149 	return freelist;
1150 }
1151 
1152 /* We can't just free the pages because the IOMMU may still be walking
1153    the page tables, and may have cached the intermediate levels. The
1154    pages can only be freed after the IOTLB flush has been done. */
1155 static struct page *domain_unmap(struct dmar_domain *domain,
1156 				 unsigned long start_pfn,
1157 				 unsigned long last_pfn)
1158 {
1159 	struct page *freelist;
1160 
1161 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1162 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1163 	BUG_ON(start_pfn > last_pfn);
1164 
1165 	/* we don't need lock here; nobody else touches the iova range */
1166 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1167 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1168 
1169 	/* free pgd */
1170 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1171 		struct page *pgd_page = virt_to_page(domain->pgd);
1172 		pgd_page->freelist = freelist;
1173 		freelist = pgd_page;
1174 
1175 		domain->pgd = NULL;
1176 	}
1177 
1178 	return freelist;
1179 }
1180 
1181 static void dma_free_pagelist(struct page *freelist)
1182 {
1183 	struct page *pg;
1184 
1185 	while ((pg = freelist)) {
1186 		freelist = pg->freelist;
1187 		free_pgtable_page(page_address(pg));
1188 	}
1189 }
1190 
/* Callback form of dma_free_pagelist(): 'data' carries the list head. */
static void iova_entry_free(unsigned long data)
{
	dma_free_pagelist((struct page *)data);
}
1197 
1198 /* iommu handling */
1199 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1200 {
1201 	struct root_entry *root;
1202 	unsigned long flags;
1203 
1204 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1205 	if (!root) {
1206 		pr_err("Allocating root entry for %s failed\n",
1207 			iommu->name);
1208 		return -ENOMEM;
1209 	}
1210 
1211 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1212 
1213 	spin_lock_irqsave(&iommu->lock, flags);
1214 	iommu->root_entry = root;
1215 	spin_unlock_irqrestore(&iommu->lock, flags);
1216 
1217 	return 0;
1218 }
1219 
1220 static void iommu_set_root_entry(struct intel_iommu *iommu)
1221 {
1222 	u64 addr;
1223 	u32 sts;
1224 	unsigned long flag;
1225 
1226 	addr = virt_to_phys(iommu->root_entry);
1227 	if (sm_supported(iommu))
1228 		addr |= DMA_RTADDR_SMT;
1229 
1230 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1232 
1233 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1234 
1235 	/* Make sure hardware complete it */
1236 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1237 		      readl, (sts & DMA_GSTS_RTPS), sts);
1238 
1239 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1240 }
1241 
1242 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1243 {
1244 	u32 val;
1245 	unsigned long flag;
1246 
1247 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1248 		return;
1249 
1250 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1251 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1252 
1253 	/* Make sure hardware complete it */
1254 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1255 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1256 
1257 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1258 }
1259 
1260 /* return value determine if we need a write buffer flush */
1261 static void __iommu_flush_context(struct intel_iommu *iommu,
1262 				  u16 did, u16 source_id, u8 function_mask,
1263 				  u64 type)
1264 {
1265 	u64 val = 0;
1266 	unsigned long flag;
1267 
1268 	switch (type) {
1269 	case DMA_CCMD_GLOBAL_INVL:
1270 		val = DMA_CCMD_GLOBAL_INVL;
1271 		break;
1272 	case DMA_CCMD_DOMAIN_INVL:
1273 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1274 		break;
1275 	case DMA_CCMD_DEVICE_INVL:
1276 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1277 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1278 		break;
1279 	default:
1280 		BUG();
1281 	}
1282 	val |= DMA_CCMD_ICC;
1283 
1284 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286 
1287 	/* Make sure hardware complete it */
1288 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290 
1291 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293 
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 				u64 addr, unsigned int size_order, u64 type)
1297 {
1298 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 	u64 val = 0, val_iva = 0;
1300 	unsigned long flag;
1301 
1302 	switch (type) {
1303 	case DMA_TLB_GLOBAL_FLUSH:
1304 		/* global flush doesn't need set IVA_REG */
1305 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306 		break;
1307 	case DMA_TLB_DSI_FLUSH:
1308 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 		break;
1310 	case DMA_TLB_PSI_FLUSH:
1311 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312 		/* IH bit is passed in as part of address */
1313 		val_iva = size_order | addr;
1314 		break;
1315 	default:
1316 		BUG();
1317 	}
1318 	/* Note: set drain read/write */
1319 #if 0
1320 	/*
1321 	 * This is probably to be super secure.. Looks like we can
1322 	 * ignore it without any impact.
1323 	 */
1324 	if (cap_read_drain(iommu->cap))
1325 		val |= DMA_TLB_READ_DRAIN;
1326 #endif
1327 	if (cap_write_drain(iommu->cap))
1328 		val |= DMA_TLB_WRITE_DRAIN;
1329 
1330 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1331 	/* Note: Only uses first TLB reg currently */
1332 	if (val_iva)
1333 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1334 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1335 
1336 	/* Make sure hardware complete it */
1337 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1338 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1339 
1340 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1341 
1342 	/* check IOTLB invalidation granularity */
1343 	if (DMA_TLB_IAIG(val) == 0)
1344 		pr_err("Flush IOTLB failed\n");
1345 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1346 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1347 			(unsigned long long)DMA_TLB_IIRG(type),
1348 			(unsigned long long)DMA_TLB_IAIG(val));
1349 }
1350 
1351 static struct device_domain_info *
1352 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1353 			 u8 bus, u8 devfn)
1354 {
1355 	struct device_domain_info *info;
1356 
1357 	assert_spin_locked(&device_domain_lock);
1358 
1359 	if (!iommu->qi)
1360 		return NULL;
1361 
1362 	list_for_each_entry(info, &domain->devices, link)
1363 		if (info->iommu == iommu && info->bus == bus &&
1364 		    info->devfn == devfn) {
1365 			if (info->ats_supported && info->dev)
1366 				return info;
1367 			break;
1368 		}
1369 
1370 	return NULL;
1371 }
1372 
1373 static void domain_update_iotlb(struct dmar_domain *domain)
1374 {
1375 	struct device_domain_info *info;
1376 	bool has_iotlb_device = false;
1377 
1378 	assert_spin_locked(&device_domain_lock);
1379 
1380 	list_for_each_entry(info, &domain->devices, link) {
1381 		struct pci_dev *pdev;
1382 
1383 		if (!info->dev || !dev_is_pci(info->dev))
1384 			continue;
1385 
1386 		pdev = to_pci_dev(info->dev);
1387 		if (pdev->ats_enabled) {
1388 			has_iotlb_device = true;
1389 			break;
1390 		}
1391 	}
1392 
1393 	domain->has_iotlb_device = has_iotlb_device;
1394 }
1395 
1396 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1397 {
1398 	struct pci_dev *pdev;
1399 
1400 	assert_spin_locked(&device_domain_lock);
1401 
1402 	if (!info || !dev_is_pci(info->dev))
1403 		return;
1404 
1405 	pdev = to_pci_dev(info->dev);
1406 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1407 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1408 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1409 	 * reserved, which should be set to 0.
1410 	 */
1411 	if (!ecap_dit(info->iommu->ecap))
1412 		info->pfsid = 0;
1413 	else {
1414 		struct pci_dev *pf_pdev;
1415 
1416 		/* pdev will be returned if device is not a vf */
1417 		pf_pdev = pci_physfn(pdev);
1418 		info->pfsid = pci_dev_id(pf_pdev);
1419 	}
1420 
1421 #ifdef CONFIG_INTEL_IOMMU_SVM
1422 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1423 	   the device if you enable PASID support after ATS support is
1424 	   undefined. So always enable PASID support on devices which
1425 	   have it, even if we can't yet know if we're ever going to
1426 	   use it. */
1427 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1428 		info->pasid_enabled = 1;
1429 
1430 	if (info->pri_supported &&
1431 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1432 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1433 		info->pri_enabled = 1;
1434 #endif
1435 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1436 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1437 		info->ats_enabled = 1;
1438 		domain_update_iotlb(info->domain);
1439 		info->ats_qdep = pci_ats_queue_depth(pdev);
1440 	}
1441 }
1442 
1443 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1444 {
1445 	struct pci_dev *pdev;
1446 
1447 	assert_spin_locked(&device_domain_lock);
1448 
1449 	if (!dev_is_pci(info->dev))
1450 		return;
1451 
1452 	pdev = to_pci_dev(info->dev);
1453 
1454 	if (info->ats_enabled) {
1455 		pci_disable_ats(pdev);
1456 		info->ats_enabled = 0;
1457 		domain_update_iotlb(info->domain);
1458 	}
1459 #ifdef CONFIG_INTEL_IOMMU_SVM
1460 	if (info->pri_enabled) {
1461 		pci_disable_pri(pdev);
1462 		info->pri_enabled = 0;
1463 	}
1464 	if (info->pasid_enabled) {
1465 		pci_disable_pasid(pdev);
1466 		info->pasid_enabled = 0;
1467 	}
1468 #endif
1469 }
1470 
1471 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1472 				  u64 addr, unsigned mask)
1473 {
1474 	u16 sid, qdep;
1475 	unsigned long flags;
1476 	struct device_domain_info *info;
1477 
1478 	if (!domain->has_iotlb_device)
1479 		return;
1480 
1481 	spin_lock_irqsave(&device_domain_lock, flags);
1482 	list_for_each_entry(info, &domain->devices, link) {
1483 		if (!info->ats_enabled)
1484 			continue;
1485 
1486 		sid = info->bus << 8 | info->devfn;
1487 		qdep = info->ats_qdep;
1488 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1489 				qdep, addr, mask);
1490 	}
1491 	spin_unlock_irqrestore(&device_domain_lock, flags);
1492 }
1493 
1494 static void domain_flush_piotlb(struct intel_iommu *iommu,
1495 				struct dmar_domain *domain,
1496 				u64 addr, unsigned long npages, bool ih)
1497 {
1498 	u16 did = domain->iommu_did[iommu->seq_id];
1499 
1500 	if (domain->default_pasid)
1501 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1502 				addr, npages, ih);
1503 
1504 	if (!list_empty(&domain->devices))
1505 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1506 }
1507 
1508 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1509 				  struct dmar_domain *domain,
1510 				  unsigned long pfn, unsigned int pages,
1511 				  int ih, int map)
1512 {
1513 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1514 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1515 	u16 did = domain->iommu_did[iommu->seq_id];
1516 
1517 	BUG_ON(pages == 0);
1518 
1519 	if (ih)
1520 		ih = 1 << 6;
1521 
1522 	if (domain_use_first_level(domain)) {
1523 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1524 	} else {
1525 		/*
1526 		 * Fallback to domain selective flush if no PSI support or
1527 		 * the size is too big. PSI requires page size to be 2 ^ x,
1528 		 * and the base address is naturally aligned to the size.
1529 		 */
1530 		if (!cap_pgsel_inv(iommu->cap) ||
1531 		    mask > cap_max_amask_val(iommu->cap))
1532 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1533 							DMA_TLB_DSI_FLUSH);
1534 		else
1535 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1536 							DMA_TLB_PSI_FLUSH);
1537 	}
1538 
1539 	/*
1540 	 * In caching mode, changes of pages from non-present to present require
1541 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1542 	 */
1543 	if (!cap_caching_mode(iommu->cap) || !map)
1544 		iommu_flush_dev_iotlb(domain, addr, mask);
1545 }
1546 
1547 /* Notification for newly created mappings */
1548 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1549 					struct dmar_domain *domain,
1550 					unsigned long pfn, unsigned int pages)
1551 {
1552 	/*
1553 	 * It's a non-present to present mapping. Only flush if caching mode
1554 	 * and second level.
1555 	 */
1556 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1557 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1558 	else
1559 		iommu_flush_write_buffer(iommu);
1560 }
1561 
1562 static void iommu_flush_iova(struct iova_domain *iovad)
1563 {
1564 	struct dmar_domain *domain;
1565 	int idx;
1566 
1567 	domain = container_of(iovad, struct dmar_domain, iovad);
1568 
1569 	for_each_domain_iommu(idx, domain) {
1570 		struct intel_iommu *iommu = g_iommus[idx];
1571 		u16 did = domain->iommu_did[iommu->seq_id];
1572 
1573 		if (domain_use_first_level(domain))
1574 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1575 		else
1576 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1577 						 DMA_TLB_DSI_FLUSH);
1578 
1579 		if (!cap_caching_mode(iommu->cap))
1580 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1581 					      0, MAX_AGAW_PFN_WIDTH);
1582 	}
1583 }
1584 
1585 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1586 {
1587 	u32 pmen;
1588 	unsigned long flags;
1589 
1590 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1591 		return;
1592 
1593 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1594 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1595 	pmen &= ~DMA_PMEN_EPM;
1596 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1597 
1598 	/* wait for the protected region status bit to clear */
1599 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1600 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1601 
1602 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1603 }
1604 
1605 static void iommu_enable_translation(struct intel_iommu *iommu)
1606 {
1607 	u32 sts;
1608 	unsigned long flags;
1609 
1610 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1611 	iommu->gcmd |= DMA_GCMD_TE;
1612 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1613 
1614 	/* Make sure hardware complete it */
1615 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1616 		      readl, (sts & DMA_GSTS_TES), sts);
1617 
1618 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1619 }
1620 
1621 static void iommu_disable_translation(struct intel_iommu *iommu)
1622 {
1623 	u32 sts;
1624 	unsigned long flag;
1625 
1626 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1627 	iommu->gcmd &= ~DMA_GCMD_TE;
1628 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1629 
1630 	/* Make sure hardware complete it */
1631 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1632 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1633 
1634 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1635 }
1636 
1637 static int iommu_init_domains(struct intel_iommu *iommu)
1638 {
1639 	u32 ndomains, nlongs;
1640 	size_t size;
1641 
1642 	ndomains = cap_ndoms(iommu->cap);
1643 	pr_debug("%s: Number of Domains supported <%d>\n",
1644 		 iommu->name, ndomains);
1645 	nlongs = BITS_TO_LONGS(ndomains);
1646 
1647 	spin_lock_init(&iommu->lock);
1648 
1649 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1650 	if (!iommu->domain_ids) {
1651 		pr_err("%s: Allocating domain id array failed\n",
1652 		       iommu->name);
1653 		return -ENOMEM;
1654 	}
1655 
1656 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1657 	iommu->domains = kzalloc(size, GFP_KERNEL);
1658 
1659 	if (iommu->domains) {
1660 		size = 256 * sizeof(struct dmar_domain *);
1661 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1662 	}
1663 
1664 	if (!iommu->domains || !iommu->domains[0]) {
1665 		pr_err("%s: Allocating domain array failed\n",
1666 		       iommu->name);
1667 		kfree(iommu->domain_ids);
1668 		kfree(iommu->domains);
1669 		iommu->domain_ids = NULL;
1670 		iommu->domains    = NULL;
1671 		return -ENOMEM;
1672 	}
1673 
1674 	/*
1675 	 * If Caching mode is set, then invalid translations are tagged
1676 	 * with domain-id 0, hence we need to pre-allocate it. We also
1677 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1678 	 * make sure it is not used for a real domain.
1679 	 */
1680 	set_bit(0, iommu->domain_ids);
1681 
1682 	/*
1683 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1684 	 * entry for first-level or pass-through translation modes should
1685 	 * be programmed with a domain id different from those used for
1686 	 * second-level or nested translation. We reserve a domain id for
1687 	 * this purpose.
1688 	 */
1689 	if (sm_supported(iommu))
1690 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1691 
1692 	return 0;
1693 }
1694 
1695 static void disable_dmar_iommu(struct intel_iommu *iommu)
1696 {
1697 	struct device_domain_info *info, *tmp;
1698 	unsigned long flags;
1699 
1700 	if (!iommu->domains || !iommu->domain_ids)
1701 		return;
1702 
1703 	spin_lock_irqsave(&device_domain_lock, flags);
1704 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1705 		if (info->iommu != iommu)
1706 			continue;
1707 
1708 		if (!info->dev || !info->domain)
1709 			continue;
1710 
1711 		__dmar_remove_one_dev_info(info);
1712 	}
1713 	spin_unlock_irqrestore(&device_domain_lock, flags);
1714 
1715 	if (iommu->gcmd & DMA_GCMD_TE)
1716 		iommu_disable_translation(iommu);
1717 }
1718 
1719 static void free_dmar_iommu(struct intel_iommu *iommu)
1720 {
1721 	if ((iommu->domains) && (iommu->domain_ids)) {
1722 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1723 		int i;
1724 
1725 		for (i = 0; i < elems; i++)
1726 			kfree(iommu->domains[i]);
1727 		kfree(iommu->domains);
1728 		kfree(iommu->domain_ids);
1729 		iommu->domains = NULL;
1730 		iommu->domain_ids = NULL;
1731 	}
1732 
1733 	g_iommus[iommu->seq_id] = NULL;
1734 
1735 	/* free context mapping */
1736 	free_context_table(iommu);
1737 
1738 #ifdef CONFIG_INTEL_IOMMU_SVM
1739 	if (pasid_supported(iommu)) {
1740 		if (ecap_prs(iommu->ecap))
1741 			intel_svm_finish_prq(iommu);
1742 	}
1743 	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1744 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1745 
1746 #endif
1747 }
1748 
1749 /*
1750  * Check and return whether first level is used by default for
1751  * DMA translation.
1752  */
1753 static bool first_level_by_default(void)
1754 {
1755 	struct dmar_drhd_unit *drhd;
1756 	struct intel_iommu *iommu;
1757 	static int first_level_support = -1;
1758 
1759 	if (likely(first_level_support != -1))
1760 		return first_level_support;
1761 
1762 	first_level_support = 1;
1763 
1764 	rcu_read_lock();
1765 	for_each_active_iommu(iommu, drhd) {
1766 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1767 			first_level_support = 0;
1768 			break;
1769 		}
1770 	}
1771 	rcu_read_unlock();
1772 
1773 	return first_level_support;
1774 }
1775 
1776 static struct dmar_domain *alloc_domain(int flags)
1777 {
1778 	struct dmar_domain *domain;
1779 
1780 	domain = alloc_domain_mem();
1781 	if (!domain)
1782 		return NULL;
1783 
1784 	memset(domain, 0, sizeof(*domain));
1785 	domain->nid = NUMA_NO_NODE;
1786 	domain->flags = flags;
1787 	if (first_level_by_default())
1788 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1789 	domain->has_iotlb_device = false;
1790 	INIT_LIST_HEAD(&domain->devices);
1791 
1792 	return domain;
1793 }
1794 
1795 /* Must be called with iommu->lock */
1796 static int domain_attach_iommu(struct dmar_domain *domain,
1797 			       struct intel_iommu *iommu)
1798 {
1799 	unsigned long ndomains;
1800 	int num;
1801 
1802 	assert_spin_locked(&device_domain_lock);
1803 	assert_spin_locked(&iommu->lock);
1804 
1805 	domain->iommu_refcnt[iommu->seq_id] += 1;
1806 	domain->iommu_count += 1;
1807 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1808 		ndomains = cap_ndoms(iommu->cap);
1809 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1810 
1811 		if (num >= ndomains) {
1812 			pr_err("%s: No free domain ids\n", iommu->name);
1813 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1814 			domain->iommu_count -= 1;
1815 			return -ENOSPC;
1816 		}
1817 
1818 		set_bit(num, iommu->domain_ids);
1819 		set_iommu_domain(iommu, num, domain);
1820 
1821 		domain->iommu_did[iommu->seq_id] = num;
1822 		domain->nid			 = iommu->node;
1823 
1824 		domain_update_iommu_cap(domain);
1825 	}
1826 
1827 	return 0;
1828 }
1829 
1830 static int domain_detach_iommu(struct dmar_domain *domain,
1831 			       struct intel_iommu *iommu)
1832 {
1833 	int num, count;
1834 
1835 	assert_spin_locked(&device_domain_lock);
1836 	assert_spin_locked(&iommu->lock);
1837 
1838 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1839 	count = --domain->iommu_count;
1840 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1841 		num = domain->iommu_did[iommu->seq_id];
1842 		clear_bit(num, iommu->domain_ids);
1843 		set_iommu_domain(iommu, num, NULL);
1844 
1845 		domain_update_iommu_cap(domain);
1846 		domain->iommu_did[iommu->seq_id] = 0;
1847 	}
1848 
1849 	return count;
1850 }
1851 
1852 static struct iova_domain reserved_iova_list;
1853 static struct lock_class_key reserved_rbtree_key;
1854 
1855 static int dmar_init_reserved_ranges(void)
1856 {
1857 	struct pci_dev *pdev = NULL;
1858 	struct iova *iova;
1859 	int i;
1860 
1861 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1862 
1863 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1864 		&reserved_rbtree_key);
1865 
1866 	/* IOAPIC ranges shouldn't be accessed by DMA */
1867 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1868 		IOVA_PFN(IOAPIC_RANGE_END));
1869 	if (!iova) {
1870 		pr_err("Reserve IOAPIC range failed\n");
1871 		return -ENODEV;
1872 	}
1873 
1874 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1875 	for_each_pci_dev(pdev) {
1876 		struct resource *r;
1877 
1878 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1879 			r = &pdev->resource[i];
1880 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1881 				continue;
1882 			iova = reserve_iova(&reserved_iova_list,
1883 					    IOVA_PFN(r->start),
1884 					    IOVA_PFN(r->end));
1885 			if (!iova) {
1886 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1887 				return -ENODEV;
1888 			}
1889 		}
1890 	}
1891 	return 0;
1892 }
1893 
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1907 
1908 static void domain_exit(struct dmar_domain *domain)
1909 {
1910 
1911 	/* Remove associated devices and clear attached or cached domains */
1912 	domain_remove_dev_info(domain);
1913 
1914 	/* destroy iovas */
1915 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1916 		put_iova_domain(&domain->iovad);
1917 
1918 	if (domain->pgd) {
1919 		struct page *freelist;
1920 
1921 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1922 		dma_free_pagelist(freelist);
1923 	}
1924 
1925 	free_domain_mem(domain);
1926 }
1927 
1928 /*
1929  * Get the PASID directory size for scalable mode context entry.
1930  * Value of X in the PDTS field of a scalable mode context entry
1931  * indicates PASID directory with 2^(X + 7) entries.
1932  */
1933 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1934 {
1935 	int pds, max_pde;
1936 
1937 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1938 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1939 	if (pds < 7)
1940 		return 0;
1941 
1942 	return pds - 7;
1943 }
1944 
1945 /*
1946  * Set the RID_PASID field of a scalable mode context entry. The
1947  * IOMMU hardware will use the PASID value set in this field for
1948  * DMA translations of DMA requests without PASID.
1949  */
1950 static inline void
1951 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1952 {
1953 	context->hi |= pasid & ((1 << 20) - 1);
1954 	context->hi |= (1 << 20);
1955 }
1956 
1957 /*
1958  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1959  * entry.
1960  */
1961 static inline void context_set_sm_dte(struct context_entry *context)
1962 {
1963 	context->lo |= (1 << 2);
1964 }
1965 
1966 /*
1967  * Set the PRE(Page Request Enable) field of a scalable mode context
1968  * entry.
1969  */
1970 static inline void context_set_sm_pre(struct context_entry *context)
1971 {
1972 	context->lo |= (1 << 4);
1973 }
1974 
/*
 * Convert a PASID directory size value to its context-entry field
 * coding: the low three bits of the value, placed at bit position 9.
 */
#define context_pdts(pds)	((0x7 & (pds)) << 9)
1977 
1978 static int domain_context_mapping_one(struct dmar_domain *domain,
1979 				      struct intel_iommu *iommu,
1980 				      struct pasid_table *table,
1981 				      u8 bus, u8 devfn)
1982 {
1983 	u16 did = domain->iommu_did[iommu->seq_id];
1984 	int translation = CONTEXT_TT_MULTI_LEVEL;
1985 	struct device_domain_info *info = NULL;
1986 	struct context_entry *context;
1987 	unsigned long flags;
1988 	int ret;
1989 
1990 	WARN_ON(did == 0);
1991 
1992 	if (hw_pass_through && domain_type_is_si(domain))
1993 		translation = CONTEXT_TT_PASS_THROUGH;
1994 
1995 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1996 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1997 
1998 	BUG_ON(!domain->pgd);
1999 
2000 	spin_lock_irqsave(&device_domain_lock, flags);
2001 	spin_lock(&iommu->lock);
2002 
2003 	ret = -ENOMEM;
2004 	context = iommu_context_addr(iommu, bus, devfn, 1);
2005 	if (!context)
2006 		goto out_unlock;
2007 
2008 	ret = 0;
2009 	if (context_present(context))
2010 		goto out_unlock;
2011 
2012 	/*
2013 	 * For kdump cases, old valid entries may be cached due to the
2014 	 * in-flight DMA and copied pgtable, but there is no unmapping
2015 	 * behaviour for them, thus we need an explicit cache flush for
2016 	 * the newly-mapped device. For kdump, at this point, the device
2017 	 * is supposed to finish reset at its driver probe stage, so no
2018 	 * in-flight DMA will exist, and we don't need to worry anymore
2019 	 * hereafter.
2020 	 */
2021 	if (context_copied(context)) {
2022 		u16 did_old = context_domain_id(context);
2023 
2024 		if (did_old < cap_ndoms(iommu->cap)) {
2025 			iommu->flush.flush_context(iommu, did_old,
2026 						   (((u16)bus) << 8) | devfn,
2027 						   DMA_CCMD_MASK_NOBIT,
2028 						   DMA_CCMD_DEVICE_INVL);
2029 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2030 						 DMA_TLB_DSI_FLUSH);
2031 		}
2032 	}
2033 
2034 	context_clear_entry(context);
2035 
2036 	if (sm_supported(iommu)) {
2037 		unsigned long pds;
2038 
2039 		WARN_ON(!table);
2040 
2041 		/* Setup the PASID DIR pointer: */
2042 		pds = context_get_sm_pds(table);
2043 		context->lo = (u64)virt_to_phys(table->table) |
2044 				context_pdts(pds);
2045 
2046 		/* Setup the RID_PASID field: */
2047 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2048 
2049 		/*
2050 		 * Setup the Device-TLB enable bit and Page request
2051 		 * Enable bit:
2052 		 */
2053 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2054 		if (info && info->ats_supported)
2055 			context_set_sm_dte(context);
2056 		if (info && info->pri_supported)
2057 			context_set_sm_pre(context);
2058 	} else {
2059 		struct dma_pte *pgd = domain->pgd;
2060 		int agaw;
2061 
2062 		context_set_domain_id(context, did);
2063 
2064 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2065 			/*
2066 			 * Skip top levels of page tables for iommu which has
2067 			 * less agaw than default. Unnecessary for PT mode.
2068 			 */
2069 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2070 				ret = -ENOMEM;
2071 				pgd = phys_to_virt(dma_pte_addr(pgd));
2072 				if (!dma_pte_present(pgd))
2073 					goto out_unlock;
2074 			}
2075 
2076 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2077 			if (info && info->ats_supported)
2078 				translation = CONTEXT_TT_DEV_IOTLB;
2079 			else
2080 				translation = CONTEXT_TT_MULTI_LEVEL;
2081 
2082 			context_set_address_root(context, virt_to_phys(pgd));
2083 			context_set_address_width(context, agaw);
2084 		} else {
2085 			/*
2086 			 * In pass through mode, AW must be programmed to
2087 			 * indicate the largest AGAW value supported by
2088 			 * hardware. And ASR is ignored by hardware.
2089 			 */
2090 			context_set_address_width(context, iommu->msagaw);
2091 		}
2092 
2093 		context_set_translation_type(context, translation);
2094 	}
2095 
2096 	context_set_fault_enable(context);
2097 	context_set_present(context);
2098 	domain_flush_cache(domain, context, sizeof(*context));
2099 
2100 	/*
2101 	 * It's a non-present to present mapping. If hardware doesn't cache
2102 	 * non-present entry we only need to flush the write-buffer. If the
2103 	 * _does_ cache non-present entries, then it does so in the special
2104 	 * domain #0, which we have to flush:
2105 	 */
2106 	if (cap_caching_mode(iommu->cap)) {
2107 		iommu->flush.flush_context(iommu, 0,
2108 					   (((u16)bus) << 8) | devfn,
2109 					   DMA_CCMD_MASK_NOBIT,
2110 					   DMA_CCMD_DEVICE_INVL);
2111 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2112 	} else {
2113 		iommu_flush_write_buffer(iommu);
2114 	}
2115 	iommu_enable_dev_iotlb(info);
2116 
2117 	ret = 0;
2118 
2119 out_unlock:
2120 	spin_unlock(&iommu->lock);
2121 	spin_unlock_irqrestore(&device_domain_lock, flags);
2122 
2123 	return ret;
2124 }
2125 
2126 struct domain_context_mapping_data {
2127 	struct dmar_domain *domain;
2128 	struct intel_iommu *iommu;
2129 	struct pasid_table *table;
2130 };
2131 
2132 static int domain_context_mapping_cb(struct pci_dev *pdev,
2133 				     u16 alias, void *opaque)
2134 {
2135 	struct domain_context_mapping_data *data = opaque;
2136 
2137 	return domain_context_mapping_one(data->domain, data->iommu,
2138 					  data->table, PCI_BUS_NUM(alias),
2139 					  alias & 0xff);
2140 }
2141 
2142 static int
2143 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2144 {
2145 	struct domain_context_mapping_data data;
2146 	struct pasid_table *table;
2147 	struct intel_iommu *iommu;
2148 	u8 bus, devfn;
2149 
2150 	iommu = device_to_iommu(dev, &bus, &devfn);
2151 	if (!iommu)
2152 		return -ENODEV;
2153 
2154 	table = intel_pasid_get_table(dev);
2155 
2156 	if (!dev_is_pci(dev))
2157 		return domain_context_mapping_one(domain, iommu, table,
2158 						  bus, devfn);
2159 
2160 	data.domain = domain;
2161 	data.iommu = iommu;
2162 	data.table = table;
2163 
2164 	return pci_for_each_dma_alias(to_pci_dev(dev),
2165 				      &domain_context_mapping_cb, &data);
2166 }
2167 
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169 				    u16 alias, void *opaque)
2170 {
2171 	struct intel_iommu *iommu = opaque;
2172 
2173 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174 }
2175 
2176 static int domain_context_mapped(struct device *dev)
2177 {
2178 	struct intel_iommu *iommu;
2179 	u8 bus, devfn;
2180 
2181 	iommu = device_to_iommu(dev, &bus, &devfn);
2182 	if (!iommu)
2183 		return -ENODEV;
2184 
2185 	if (!dev_is_pci(dev))
2186 		return device_context_mapped(iommu, bus, devfn);
2187 
2188 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2189 				       domain_context_mapped_cb, iommu);
2190 }
2191 
2192 /* Returns a number of VTD pages, but aligned to MM page size */
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194 					    size_t size)
2195 {
2196 	host_addr &= ~PAGE_MASK;
2197 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198 }
2199 
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202 					  unsigned long iov_pfn,
2203 					  unsigned long phy_pfn,
2204 					  unsigned long pages)
2205 {
2206 	int support, level = 1;
2207 	unsigned long pfnmerge;
2208 
2209 	support = domain->iommu_superpage;
2210 
2211 	/* To use a large page, the virtual *and* physical addresses
2212 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213 	   of them will mean we have to use smaller pages. So just
2214 	   merge them and check both at once. */
2215 	pfnmerge = iov_pfn | phy_pfn;
2216 
2217 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218 		pages >>= VTD_STRIDE_SHIFT;
2219 		if (!pages)
2220 			break;
2221 		pfnmerge >>= VTD_STRIDE_SHIFT;
2222 		level++;
2223 		support--;
2224 	}
2225 	return level;
2226 }
2227 
/*
 * Write the page-table entries that map @nr_pages VT-d pages starting at
 * IOVA page frame @iov_pfn in @domain.
 *
 * The physical side comes from @sg when it is non-NULL (one scatterlist
 * entry at a time, also filling in sg->dma_address and sg->dma_length), and
 * otherwise from the contiguous range starting at @phys_pfn.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE.
 *
 * Return: 0 on success, -EINVAL if @prot grants neither read nor write,
 * -ENOMEM if pfn_to_dma_pte() cannot provide a PTE.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	/* VT-d pages still to be mapped from the current source chunk */
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	u64 attr;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	/* Only these protection bits are carried into the PTE value. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	if (domain_use_first_level(domain))
		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;

	/* No scatterlist: the whole request is one contiguous chunk. */
	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		/* Current chunk exhausted: start on the scatterlist entry. */
		if (!sg_res) {
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | attr;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		/* (Re)locate the PTE and pick the page size for this run. */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		/* Install the entry only if the slot is currently zero. */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* Dump the DMA debug mappings for the first few hits only. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		/* Advance every cursor by the size of the page just mapped. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		/* Move to the next scatterlist entry only if work remains. */
		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
2345 
2346 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2347 			  struct scatterlist *sg, unsigned long phys_pfn,
2348 			  unsigned long nr_pages, int prot)
2349 {
2350 	int iommu_id, ret;
2351 	struct intel_iommu *iommu;
2352 
2353 	/* Do the real mapping first */
2354 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2355 	if (ret)
2356 		return ret;
2357 
2358 	for_each_domain_iommu(iommu_id, domain) {
2359 		iommu = g_iommus[iommu_id];
2360 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2361 	}
2362 
2363 	return 0;
2364 }
2365 
/* Map a scatterlist: domain_mapping() with @sg set and phys_pfn of 0. */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	const unsigned long no_phys_pfn = 0;

	return domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages,
			      prot);
}
2372 
2373 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2374 				     unsigned long phys_pfn, unsigned long nr_pages,
2375 				     int prot)
2376 {
2377 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2378 }
2379 
2380 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2381 {
2382 	unsigned long flags;
2383 	struct context_entry *context;
2384 	u16 did_old;
2385 
2386 	if (!iommu)
2387 		return;
2388 
2389 	spin_lock_irqsave(&iommu->lock, flags);
2390 	context = iommu_context_addr(iommu, bus, devfn, 0);
2391 	if (!context) {
2392 		spin_unlock_irqrestore(&iommu->lock, flags);
2393 		return;
2394 	}
2395 	did_old = context_domain_id(context);
2396 	context_clear_entry(context);
2397 	__iommu_flush_cache(iommu, context, sizeof(*context));
2398 	spin_unlock_irqrestore(&iommu->lock, flags);
2399 	iommu->flush.flush_context(iommu,
2400 				   did_old,
2401 				   (((u16)bus) << 8) | devfn,
2402 				   DMA_CCMD_MASK_NOBIT,
2403 				   DMA_CCMD_DEVICE_INVL);
2404 	iommu->flush.flush_iotlb(iommu,
2405 				 did_old,
2406 				 0,
2407 				 0,
2408 				 DMA_TLB_DSI_FLUSH);
2409 }
2410 
2411 static inline void unlink_domain_info(struct device_domain_info *info)
2412 {
2413 	assert_spin_locked(&device_domain_lock);
2414 	list_del(&info->link);
2415 	list_del(&info->global);
2416 	if (info->dev)
2417 		info->dev->archdata.iommu = NULL;
2418 }
2419 
2420 static void domain_remove_dev_info(struct dmar_domain *domain)
2421 {
2422 	struct device_domain_info *info, *tmp;
2423 	unsigned long flags;
2424 
2425 	spin_lock_irqsave(&device_domain_lock, flags);
2426 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2427 		__dmar_remove_one_dev_info(info);
2428 	spin_unlock_irqrestore(&device_domain_lock, flags);
2429 }
2430 
2431 struct dmar_domain *find_domain(struct device *dev)
2432 {
2433 	struct device_domain_info *info;
2434 
2435 	if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2436 		return NULL;
2437 
2438 	/* No lock here, assumes no domain exit in normal case */
2439 	info = get_domain_info(dev);
2440 	if (likely(info))
2441 		return info->domain;
2442 
2443 	return NULL;
2444 }
2445 
2446 static void do_deferred_attach(struct device *dev)
2447 {
2448 	struct iommu_domain *domain;
2449 
2450 	dev->archdata.iommu = NULL;
2451 	domain = iommu_get_domain_for_dev(dev);
2452 	if (domain)
2453 		intel_iommu_attach_device(domain, dev);
2454 }
2455 
2456 static inline struct device_domain_info *
2457 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2458 {
2459 	struct device_domain_info *info;
2460 
2461 	list_for_each_entry(info, &device_domain_list, global)
2462 		if (info->segment == segment && info->bus == bus &&
2463 		    info->devfn == devfn)
2464 			return info;
2465 
2466 	return NULL;
2467 }
2468 
2469 static int domain_setup_first_level(struct intel_iommu *iommu,
2470 				    struct dmar_domain *domain,
2471 				    struct device *dev,
2472 				    int pasid)
2473 {
2474 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2475 	struct dma_pte *pgd = domain->pgd;
2476 	int agaw, level;
2477 
2478 	/*
2479 	 * Skip top levels of page tables for iommu which has
2480 	 * less agaw than default. Unnecessary for PT mode.
2481 	 */
2482 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2483 		pgd = phys_to_virt(dma_pte_addr(pgd));
2484 		if (!dma_pte_present(pgd))
2485 			return -ENOMEM;
2486 	}
2487 
2488 	level = agaw_to_level(agaw);
2489 	if (level != 4 && level != 5)
2490 		return -EINVAL;
2491 
2492 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2493 
2494 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2495 					     domain->iommu_did[iommu->seq_id],
2496 					     flags);
2497 }
2498 
2499 static bool dev_is_real_dma_subdevice(struct device *dev)
2500 {
2501 	return dev && dev_is_pci(dev) &&
2502 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2503 }
2504 
/*
 * Create the device_domain_info that binds @dev (identified by @bus/@devfn
 * behind @iommu) to @domain, link it into the domain's device list and the
 * global device list, and set up the PASID and context entries.
 *
 * @dev may be NULL; the device-specific steps are then skipped.
 *
 * Return: @domain on success. If the device, or another entry with the
 * same segment/bus/devfn, already belongs to a domain, that existing domain
 * is returned instead and the caller must free the one it passed in. NULL
 * on allocation, IOMMU attach, PASID or context-mapping failure.
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	/*
	 * A real-DMA sub-device is recorded under its own PCI
	 * segment/bus/devfn rather than the @bus/@devfn passed in.
	 */
	if (!dev_is_real_dma_subdevice(dev)) {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	} else {
		struct pci_dev *pdev = to_pci_dev(dev);

		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	}

	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	info->pasid_table = NULL;
	info->auxd_enabled = 0;
	INIT_LIST_HEAD(&info->auxiliary_domains);

	/* Record which of ATS, PASID and PRI the device/IOMMU pair supports. */
	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);
				/* Bit 0 marks "supported"; the rest are the reported features. */
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	/* Fall back to a lookup by segment/bus/devfn in the global list. */
	if (!found) {
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
						       info->devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	/* iommu->lock nests inside device_domain_lock here. */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/*
	 * From here on @info is linked, so failures are undone through
	 * dmar_remove_one_dev_info() rather than by freeing @info directly.
	 */

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}

		/* Setup the PASID entry for requests without PASID: */
		spin_lock(&iommu->lock);
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, PASID_RID2PASID);
		else if (domain_use_first_level(domain))
			ret = domain_setup_first_level(iommu, domain, dev,
					PASID_RID2PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, PASID_RID2PASID);
		spin_unlock(&iommu->lock);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}
	}

	if (dev && domain_context_mapping(domain, dev)) {
		dev_err(dev, "Domain context map failed\n");
		dmar_remove_one_dev_info(dev);
		return NULL;
	}

	return domain;
}
2635 
2636 static int iommu_domain_identity_map(struct dmar_domain *domain,
2637 				     unsigned long first_vpfn,
2638 				     unsigned long last_vpfn)
2639 {
2640 	/*
2641 	 * RMRR range might have overlap with physical memory range,
2642 	 * clear it first
2643 	 */
2644 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2645 
2646 	return __domain_mapping(domain, first_vpfn, NULL,
2647 				first_vpfn, last_vpfn - first_vpfn + 1,
2648 				DMA_PTE_READ|DMA_PTE_WRITE);
2649 }
2650 
2651 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2652 
2653 static int __init si_domain_init(int hw)
2654 {
2655 	struct dmar_rmrr_unit *rmrr;
2656 	struct device *dev;
2657 	int i, nid, ret;
2658 
2659 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2660 	if (!si_domain)
2661 		return -EFAULT;
2662 
2663 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2664 		domain_exit(si_domain);
2665 		return -EFAULT;
2666 	}
2667 
2668 	if (hw)
2669 		return 0;
2670 
2671 	for_each_online_node(nid) {
2672 		unsigned long start_pfn, end_pfn;
2673 		int i;
2674 
2675 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2676 			ret = iommu_domain_identity_map(si_domain,
2677 					mm_to_dma_pfn(start_pfn),
2678 					mm_to_dma_pfn(end_pfn));
2679 			if (ret)
2680 				return ret;
2681 		}
2682 	}
2683 
2684 	/*
2685 	 * Identity map the RMRRs so that devices with RMRRs could also use
2686 	 * the si_domain.
2687 	 */
2688 	for_each_rmrr_units(rmrr) {
2689 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2690 					  i, dev) {
2691 			unsigned long long start = rmrr->base_address;
2692 			unsigned long long end = rmrr->end_address;
2693 
2694 			if (WARN_ON(end < start ||
2695 				    end >> agaw_to_width(si_domain->agaw)))
2696 				continue;
2697 
2698 			ret = iommu_domain_identity_map(si_domain, start, end);
2699 			if (ret)
2700 				return ret;
2701 		}
2702 	}
2703 
2704 	return 0;
2705 }
2706 
2707 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2708 {
2709 	struct dmar_domain *ndomain;
2710 	struct intel_iommu *iommu;
2711 	u8 bus, devfn;
2712 
2713 	iommu = device_to_iommu(dev, &bus, &devfn);
2714 	if (!iommu)
2715 		return -ENODEV;
2716 
2717 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2718 	if (ndomain != domain)
2719 		return -EBUSY;
2720 
2721 	return 0;
2722 }
2723 
2724 static bool device_has_rmrr(struct device *dev)
2725 {
2726 	struct dmar_rmrr_unit *rmrr;
2727 	struct device *tmp;
2728 	int i;
2729 
2730 	rcu_read_lock();
2731 	for_each_rmrr_units(rmrr) {
2732 		/*
2733 		 * Return TRUE if this RMRR contains the device that
2734 		 * is passed in.
2735 		 */
2736 		for_each_active_dev_scope(rmrr->devices,
2737 					  rmrr->devices_cnt, i, tmp)
2738 			if (tmp == dev ||
2739 			    is_downstream_to_pci_bridge(dev, tmp)) {
2740 				rcu_read_unlock();
2741 				return true;
2742 			}
2743 	}
2744 	rcu_read_unlock();
2745 	return false;
2746 }
2747 
2748 /**
2749  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2750  * is relaxable (ie. is allowed to be not enforced under some conditions)
2751  * @dev: device handle
2752  *
2753  * We assume that PCI USB devices with RMRRs have them largely
2754  * for historical reasons and that the RMRR space is not actively used post
2755  * boot.  This exclusion may change if vendors begin to abuse it.
2756  *
2757  * The same exception is made for graphics devices, with the requirement that
2758  * any use of the RMRR regions will be torn down before assigning the device
2759  * to a guest.
2760  *
2761  * Return: true if the RMRR is relaxable, false otherwise
2762  */
2763 static bool device_rmrr_is_relaxable(struct device *dev)
2764 {
2765 	struct pci_dev *pdev;
2766 
2767 	if (!dev_is_pci(dev))
2768 		return false;
2769 
2770 	pdev = to_pci_dev(dev);
2771 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2772 		return true;
2773 	else
2774 		return false;
2775 }
2776 
2777 /*
2778  * There are a couple cases where we need to restrict the functionality of
2779  * devices associated with RMRRs.  The first is when evaluating a device for
2780  * identity mapping because problems exist when devices are moved in and out
2781  * of domains and their respective RMRR information is lost.  This means that
2782  * a device with associated RMRRs will never be in a "passthrough" domain.
2783  * The second is use of the device through the IOMMU API.  This interface
2784  * expects to have full control of the IOVA space for the device.  We cannot
2785  * satisfy both the requirement that RMRR access is maintained and have an
2786  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2787  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2788  * We therefore prevent devices associated with an RMRR from participating in
2789  * the IOMMU API, which eliminates them from device assignment.
2790  *
2791  * In both cases, devices which have relaxable RMRRs are not concerned by this
2792  * restriction. See device_rmrr_is_relaxable comment.
2793  */
2794 static bool device_is_rmrr_locked(struct device *dev)
2795 {
2796 	if (!device_has_rmrr(dev))
2797 		return false;
2798 
2799 	if (device_rmrr_is_relaxable(dev))
2800 		return false;
2801 
2802 	return true;
2803 }
2804 
2805 /*
2806  * Return the required default domain type for a specific device.
2807  *
2808  * @dev: the device in query
2809  * @startup: true if this is during early boot
2810  *
2811  * Returns:
2812  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2813  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2814  *  - 0: both identity and dynamic domains work for this device
2815  */
2816 static int device_def_domain_type(struct device *dev)
2817 {
2818 	if (dev_is_pci(dev)) {
2819 		struct pci_dev *pdev = to_pci_dev(dev);
2820 
2821 		/*
2822 		 * Prevent any device marked as untrusted from getting
2823 		 * placed into the statically identity mapping domain.
2824 		 */
2825 		if (pdev->untrusted)
2826 			return IOMMU_DOMAIN_DMA;
2827 
2828 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2829 			return IOMMU_DOMAIN_IDENTITY;
2830 
2831 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2832 			return IOMMU_DOMAIN_IDENTITY;
2833 	}
2834 
2835 	return 0;
2836 }
2837 
2838 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2839 {
2840 	/*
2841 	 * Start from the sane iommu hardware state.
2842 	 * If the queued invalidation is already initialized by us
2843 	 * (for example, while enabling interrupt-remapping) then
2844 	 * we got the things already rolling from a sane state.
2845 	 */
2846 	if (!iommu->qi) {
2847 		/*
2848 		 * Clear any previous faults.
2849 		 */
2850 		dmar_fault(-1, iommu);
2851 		/*
2852 		 * Disable queued invalidation if supported and already enabled
2853 		 * before OS handover.
2854 		 */
2855 		dmar_disable_qi(iommu);
2856 	}
2857 
2858 	if (dmar_enable_qi(iommu)) {
2859 		/*
2860 		 * Queued Invalidate not enabled, use Register Based Invalidate
2861 		 */
2862 		iommu->flush.flush_context = __iommu_flush_context;
2863 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2864 		pr_info("%s: Using Register based invalidation\n",
2865 			iommu->name);
2866 	} else {
2867 		iommu->flush.flush_context = qi_flush_context;
2868 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2869 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2870 	}
2871 }
2872 
/*
 * Copy the context entries of one bus out of the root entry @old_re into
 * newly allocated page(s) and record those pages in @tbl.
 *
 * @iommu: IOMMU whose domain-id bitmap is updated and whose cache is flushed
 * @old_re: root entry to copy from
 * @tbl: array receiving the new context table pointers
 * @bus: bus number, used to compute the index into @tbl
 * @ext: true for the extended format, where a bus has a lower table
 *       (devfn < 0x80) and an upper table, and each entry takes two slots
 *
 * Present entries get PASID disabled and are marked as copied; their
 * domain ids are reserved in iommu->domain_ids.
 *
 * Return: 0 on success (including when the old table pointer is zero),
 * -ENOMEM if memremap() or the page allocation fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	/* Extended format: two @tbl slots per bus. */
	tbl_idx = ext ? bus * 2 : bus;
	/* Work on a private copy of the root entry. */
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/* idx wraps to 0 at the start of each old context table. */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				/* The next table goes into the second slot. */
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/*
					 * NOTE(review): pos stays 0 on this
					 * path, so the upper table is later
					 * stored in tbl[tbl_idx] - confirm
					 * this is intended.
					 */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		/* Reserve the domain id that the copied entry refers to. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	/* Store the last table that was filled in. */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2974 
2975 static int copy_translation_tables(struct intel_iommu *iommu)
2976 {
2977 	struct context_entry **ctxt_tbls;
2978 	struct root_entry *old_rt;
2979 	phys_addr_t old_rt_phys;
2980 	int ctxt_table_entries;
2981 	unsigned long flags;
2982 	u64 rtaddr_reg;
2983 	int bus, ret;
2984 	bool new_ext, ext;
2985 
2986 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2987 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2988 	new_ext    = !!ecap_ecs(iommu->ecap);
2989 
2990 	/*
2991 	 * The RTT bit can only be changed when translation is disabled,
2992 	 * but disabling translation means to open a window for data
2993 	 * corruption. So bail out and don't copy anything if we would
2994 	 * have to change the bit.
2995 	 */
2996 	if (new_ext != ext)
2997 		return -EINVAL;
2998 
2999 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3000 	if (!old_rt_phys)
3001 		return -EINVAL;
3002 
3003 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3004 	if (!old_rt)
3005 		return -ENOMEM;
3006 
3007 	/* This is too big for the stack - allocate it from slab */
3008 	ctxt_table_entries = ext ? 512 : 256;
3009 	ret = -ENOMEM;
3010 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3011 	if (!ctxt_tbls)
3012 		goto out_unmap;
3013 
3014 	for (bus = 0; bus < 256; bus++) {
3015 		ret = copy_context_table(iommu, &old_rt[bus],
3016 					 ctxt_tbls, bus, ext);
3017 		if (ret) {
3018 			pr_err("%s: Failed to copy context table for bus %d\n",
3019 				iommu->name, bus);
3020 			continue;
3021 		}
3022 	}
3023 
3024 	spin_lock_irqsave(&iommu->lock, flags);
3025 
3026 	/* Context tables are copied, now write them to the root_entry table */
3027 	for (bus = 0; bus < 256; bus++) {
3028 		int idx = ext ? bus * 2 : bus;
3029 		u64 val;
3030 
3031 		if (ctxt_tbls[idx]) {
3032 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3033 			iommu->root_entry[bus].lo = val;
3034 		}
3035 
3036 		if (!ext || !ctxt_tbls[idx + 1])
3037 			continue;
3038 
3039 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3040 		iommu->root_entry[bus].hi = val;
3041 	}
3042 
3043 	spin_unlock_irqrestore(&iommu->lock, flags);
3044 
3045 	kfree(ctxt_tbls);
3046 
3047 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3048 
3049 	ret = 0;
3050 
3051 out_unmap:
3052 	memunmap(old_rt);
3053 
3054 	return ret;
3055 }
3056 
3057 #ifdef CONFIG_INTEL_IOMMU_SVM
3058 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3059 {
3060 	struct intel_iommu *iommu = data;
3061 	ioasid_t ioasid;
3062 
3063 	if (!iommu)
3064 		return INVALID_IOASID;
3065 	/*
3066 	 * VT-d virtual command interface always uses the full 20 bit
3067 	 * PASID range. Host can partition guest PASID range based on
3068 	 * policies but it is out of guest's control.
3069 	 */
3070 	if (min < PASID_MIN || max > intel_pasid_max_id)
3071 		return INVALID_IOASID;
3072 
3073 	if (vcmd_alloc_pasid(iommu, &ioasid))
3074 		return INVALID_IOASID;
3075 
3076 	return ioasid;
3077 }
3078 
3079 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3080 {
3081 	struct intel_iommu *iommu = data;
3082 
3083 	if (!iommu)
3084 		return;
3085 	/*
3086 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3087 	 * We can only free the PASID when all the devices are unbound.
3088 	 */
3089 	if (ioasid_find(NULL, ioasid, NULL)) {
3090 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3091 		return;
3092 	}
3093 	vcmd_free_pasid(iommu, ioasid);
3094 }
3095 
3096 static void register_pasid_allocator(struct intel_iommu *iommu)
3097 {
3098 	/*
3099 	 * If we are running in the host, no need for custom allocator
3100 	 * in that PASIDs are allocated from the host system-wide.
3101 	 */
3102 	if (!cap_caching_mode(iommu->cap))
3103 		return;
3104 
3105 	if (!sm_supported(iommu)) {
3106 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3107 		return;
3108 	}
3109 
3110 	/*
3111 	 * Register a custom PASID allocator if we are running in a guest,
3112 	 * guest PASID must be obtained via virtual command interface.
3113 	 * There can be multiple vIOMMUs in each guest but only one allocator
3114 	 * is active. All vIOMMU allocators will eventually be calling the same
3115 	 * host allocator.
3116 	 */
3117 	if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3118 		return;
3119 
3120 	pr_info("Register custom PASID allocator\n");
3121 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3122 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3123 	iommu->pasid_allocator.pdata = (void *)iommu;
3124 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3125 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3126 		/*
3127 		 * Disable scalable mode on this IOMMU if there
3128 		 * is no custom allocator. Mixing SM capable vIOMMU
3129 		 * and non-SM vIOMMU are not supported.
3130 		 */
3131 		intel_iommu_sm = 0;
3132 	}
3133 }
3134 #endif
3135 
3136 static int __init init_dmars(void)
3137 {
3138 	struct dmar_drhd_unit *drhd;
3139 	struct intel_iommu *iommu;
3140 	int ret;
3141 
3142 	/*
3143 	 * for each drhd
3144 	 *    allocate root
3145 	 *    initialize and program root entry to not present
3146 	 * endfor
3147 	 */
3148 	for_each_drhd_unit(drhd) {
3149 		/*
3150 		 * lock not needed as this is only incremented in the single
3151 		 * threaded kernel __init code path all other access are read
3152 		 * only
3153 		 */
3154 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3155 			g_num_of_iommus++;
3156 			continue;
3157 		}
3158 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3159 	}
3160 
3161 	/* Preallocate enough resources for IOMMU hot-addition */
3162 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3163 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3164 
3165 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3166 			GFP_KERNEL);
3167 	if (!g_iommus) {
3168 		pr_err("Allocating global iommu array failed\n");
3169 		ret = -ENOMEM;
3170 		goto error;
3171 	}
3172 
3173 	for_each_iommu(iommu, drhd) {
3174 		if (drhd->ignored) {
3175 			iommu_disable_translation(iommu);
3176 			continue;
3177 		}
3178 
3179 		/*
3180 		 * Find the max pasid size of all IOMMU's in the system.
3181 		 * We need to ensure the system pasid table is no bigger
3182 		 * than the smallest supported.
3183 		 */
3184 		if (pasid_supported(iommu)) {
3185 			u32 temp = 2 << ecap_pss(iommu->ecap);
3186 
3187 			intel_pasid_max_id = min_t(u32, temp,
3188 						   intel_pasid_max_id);
3189 		}
3190 
3191 		g_iommus[iommu->seq_id] = iommu;
3192 
3193 		intel_iommu_init_qi(iommu);
3194 
3195 		ret = iommu_init_domains(iommu);
3196 		if (ret)
3197 			goto free_iommu;
3198 
3199 		init_translation_status(iommu);
3200 
3201 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3202 			iommu_disable_translation(iommu);
3203 			clear_translation_pre_enabled(iommu);
3204 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3205 				iommu->name);
3206 		}
3207 
3208 		/*
3209 		 * TBD:
3210 		 * we could share the same root & context tables
3211 		 * among all IOMMU's. Need to Split it later.
3212 		 */
3213 		ret = iommu_alloc_root_entry(iommu);
3214 		if (ret)
3215 			goto free_iommu;
3216 
3217 		if (translation_pre_enabled(iommu)) {
3218 			pr_info("Translation already enabled - trying to copy translation structures\n");
3219 
3220 			ret = copy_translation_tables(iommu);
3221 			if (ret) {
3222 				/*
3223 				 * We found the IOMMU with translation
3224 				 * enabled - but failed to copy over the
3225 				 * old root-entry table. Try to proceed
3226 				 * by disabling translation now and
3227 				 * allocating a clean root-entry table.
3228 				 * This might cause DMAR faults, but
3229 				 * probably the dump will still succeed.
3230 				 */
3231 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3232 				       iommu->name);
3233 				iommu_disable_translation(iommu);
3234 				clear_translation_pre_enabled(iommu);
3235 			} else {
3236 				pr_info("Copied translation tables from previous kernel for %s\n",
3237 					iommu->name);
3238 			}
3239 		}
3240 
3241 		if (!ecap_pass_through(iommu->ecap))
3242 			hw_pass_through = 0;
3243 		intel_svm_check(iommu);
3244 	}
3245 
3246 	/*
3247 	 * Now that qi is enabled on all iommus, set the root entry and flush
3248 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3249 	 * flush_context function will loop forever and the boot hangs.
3250 	 */
3251 	for_each_active_iommu(iommu, drhd) {
3252 		iommu_flush_write_buffer(iommu);
3253 #ifdef CONFIG_INTEL_IOMMU_SVM
3254 		register_pasid_allocator(iommu);
3255 #endif
3256 		iommu_set_root_entry(iommu);
3257 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3258 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3259 	}
3260 
3261 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3262 	dmar_map_gfx = 0;
3263 #endif
3264 
3265 	if (!dmar_map_gfx)
3266 		iommu_identity_mapping |= IDENTMAP_GFX;
3267 
3268 	check_tylersburg_isoch();
3269 
3270 	ret = si_domain_init(hw_pass_through);
3271 	if (ret)
3272 		goto free_iommu;
3273 
3274 	/*
3275 	 * for each drhd
3276 	 *   enable fault log
3277 	 *   global invalidate context cache
3278 	 *   global invalidate iotlb
3279 	 *   enable translation
3280 	 */
3281 	for_each_iommu(iommu, drhd) {
3282 		if (drhd->ignored) {
3283 			/*
3284 			 * we always have to disable PMRs or DMA may fail on
3285 			 * this device
3286 			 */
3287 			if (force_on)
3288 				iommu_disable_protect_mem_regions(iommu);
3289 			continue;
3290 		}
3291 
3292 		iommu_flush_write_buffer(iommu);
3293 
3294 #ifdef CONFIG_INTEL_IOMMU_SVM
3295 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3296 			/*
3297 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3298 			 * could cause possible lock race condition.
3299 			 */
3300 			up_write(&dmar_global_lock);
3301 			ret = intel_svm_enable_prq(iommu);
3302 			down_write(&dmar_global_lock);
3303 			if (ret)
3304 				goto free_iommu;
3305 		}
3306 #endif
3307 		ret = dmar_set_interrupt(iommu);
3308 		if (ret)
3309 			goto free_iommu;
3310 	}
3311 
3312 	return 0;
3313 
3314 free_iommu:
3315 	for_each_active_iommu(iommu, drhd) {
3316 		disable_dmar_iommu(iommu);
3317 		free_dmar_iommu(iommu);
3318 	}
3319 
3320 	kfree(g_iommus);
3321 
3322 error:
3323 	return ret;
3324 }
3325 
3326 /* This takes a number of _MM_ pages, not VTD pages */
3327 static unsigned long intel_alloc_iova(struct device *dev,
3328 				     struct dmar_domain *domain,
3329 				     unsigned long nrpages, uint64_t dma_mask)
3330 {
3331 	unsigned long iova_pfn;
3332 
3333 	/*
3334 	 * Restrict dma_mask to the width that the iommu can handle.
3335 	 * First-level translation restricts the input-address to a
3336 	 * canonical address (i.e., address bits 63:N have the same
3337 	 * value as address bit [N-1], where N is 48-bits with 4-level
3338 	 * paging and 57-bits with 5-level paging). Hence, skip bit
3339 	 * [N-1].
3340 	 */
3341 	if (domain_use_first_level(domain))
3342 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3343 				 dma_mask);
3344 	else
3345 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3346 				 dma_mask);
3347 
3348 	/* Ensure we reserve the whole size-aligned region */
3349 	nrpages = __roundup_pow_of_two(nrpages);
3350 
3351 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3352 		/*
3353 		 * First try to allocate an io virtual address in
3354 		 * DMA_BIT_MASK(32) and if that fails then try allocating
3355 		 * from higher range
3356 		 */
3357 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3358 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3359 		if (iova_pfn)
3360 			return iova_pfn;
3361 	}
3362 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3363 				   IOVA_PFN(dma_mask), true);
3364 	if (unlikely(!iova_pfn)) {
3365 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3366 			     nrpages);
3367 		return 0;
3368 	}
3369 
3370 	return iova_pfn;
3371 }
3372 
3373 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3374 				     size_t size, int dir, u64 dma_mask)
3375 {
3376 	struct dmar_domain *domain;
3377 	phys_addr_t start_paddr;
3378 	unsigned long iova_pfn;
3379 	int prot = 0;
3380 	int ret;
3381 	struct intel_iommu *iommu;
3382 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3383 
3384 	BUG_ON(dir == DMA_NONE);
3385 
3386 	if (unlikely(attach_deferred(dev)))
3387 		do_deferred_attach(dev);
3388 
3389 	domain = find_domain(dev);
3390 	if (!domain)
3391 		return DMA_MAPPING_ERROR;
3392 
3393 	iommu = domain_get_iommu(domain);
3394 	size = aligned_nrpages(paddr, size);
3395 
3396 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3397 	if (!iova_pfn)
3398 		goto error;
3399 
3400 	/*
3401 	 * Check if DMAR supports zero-length reads on write only
3402 	 * mappings..
3403 	 */
3404 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3405 			!cap_zlr(iommu->cap))
3406 		prot |= DMA_PTE_READ;
3407 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3408 		prot |= DMA_PTE_WRITE;
3409 	/*
3410 	 * paddr - (paddr + size) might be partial page, we should map the whole
3411 	 * page.  Note: if two part of one page are separately mapped, we
3412 	 * might have two guest_addr mapping to the same host paddr, but this
3413 	 * is not a big problem
3414 	 */
3415 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3416 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3417 	if (ret)
3418 		goto error;
3419 
3420 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3421 	start_paddr += paddr & ~PAGE_MASK;
3422 
3423 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3424 
3425 	return start_paddr;
3426 
3427 error:
3428 	if (iova_pfn)
3429 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3430 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3431 		size, (unsigned long long)paddr, dir);
3432 	return DMA_MAPPING_ERROR;
3433 }
3434 
3435 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3436 				 unsigned long offset, size_t size,
3437 				 enum dma_data_direction dir,
3438 				 unsigned long attrs)
3439 {
3440 	return __intel_map_single(dev, page_to_phys(page) + offset,
3441 				  size, dir, *dev->dma_mask);
3442 }
3443 
3444 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3445 				     size_t size, enum dma_data_direction dir,
3446 				     unsigned long attrs)
3447 {
3448 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3449 }
3450 
3451 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3452 {
3453 	struct dmar_domain *domain;
3454 	unsigned long start_pfn, last_pfn;
3455 	unsigned long nrpages;
3456 	unsigned long iova_pfn;
3457 	struct intel_iommu *iommu;
3458 	struct page *freelist;
3459 	struct pci_dev *pdev = NULL;
3460 
3461 	domain = find_domain(dev);
3462 	BUG_ON(!domain);
3463 
3464 	iommu = domain_get_iommu(domain);
3465 
3466 	iova_pfn = IOVA_PFN(dev_addr);
3467 
3468 	nrpages = aligned_nrpages(dev_addr, size);
3469 	start_pfn = mm_to_dma_pfn(iova_pfn);
3470 	last_pfn = start_pfn + nrpages - 1;
3471 
3472 	if (dev_is_pci(dev))
3473 		pdev = to_pci_dev(dev);
3474 
3475 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3476 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3477 			!has_iova_flush_queue(&domain->iovad)) {
3478 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3479 				      nrpages, !freelist, 0);
3480 		/* free iova */
3481 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3482 		dma_free_pagelist(freelist);
3483 	} else {
3484 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3485 			   (unsigned long)freelist);
3486 		/*
3487 		 * queue up the release of the unmap to save the 1/6th of the
3488 		 * cpu used up by the iotlb flush operation...
3489 		 */
3490 	}
3491 
3492 	trace_unmap_single(dev, dev_addr, size);
3493 }
3494 
3495 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3496 			     size_t size, enum dma_data_direction dir,
3497 			     unsigned long attrs)
3498 {
3499 	intel_unmap(dev, dev_addr, size);
3500 }
3501 
3502 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3503 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3504 {
3505 	intel_unmap(dev, dev_addr, size);
3506 }
3507 
3508 static void *intel_alloc_coherent(struct device *dev, size_t size,
3509 				  dma_addr_t *dma_handle, gfp_t flags,
3510 				  unsigned long attrs)
3511 {
3512 	struct page *page = NULL;
3513 	int order;
3514 
3515 	if (unlikely(attach_deferred(dev)))
3516 		do_deferred_attach(dev);
3517 
3518 	size = PAGE_ALIGN(size);
3519 	order = get_order(size);
3520 
3521 	if (gfpflags_allow_blocking(flags)) {
3522 		unsigned int count = size >> PAGE_SHIFT;
3523 
3524 		page = dma_alloc_from_contiguous(dev, count, order,
3525 						 flags & __GFP_NOWARN);
3526 	}
3527 
3528 	if (!page)
3529 		page = alloc_pages(flags, order);
3530 	if (!page)
3531 		return NULL;
3532 	memset(page_address(page), 0, size);
3533 
3534 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3535 					 DMA_BIDIRECTIONAL,
3536 					 dev->coherent_dma_mask);
3537 	if (*dma_handle != DMA_MAPPING_ERROR)
3538 		return page_address(page);
3539 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3540 		__free_pages(page, order);
3541 
3542 	return NULL;
3543 }
3544 
3545 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3546 				dma_addr_t dma_handle, unsigned long attrs)
3547 {
3548 	int order;
3549 	struct page *page = virt_to_page(vaddr);
3550 
3551 	size = PAGE_ALIGN(size);
3552 	order = get_order(size);
3553 
3554 	intel_unmap(dev, dma_handle, size);
3555 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3556 		__free_pages(page, order);
3557 }
3558 
3559 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3560 			   int nelems, enum dma_data_direction dir,
3561 			   unsigned long attrs)
3562 {
3563 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3564 	unsigned long nrpages = 0;
3565 	struct scatterlist *sg;
3566 	int i;
3567 
3568 	for_each_sg(sglist, sg, nelems, i) {
3569 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3570 	}
3571 
3572 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3573 
3574 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3575 }
3576 
3577 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3578 			enum dma_data_direction dir, unsigned long attrs)
3579 {
3580 	int i;
3581 	struct dmar_domain *domain;
3582 	size_t size = 0;
3583 	int prot = 0;
3584 	unsigned long iova_pfn;
3585 	int ret;
3586 	struct scatterlist *sg;
3587 	unsigned long start_vpfn;
3588 	struct intel_iommu *iommu;
3589 
3590 	BUG_ON(dir == DMA_NONE);
3591 
3592 	if (unlikely(attach_deferred(dev)))
3593 		do_deferred_attach(dev);
3594 
3595 	domain = find_domain(dev);
3596 	if (!domain)
3597 		return 0;
3598 
3599 	iommu = domain_get_iommu(domain);
3600 
3601 	for_each_sg(sglist, sg, nelems, i)
3602 		size += aligned_nrpages(sg->offset, sg->length);
3603 
3604 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3605 				*dev->dma_mask);
3606 	if (!iova_pfn) {
3607 		sglist->dma_length = 0;
3608 		return 0;
3609 	}
3610 
3611 	/*
3612 	 * Check if DMAR supports zero-length reads on write only
3613 	 * mappings..
3614 	 */
3615 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3616 			!cap_zlr(iommu->cap))
3617 		prot |= DMA_PTE_READ;
3618 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3619 		prot |= DMA_PTE_WRITE;
3620 
3621 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3622 
3623 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3624 	if (unlikely(ret)) {
3625 		dma_pte_free_pagetable(domain, start_vpfn,
3626 				       start_vpfn + size - 1,
3627 				       agaw_to_level(domain->agaw) + 1);
3628 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3629 		return 0;
3630 	}
3631 
3632 	for_each_sg(sglist, sg, nelems, i)
3633 		trace_map_sg(dev, i + 1, nelems, sg);
3634 
3635 	return nelems;
3636 }
3637 
3638 static u64 intel_get_required_mask(struct device *dev)
3639 {
3640 	return DMA_BIT_MASK(32);
3641 }
3642 
3643 static const struct dma_map_ops intel_dma_ops = {
3644 	.alloc = intel_alloc_coherent,
3645 	.free = intel_free_coherent,
3646 	.map_sg = intel_map_sg,
3647 	.unmap_sg = intel_unmap_sg,
3648 	.map_page = intel_map_page,
3649 	.unmap_page = intel_unmap_page,
3650 	.map_resource = intel_map_resource,
3651 	.unmap_resource = intel_unmap_resource,
3652 	.dma_supported = dma_direct_supported,
3653 	.mmap = dma_common_mmap,
3654 	.get_sgtable = dma_common_get_sgtable,
3655 	.get_required_mask = intel_get_required_mask,
3656 };
3657 
3658 static void
3659 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3660 		   enum dma_data_direction dir, enum dma_sync_target target)
3661 {
3662 	struct dmar_domain *domain;
3663 	phys_addr_t tlb_addr;
3664 
3665 	domain = find_domain(dev);
3666 	if (WARN_ON(!domain))
3667 		return;
3668 
3669 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3670 	if (is_swiotlb_buffer(tlb_addr))
3671 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3672 }
3673 
/*
 * Map @size bytes at @paddr for DMA, bouncing through swiotlb when needed.
 *
 * An IOVA range covering aligned_nrpages(0, size) VT-d pages is allocated.
 * If either @paddr or @size is not VTD_PAGE_SIZE aligned, a swiotlb slot of
 * ALIGN(size, VTD_PAGE_SIZE) bytes is obtained and that slot is what gets
 * mapped; otherwise @paddr itself is mapped.
 *
 * Returns the IOVA of the mapping (iova_pfn << PAGE_SHIFT), or
 * DMA_MAPPING_ERROR if the device has no domain/IOMMU, @dir is DMA_NONE,
 * IOVA allocation fails, the swiotlb slot cannot be obtained, or the
 * page-table update fails.
 */
static dma_addr_t
bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
		  enum dma_data_direction dir, unsigned long attrs,
		  u64 dma_mask)
{
	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long iova_pfn;
	unsigned long nrpages;
	phys_addr_t tlb_addr;
	int prot = 0;
	int ret;

	if (unlikely(attach_deferred(dev)))
		do_deferred_attach(dev);

	domain = find_domain(dev);

	if (WARN_ON(dir == DMA_NONE || !domain))
		return DMA_MAPPING_ERROR;

	iommu = domain_get_iommu(domain);
	if (WARN_ON(!iommu))
		return DMA_MAPPING_ERROR;

	nrpages = aligned_nrpages(0, size);
	iova_pfn = intel_alloc_iova(dev, domain,
				    dma_to_mm_pfn(nrpages), dma_mask);
	if (!iova_pfn)
		return DMA_MAPPING_ERROR;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	/*
	 * If both the physical buffer start address and size are
	 * page aligned, we don't need to use a bounce page.
	 */
	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
		tlb_addr = swiotlb_tbl_map_single(dev,
				__phys_to_dma(dev, io_tlb_start),
				paddr, size, aligned_size, dir, attrs);
		if (tlb_addr == DMA_MAPPING_ERROR) {
			goto swiotlb_error;
		} else {
			/* Cleanup the padding area. */
			void *padding_start = phys_to_virt(tlb_addr);
			size_t padding_size = aligned_size;

			/*
			 * By default the whole slot is zeroed. When CPU sync
			 * is not skipped and the device will read the buffer,
			 * the first @size bytes are left alone and only the
			 * tail is zeroed (presumably because swiotlb has
			 * already copied the caller's data there - confirm
			 * against swiotlb_tbl_map_single()).
			 */
			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
			    (dir == DMA_TO_DEVICE ||
			     dir == DMA_BIDIRECTIONAL)) {
				padding_start += size;
				padding_size -= size;
			}

			memset(padding_start, 0, padding_size);
		}
	} else {
		tlb_addr = paddr;
	}

	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
	if (ret)
		goto mapping_error;

	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);

	return (phys_addr_t)iova_pfn << PAGE_SHIFT;

mapping_error:
	/* Only release a swiotlb slot if one was actually used. */
	if (is_swiotlb_buffer(tlb_addr))
		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
					 aligned_size, dir, attrs);
swiotlb_error:
	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
		size, (unsigned long long)paddr, dir);

	return DMA_MAPPING_ERROR;
}
3764 
3765 static void
3766 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3767 		    enum dma_data_direction dir, unsigned long attrs)
3768 {
3769 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3770 	struct dmar_domain *domain;
3771 	phys_addr_t tlb_addr;
3772 
3773 	domain = find_domain(dev);
3774 	if (WARN_ON(!domain))
3775 		return;
3776 
3777 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3778 	if (WARN_ON(!tlb_addr))
3779 		return;
3780 
3781 	intel_unmap(dev, dev_addr, size);
3782 	if (is_swiotlb_buffer(tlb_addr))
3783 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3784 					 aligned_size, dir, attrs);
3785 
3786 	trace_bounce_unmap_single(dev, dev_addr, size);
3787 }
3788 
3789 static dma_addr_t
3790 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3791 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3792 {
3793 	return bounce_map_single(dev, page_to_phys(page) + offset,
3794 				 size, dir, attrs, *dev->dma_mask);
3795 }
3796 
3797 static dma_addr_t
3798 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3799 		    enum dma_data_direction dir, unsigned long attrs)
3800 {
3801 	return bounce_map_single(dev, phys_addr, size,
3802 				 dir, attrs, *dev->dma_mask);
3803 }
3804 
3805 static void
3806 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3807 		  enum dma_data_direction dir, unsigned long attrs)
3808 {
3809 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3810 }
3811 
3812 static void
3813 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3814 		      enum dma_data_direction dir, unsigned long attrs)
3815 {
3816 	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3817 }
3818 
3819 static void
3820 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3821 		enum dma_data_direction dir, unsigned long attrs)
3822 {
3823 	struct scatterlist *sg;
3824 	int i;
3825 
3826 	for_each_sg(sglist, sg, nelems, i)
3827 		bounce_unmap_page(dev, sg->dma_address,
3828 				  sg_dma_len(sg), dir, attrs);
3829 }
3830 
3831 static int
3832 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3833 	      enum dma_data_direction dir, unsigned long attrs)
3834 {
3835 	int i;
3836 	struct scatterlist *sg;
3837 
3838 	for_each_sg(sglist, sg, nelems, i) {
3839 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3840 						  sg->offset, sg->length,
3841 						  dir, attrs);
3842 		if (sg->dma_address == DMA_MAPPING_ERROR)
3843 			goto out_unmap;
3844 		sg_dma_len(sg) = sg->length;
3845 	}
3846 
3847 	for_each_sg(sglist, sg, nelems, i)
3848 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3849 
3850 	return nelems;
3851 
3852 out_unmap:
3853 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3854 	return 0;
3855 }
3856 
3857 static void
3858 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3859 			   size_t size, enum dma_data_direction dir)
3860 {
3861 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3862 }
3863 
3864 static void
3865 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3866 			      size_t size, enum dma_data_direction dir)
3867 {
3868 	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3869 }
3870 
3871 static void
3872 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3873 		       int nelems, enum dma_data_direction dir)
3874 {
3875 	struct scatterlist *sg;
3876 	int i;
3877 
3878 	for_each_sg(sglist, sg, nelems, i)
3879 		bounce_sync_single(dev, sg_dma_address(sg),
3880 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3881 }
3882 
3883 static void
3884 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3885 			  int nelems, enum dma_data_direction dir)
3886 {
3887 	struct scatterlist *sg;
3888 	int i;
3889 
3890 	for_each_sg(sglist, sg, nelems, i)
3891 		bounce_sync_single(dev, sg_dma_address(sg),
3892 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3893 }
3894 
3895 static const struct dma_map_ops bounce_dma_ops = {
3896 	.alloc			= intel_alloc_coherent,
3897 	.free			= intel_free_coherent,
3898 	.map_sg			= bounce_map_sg,
3899 	.unmap_sg		= bounce_unmap_sg,
3900 	.map_page		= bounce_map_page,
3901 	.unmap_page		= bounce_unmap_page,
3902 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
3903 	.sync_single_for_device	= bounce_sync_single_for_device,
3904 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
3905 	.sync_sg_for_device	= bounce_sync_sg_for_device,
3906 	.map_resource		= bounce_map_resource,
3907 	.unmap_resource		= bounce_unmap_resource,
3908 	.dma_supported		= dma_direct_supported,
3909 };
3910 
3911 static inline int iommu_domain_cache_init(void)
3912 {
3913 	int ret = 0;
3914 
3915 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3916 					 sizeof(struct dmar_domain),
3917 					 0,
3918 					 SLAB_HWCACHE_ALIGN,
3919 
3920 					 NULL);
3921 	if (!iommu_domain_cache) {
3922 		pr_err("Couldn't create iommu_domain cache\n");
3923 		ret = -ENOMEM;
3924 	}
3925 
3926 	return ret;
3927 }
3928 
3929 static inline int iommu_devinfo_cache_init(void)
3930 {
3931 	int ret = 0;
3932 
3933 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3934 					 sizeof(struct device_domain_info),
3935 					 0,
3936 					 SLAB_HWCACHE_ALIGN,
3937 					 NULL);
3938 	if (!iommu_devinfo_cache) {
3939 		pr_err("Couldn't create devinfo cache\n");
3940 		ret = -ENOMEM;
3941 	}
3942 
3943 	return ret;
3944 }
3945 
3946 static int __init iommu_init_mempool(void)
3947 {
3948 	int ret;
3949 	ret = iova_cache_get();
3950 	if (ret)
3951 		return ret;
3952 
3953 	ret = iommu_domain_cache_init();
3954 	if (ret)
3955 		goto domain_error;
3956 
3957 	ret = iommu_devinfo_cache_init();
3958 	if (!ret)
3959 		return ret;
3960 
3961 	kmem_cache_destroy(iommu_domain_cache);
3962 domain_error:
3963 	iova_cache_put();
3964 
3965 	return -ENOMEM;
3966 }
3967 
3968 static void __init iommu_exit_mempool(void)
3969 {
3970 	kmem_cache_destroy(iommu_devinfo_cache);
3971 	kmem_cache_destroy(iommu_domain_cache);
3972 	iova_cache_put();
3973 }
3974 
3975 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3976 {
3977 	struct dmar_drhd_unit *drhd;
3978 	u32 vtbar;
3979 	int rc;
3980 
3981 	/* We know that this device on this chipset has its own IOMMU.
3982 	 * If we find it under a different IOMMU, then the BIOS is lying
3983 	 * to us. Hope that the IOMMU for this device is actually
3984 	 * disabled, and it needs no translation...
3985 	 */
3986 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3987 	if (rc) {
3988 		/* "can't" happen */
3989 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3990 		return;
3991 	}
3992 	vtbar &= 0xffff0000;
3993 
3994 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
3995 	drhd = dmar_find_matched_drhd_unit(pdev);
3996 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3997 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3998 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3999 		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4000 	}
4001 }
4002 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4003 
/*
 * Decide which DRHD units should be ignored.
 *
 * Pass 1: a unit that is not include_all and has no active device in its
 * scope is marked ignored.
 * Pass 2: a remaining, non-include_all unit whose active devices are all
 * PCI graphics devices is, when dmar_map_gfx is clear, marked ignored and
 * each of its devices is tagged with DUMMY_DEVICE_DOMAIN_INFO.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			/*
			 * The loop body is just "break": it stops at the first
			 * active device, so i only reaches devices_cnt when
			 * the scope contains no active device at all.
			 */
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Stop at the first device that is not a PCI gfx device. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		/* Early exit above means a non-gfx device is present. */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (!dmar_map_gfx) {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
4042 
4043 #ifdef CONFIG_SUSPEND
/*
 * Re-program the IOMMU hardware (called from iommu_resume()).
 *
 * Queued invalidation is re-enabled on every active unit that has a qi
 * structure. Then, for each non-ignored unit, the write buffer is flushed,
 * the root entry is set, the context cache and IOTLB are globally
 * invalidated, translation is enabled and the protected memory regions are
 * disabled. Ignored units only get their protected memory regions disabled,
 * and only when force_on is set.
 *
 * Always returns 0.
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
4077 
4078 static void iommu_flush_all(void)
4079 {
4080 	struct dmar_drhd_unit *drhd;
4081 	struct intel_iommu *iommu;
4082 
4083 	for_each_active_iommu(iommu, drhd) {
4084 		iommu->flush.flush_context(iommu, 0, 0, 0,
4085 					   DMA_CCMD_GLOBAL_INVL);
4086 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4087 					 DMA_TLB_GLOBAL_FLUSH);
4088 	}
4089 }
4090 
/*
 * syscore suspend callback.
 *
 * Allocates a save area (MAX_SR_DMAR_REGS u32 slots) for every active
 * IOMMU, flushes all context caches and IOTLBs, then for each active unit
 * disables translation and saves the four fault-event registers (FECTL,
 * FEDATA, FEADDR, FEUADDR) under the unit's register_lock.
 *
 * Returns 0 on success, or -ENOMEM if any save area cannot be allocated;
 * nothing is flushed or disabled in that case.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
						 GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	/*
	 * NOTE(review): this frees iommu_state on every active unit, including
	 * units whose allocation was not attempted in this pass. That is only
	 * safe if their iommu_state is NULL at this point - confirm.
	 */
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}
4130 
4131 static void iommu_resume(void)
4132 {
4133 	struct dmar_drhd_unit *drhd;
4134 	struct intel_iommu *iommu = NULL;
4135 	unsigned long flag;
4136 
4137 	if (init_iommu_hw()) {
4138 		if (force_on)
4139 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4140 		else
4141 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4142 		return;
4143 	}
4144 
4145 	for_each_active_iommu(iommu, drhd) {
4146 
4147 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4148 
4149 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4150 			iommu->reg + DMAR_FECTL_REG);
4151 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4152 			iommu->reg + DMAR_FEDATA_REG);
4153 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4154 			iommu->reg + DMAR_FEADDR_REG);
4155 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4156 			iommu->reg + DMAR_FEUADDR_REG);
4157 
4158 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4159 	}
4160 
4161 	for_each_active_iommu(iommu, drhd)
4162 		kfree(iommu->iommu_state);
4163 }
4164 
4165 static struct syscore_ops iommu_syscore_ops = {
4166 	.resume		= iommu_resume,
4167 	.suspend	= iommu_suspend,
4168 };
4169 
/* Register iommu_syscore_ops with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
/* Built without CONFIG_SUSPEND: nothing to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
4178 
4179 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4180 {
4181 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4182 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4183 	    rmrr->end_address <= rmrr->base_address ||
4184 	    arch_rmrr_sanity_check(rmrr))
4185 		return -EINVAL;
4186 
4187 	return 0;
4188 }
4189 
4190 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4191 {
4192 	struct acpi_dmar_reserved_memory *rmrr;
4193 	struct dmar_rmrr_unit *rmrru;
4194 
4195 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4196 	if (rmrr_sanity_check(rmrr)) {
4197 		pr_warn(FW_BUG
4198 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4199 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4200 			   rmrr->base_address, rmrr->end_address,
4201 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4202 			   dmi_get_system_info(DMI_BIOS_VERSION),
4203 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4204 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4205 	}
4206 
4207 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4208 	if (!rmrru)
4209 		goto out;
4210 
4211 	rmrru->hdr = header;
4212 
4213 	rmrru->base_address = rmrr->base_address;
4214 	rmrru->end_address = rmrr->end_address;
4215 
4216 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4217 				((void *)rmrr) + rmrr->header.length,
4218 				&rmrru->devices_cnt);
4219 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4220 		goto free_rmrru;
4221 
4222 	list_add(&rmrru->list, &dmar_rmrr_units);
4223 
4224 	return 0;
4225 free_rmrru:
4226 	kfree(rmrru);
4227 out:
4228 	return -ENOMEM;
4229 }
4230 
4231 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4232 {
4233 	struct dmar_atsr_unit *atsru;
4234 	struct acpi_dmar_atsr *tmp;
4235 
4236 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4237 				dmar_rcu_check()) {
4238 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4239 		if (atsr->segment != tmp->segment)
4240 			continue;
4241 		if (atsr->header.length != tmp->header.length)
4242 			continue;
4243 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4244 			return atsru;
4245 	}
4246 
4247 	return NULL;
4248 }
4249 
4250 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4251 {
4252 	struct acpi_dmar_atsr *atsr;
4253 	struct dmar_atsr_unit *atsru;
4254 
4255 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4256 		return 0;
4257 
4258 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4259 	atsru = dmar_find_atsr(atsr);
4260 	if (atsru)
4261 		return 0;
4262 
4263 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4264 	if (!atsru)
4265 		return -ENOMEM;
4266 
4267 	/*
4268 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4269 	 * copy the memory content because the memory buffer will be freed
4270 	 * on return.
4271 	 */
4272 	atsru->hdr = (void *)(atsru + 1);
4273 	memcpy(atsru->hdr, hdr, hdr->length);
4274 	atsru->include_all = atsr->flags & 0x1;
4275 	if (!atsru->include_all) {
4276 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4277 				(void *)atsr + atsr->header.length,
4278 				&atsru->devices_cnt);
4279 		if (atsru->devices_cnt && atsru->devices == NULL) {
4280 			kfree(atsru);
4281 			return -ENOMEM;
4282 		}
4283 	}
4284 
4285 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4286 
4287 	return 0;
4288 }
4289 
/*
 * Release an ATSR unit: free its device scope, then the unit itself. The
 * header copy made by dmar_parse_one_atsr() sits in the same allocation as
 * the unit, so the single kfree() covers it as well.
 */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
4295 
4296 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4297 {
4298 	struct acpi_dmar_atsr *atsr;
4299 	struct dmar_atsr_unit *atsru;
4300 
4301 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4302 	atsru = dmar_find_atsr(atsr);
4303 	if (atsru) {
4304 		list_del_rcu(&atsru->list);
4305 		synchronize_rcu();
4306 		intel_iommu_free_atsr(atsru);
4307 	}
4308 
4309 	return 0;
4310 }
4311 
4312 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4313 {
4314 	int i;
4315 	struct device *dev;
4316 	struct acpi_dmar_atsr *atsr;
4317 	struct dmar_atsr_unit *atsru;
4318 
4319 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4320 	atsru = dmar_find_atsr(atsr);
4321 	if (!atsru)
4322 		return 0;
4323 
4324 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4325 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4326 					  i, dev)
4327 			return -EBUSY;
4328 	}
4329 
4330 	return 0;
4331 }
4332 
/*
 * Bring the IOMMU of a newly added DMAR unit into service.
 *
 * Returns 0 if the IOMMU is already present in g_iommus[], if the unit is
 * marked ignored, or once translation has been enabled. Returns -ENXIO when
 * the hardware lacks a capability the current configuration relies on
 * (pass-through, snooping or the super page size in use). Any other failure
 * returns the error of the setup step that failed, after the IOMMU has been
 * torn down again.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	/* Already registered: nothing to do. */
	if (g_iommus[iommu->seq_id])
		return 0;

	/* Refuse units that cannot match the capabilities already in use. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	/* Install the root entry and flush globally before enabling. */
	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
4411 
4412 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4413 {
4414 	int ret = 0;
4415 	struct intel_iommu *iommu = dmaru->iommu;
4416 
4417 	if (!intel_iommu_enabled)
4418 		return 0;
4419 	if (iommu == NULL)
4420 		return -EINVAL;
4421 
4422 	if (insert) {
4423 		ret = intel_iommu_add(dmaru);
4424 	} else {
4425 		disable_dmar_iommu(iommu);
4426 		free_dmar_iommu(iommu);
4427 	}
4428 
4429 	return ret;
4430 }
4431 
4432 static void intel_iommu_free_dmars(void)
4433 {
4434 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4435 	struct dmar_atsr_unit *atsru, *atsr_n;
4436 
4437 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4438 		list_del(&rmrru->list);
4439 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4440 		kfree(rmrru);
4441 	}
4442 
4443 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4444 		list_del(&atsru->list);
4445 		intel_iommu_free_atsr(atsru);
4446 	}
4447 }
4448 
4449 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4450 {
4451 	int i, ret = 1;
4452 	struct pci_bus *bus;
4453 	struct pci_dev *bridge = NULL;
4454 	struct device *tmp;
4455 	struct acpi_dmar_atsr *atsr;
4456 	struct dmar_atsr_unit *atsru;
4457 
4458 	dev = pci_physfn(dev);
4459 	for (bus = dev->bus; bus; bus = bus->parent) {
4460 		bridge = bus->self;
4461 		/* If it's an integrated device, allow ATS */
4462 		if (!bridge)
4463 			return 1;
4464 		/* Connected via non-PCIe: no ATS */
4465 		if (!pci_is_pcie(bridge) ||
4466 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4467 			return 0;
4468 		/* If we found the root port, look it up in the ATSR */
4469 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4470 			break;
4471 	}
4472 
4473 	rcu_read_lock();
4474 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4475 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4476 		if (atsr->segment != pci_domain_nr(dev->bus))
4477 			continue;
4478 
4479 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4480 			if (tmp == &bridge->dev)
4481 				goto out;
4482 
4483 		if (atsru->include_all)
4484 			goto out;
4485 	}
4486 	ret = 0;
4487 out:
4488 	rcu_read_unlock();
4489 
4490 	return ret;
4491 }
4492 
/*
 * Update the RMRR and ATSR device scopes in response to a PCI bus
 * notification.
 *
 * On BUS_NOTIFY_ADD_DEVICE the device is offered to every RMRR unit and to
 * every ATSR unit that is not "include all"; on BUS_NOTIFY_REMOVED_DEVICE it
 * is removed from them. Other events leave the scopes untouched.
 *
 * Returns 0, or the negative value returned by dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	/* Nothing to track once the system runs with the IOMMU disabled. */
	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		/* "Include all" units carry no device scope to maintain. */
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			/*
			 * A positive result ends the walk - presumably the
			 * device was placed in this unit's scope (confirm
			 * against dmar_insert_dev_scope()).
			 */
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			/* Stop at the first unit that reports a removal. */
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}

	return 0;
}
4543 
4544 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4545 				       unsigned long val, void *v)
4546 {
4547 	struct memory_notify *mhp = v;
4548 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4549 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4550 			mhp->nr_pages - 1);
4551 
4552 	switch (val) {
4553 	case MEM_GOING_ONLINE:
4554 		if (iommu_domain_identity_map(si_domain,
4555 					      start_vpfn, last_vpfn)) {
4556 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4557 				start_vpfn, last_vpfn);
4558 			return NOTIFY_BAD;
4559 		}
4560 		break;
4561 
4562 	case MEM_OFFLINE:
4563 	case MEM_CANCEL_ONLINE:
4564 		{
4565 			struct dmar_drhd_unit *drhd;
4566 			struct intel_iommu *iommu;
4567 			struct page *freelist;
4568 
4569 			freelist = domain_unmap(si_domain,
4570 						start_vpfn, last_vpfn);
4571 
4572 			rcu_read_lock();
4573 			for_each_active_iommu(iommu, drhd)
4574 				iommu_flush_iotlb_psi(iommu, si_domain,
4575 					start_vpfn, mhp->nr_pages,
4576 					!freelist, 0);
4577 			rcu_read_unlock();
4578 			dma_free_pagelist(freelist);
4579 		}
4580 		break;
4581 	}
4582 
4583 	return NOTIFY_OK;
4584 }
4585 
4586 static struct notifier_block intel_iommu_memory_nb = {
4587 	.notifier_call = intel_iommu_memory_notifier,
4588 	.priority = 0
4589 };
4590 
4591 static void free_all_cpu_cached_iovas(unsigned int cpu)
4592 {
4593 	int i;
4594 
4595 	for (i = 0; i < g_num_of_iommus; i++) {
4596 		struct intel_iommu *iommu = g_iommus[i];
4597 		struct dmar_domain *domain;
4598 		int did;
4599 
4600 		if (!iommu)
4601 			continue;
4602 
4603 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4604 			domain = get_iommu_domain(iommu, (u16)did);
4605 
4606 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4607 				continue;
4608 
4609 			free_cpu_cached_iovas(cpu, &domain->iovad);
4610 		}
4611 	}
4612 }
4613 
/*
 * CPU hotplug callback installed for CPUHP_IOMMU_INTEL_DEAD: release the
 * IOVAs cached for the CPU that went away. Always returns 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);
	return 0;
}
4619 
/* Turn DMA translation off on every IOMMU visited by for_each_iommu(). */
static void intel_disable_iommus(void)
{
	struct intel_iommu *iommu = NULL;
	struct dmar_drhd_unit *drhd;

	for_each_iommu(iommu, drhd)
		iommu_disable_translation(iommu);
}
4628 
4629 void intel_iommu_shutdown(void)
4630 {
4631 	struct dmar_drhd_unit *drhd;
4632 	struct intel_iommu *iommu = NULL;
4633 
4634 	if (no_iommu || dmar_disabled)
4635 		return;
4636 
4637 	down_write(&dmar_global_lock);
4638 
4639 	/* Disable PMRs explicitly here. */
4640 	for_each_iommu(iommu, drhd)
4641 		iommu_disable_protect_mem_regions(iommu);
4642 
4643 	/* Make sure the IOMMUs are switched off */
4644 	intel_disable_iommus();
4645 
4646 	up_write(&dmar_global_lock);
4647 }
4648 
4649 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4650 {
4651 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4652 
4653 	return container_of(iommu_dev, struct intel_iommu, iommu);
4654 }
4655 
4656 static ssize_t intel_iommu_show_version(struct device *dev,
4657 					struct device_attribute *attr,
4658 					char *buf)
4659 {
4660 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4661 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4662 	return sprintf(buf, "%d:%d\n",
4663 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4664 }
4665 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4666 
4667 static ssize_t intel_iommu_show_address(struct device *dev,
4668 					struct device_attribute *attr,
4669 					char *buf)
4670 {
4671 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4672 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4673 }
4674 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4675 
4676 static ssize_t intel_iommu_show_cap(struct device *dev,
4677 				    struct device_attribute *attr,
4678 				    char *buf)
4679 {
4680 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4681 	return sprintf(buf, "%llx\n", iommu->cap);
4682 }
4683 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4684 
4685 static ssize_t intel_iommu_show_ecap(struct device *dev,
4686 				    struct device_attribute *attr,
4687 				    char *buf)
4688 {
4689 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690 	return sprintf(buf, "%llx\n", iommu->ecap);
4691 }
4692 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4693 
4694 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4695 				      struct device_attribute *attr,
4696 				      char *buf)
4697 {
4698 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4700 }
4701 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4702 
4703 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4704 					   struct device_attribute *attr,
4705 					   char *buf)
4706 {
4707 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4708 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4709 						  cap_ndoms(iommu->cap)));
4710 }
4711 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4712 
/* NULL-terminated list of the read-only attributes defined by DEVICE_ATTR() in this file. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
4722 
4723 static struct attribute_group intel_iommu_group = {
4724 	.name = "intel-iommu",
4725 	.attrs = intel_iommu_attrs,
4726 };
4727 
/*
 * NULL-terminated group list handed to iommu_device_sysfs_add() for each
 * active IOMMU in intel_iommu_init().
 */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
4732 
4733 static inline bool has_untrusted_dev(void)
4734 {
4735 	struct pci_dev *pdev = NULL;
4736 
4737 	for_each_pci_dev(pdev)
4738 		if (pdev->untrusted)
4739 			return true;
4740 
4741 	return false;
4742 }
4743 
4744 static int __init platform_optin_force_iommu(void)
4745 {
4746 	if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4747 		return 0;
4748 
4749 	if (no_iommu || dmar_disabled)
4750 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4751 
4752 	/*
4753 	 * If Intel-IOMMU is disabled by default, we will apply identity
4754 	 * map for all devices except those marked as being untrusted.
4755 	 */
4756 	if (dmar_disabled)
4757 		iommu_set_default_passthrough(false);
4758 
4759 	dmar_disabled = 0;
4760 	no_iommu = 0;
4761 
4762 	return 1;
4763 }
4764 
/*
 * Probe the physical devices behind ACPI name space devices listed in the
 * device scope of each active IOMMU.
 *
 * Physical nodes that already belong to an IOMMU group are skipped. For the
 * others the bus iommu_ops are pointed at intel_iommu_ops and
 * iommu_probe_device() is called.
 *
 * Returns 0 on success or the first error from iommu_probe_device().
 */
static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct iommu_group *group;
			struct acpi_device *adev;

			/* Only scope entries on the ACPI bus are of interest. */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			/* The physical node list is walked under its lock. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				group = iommu_group_get(pn->dev);
				if (group) {
					/* Already in a group: drop our ref. */
					iommu_group_put(group);
					continue;
				}

				pn->dev->bus->iommu_ops = &intel_iommu_ops;
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			/* Propagate a probe failure once the lock is dropped. */
			if (ret)
				return ret;
		}
	}

	return 0;
}
4807 
/*
 * Boot-time initialization of the Intel IOMMU driver.
 *
 * Sets up the mempool, the DMAR table and the device scopes, registers the
 * bus notifier, runs init_dmars(), registers every active IOMMU with sysfs
 * and the IOMMU core, installs the memory and CPU hotplug hooks and finally
 * enables translation.
 *
 * Returns 0 on success, -ENOMEM if the mempool cannot be set up, and
 * otherwise the value left in ret (-ENODEV unless init_dmars() replaced it)
 * after the RMRR/ATSR lists and the mempool have been released. The latter
 * path is also taken when the IOMMU is disabled (no_iommu/dmar_disabled).
 * With force_on set, the setup failures panic instead of returning.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = tboot_force_iommu() || platform_optin_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	if (dmar_map_gfx)
		intel_iommu_gfx_mapped = 1;

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* Expose each active IOMMU in sysfs and register it with the core. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}
	up_read(&dmar_global_lock);

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	/* The memory notifier is only registered for a software si_domain. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

	/* Error unwinding; both labels are entered with the lock write-held. */
out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
4946 
4947 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4948 {
4949 	struct intel_iommu *iommu = opaque;
4950 
4951 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4952 	return 0;
4953 }
4954 
4955 /*
4956  * NB - intel-iommu lacks any sort of reference counting for the users of
4957  * dependent devices.  If multiple endpoints have intersecting dependent
4958  * devices, unbinding the driver from any one of them will possibly leave
4959  * the others unable to operate.
4960  */
4961 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4962 {
4963 	if (!iommu || !dev || !dev_is_pci(dev))
4964 		return;
4965 
4966 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4967 }
4968 
/*
 * Tear down the per-device domain info @info and detach its domain from the
 * IOMMU. The caller must hold device_domain_lock (asserted below).
 *
 * A NULL @info triggers a warning and is otherwise ignored.
 */
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;
	domain = info->domain;

	if (info->dev) {
		/* In scalable mode also drop the RID2PASID entry. */
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID, false);

		iommu_disable_dev_iotlb(info);
		/* Context entries of real-DMA subdevices are left in place. */
		if (!dev_is_real_dma_subdevice(info->dev))
			domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	/* iommu->lock is taken around the domain/IOMMU detach. */
	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
5002 
/*
 * Locked wrapper around __dmar_remove_one_dev_info(): look up the domain
 * info of @dev under device_domain_lock and remove it if there is one.
 */
static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (info)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
5014 
5015 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5016 {
5017 	int adjust_width;
5018 
5019 	/* calculate AGAW */
5020 	domain->gaw = guest_width;
5021 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5022 	domain->agaw = width_to_agaw(adjust_width);
5023 
5024 	domain->iommu_coherency = 0;
5025 	domain->iommu_snooping = 0;
5026 	domain->iommu_superpage = 0;
5027 	domain->max_addr = 0;
5028 
5029 	/* always allocate the top pgd */
5030 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5031 	if (!domain->pgd)
5032 		return -ENOMEM;
5033 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5034 	return 0;
5035 }
5036 
5037 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5038 {
5039 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5040 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5041 
5042 	if (!intel_iommu_strict &&
5043 	    init_iova_flush_queue(&dmar_domain->iovad,
5044 				  iommu_flush_iova, iova_entry_free))
5045 		pr_info("iova flush queue initialization failed\n");
5046 }
5047 
5048 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5049 {
5050 	struct dmar_domain *dmar_domain;
5051 	struct iommu_domain *domain;
5052 
5053 	switch (type) {
5054 	case IOMMU_DOMAIN_DMA:
5055 	/* fallthrough */
5056 	case IOMMU_DOMAIN_UNMANAGED:
5057 		dmar_domain = alloc_domain(0);
5058 		if (!dmar_domain) {
5059 			pr_err("Can't allocate dmar_domain\n");
5060 			return NULL;
5061 		}
5062 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5063 			pr_err("Domain initialization failed\n");
5064 			domain_exit(dmar_domain);
5065 			return NULL;
5066 		}
5067 
5068 		if (type == IOMMU_DOMAIN_DMA)
5069 			intel_init_iova_domain(dmar_domain);
5070 
5071 		domain_update_iommu_cap(dmar_domain);
5072 
5073 		domain = &dmar_domain->domain;
5074 		domain->geometry.aperture_start = 0;
5075 		domain->geometry.aperture_end   =
5076 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5077 		domain->geometry.force_aperture = true;
5078 
5079 		return domain;
5080 	case IOMMU_DOMAIN_IDENTITY:
5081 		return &si_domain->domain;
5082 	default:
5083 		return NULL;
5084 	}
5085 
5086 	return NULL;
5087 }
5088 
5089 static void intel_iommu_domain_free(struct iommu_domain *domain)
5090 {
5091 	if (domain != &si_domain->domain)
5092 		domain_exit(to_dmar_domain(domain));
5093 }
5094 
5095 /*
5096  * Check whether a @domain could be attached to the @dev through the
5097  * aux-domain attach/detach APIs.
5098  */
5099 static inline bool
5100 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5101 {
5102 	struct device_domain_info *info = get_domain_info(dev);
5103 
5104 	return info && info->auxd_enabled &&
5105 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5106 }
5107 
5108 static void auxiliary_link_device(struct dmar_domain *domain,
5109 				  struct device *dev)
5110 {
5111 	struct device_domain_info *info = get_domain_info(dev);
5112 
5113 	assert_spin_locked(&device_domain_lock);
5114 	if (WARN_ON(!info))
5115 		return;
5116 
5117 	domain->auxd_refcnt++;
5118 	list_add(&domain->auxd, &info->auxiliary_domains);
5119 }
5120 
5121 static void auxiliary_unlink_device(struct dmar_domain *domain,
5122 				    struct device *dev)
5123 {
5124 	struct device_domain_info *info = get_domain_info(dev);
5125 
5126 	assert_spin_locked(&device_domain_lock);
5127 	if (WARN_ON(!info))
5128 		return;
5129 
5130 	list_del(&domain->auxd);
5131 	domain->auxd_refcnt--;
5132 
5133 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5134 		ioasid_free(domain->default_pasid);
5135 }
5136 
/*
 * Attach @domain to @dev as an auxiliary domain.
 *
 * Allocates the domain's default PASID on first use, attaches the domain to
 * the device's IOMMU, programs the PASID entry (first or second level,
 * depending on domain_use_first_level()) and links the domain into the
 * device's auxiliary list.
 *
 * Returns 0 on success, -ENODEV if the device has no IOMMU or no PASID
 * could be allocated, or the error of the failing attach/setup step.
 */
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	u8 bus, devfn;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		int pasid;

		/* No private data needed for the default pasid */
		pasid = ioasid_alloc(NULL, PASID_MIN,
				     pci_max_pasids(to_pci_dev(dev)) - 1,
				     NULL);
		if (pasid == INVALID_IOASID) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	if (domain_use_first_level(domain))
		ret = domain_setup_first_level(iommu, domain, dev,
					       domain->default_pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev,
						     domain->default_pasid);
	if (ret)
		goto table_failed;
	spin_unlock(&iommu->lock);

	auxiliary_link_device(domain, dev);

	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

	/* Unwind in reverse order; both locks are still held on entry. */
table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);
	/* Release the default PASID if no other device uses the domain. */
	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		ioasid_free(domain->default_pasid);

	return ret;
}
5200 
/*
 * Detach the auxiliary @domain from @dev: unlink it from the device's
 * auxiliary list, tear down the default PASID entry and detach the domain
 * from the IOMMU. Does nothing if is_aux_domain() rejects the pair.
 */
static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	iommu = info->iommu;

	auxiliary_unlink_device(domain, dev);

	/* iommu->lock nests inside device_domain_lock here. */
	spin_lock(&iommu->lock);
	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
	domain_detach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
5224 
5225 static int prepare_domain_attach_device(struct iommu_domain *domain,
5226 					struct device *dev)
5227 {
5228 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5229 	struct intel_iommu *iommu;
5230 	int addr_width;
5231 	u8 bus, devfn;
5232 
5233 	iommu = device_to_iommu(dev, &bus, &devfn);
5234 	if (!iommu)
5235 		return -ENODEV;
5236 
5237 	/* check if this iommu agaw is sufficient for max mapped address */
5238 	addr_width = agaw_to_width(iommu->agaw);
5239 	if (addr_width > cap_mgaw(iommu->cap))
5240 		addr_width = cap_mgaw(iommu->cap);
5241 
5242 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5243 		dev_err(dev, "%s: iommu width (%d) is not "
5244 		        "sufficient for the mapped address (%llx)\n",
5245 		        __func__, addr_width, dmar_domain->max_addr);
5246 		return -EFAULT;
5247 	}
5248 	dmar_domain->gaw = addr_width;
5249 
5250 	/*
5251 	 * Knock out extra levels of page tables if necessary
5252 	 */
5253 	while (iommu->agaw < dmar_domain->agaw) {
5254 		struct dma_pte *pte;
5255 
5256 		pte = dmar_domain->pgd;
5257 		if (dma_pte_present(pte)) {
5258 			dmar_domain->pgd = (struct dma_pte *)
5259 				phys_to_virt(dma_pte_addr(pte));
5260 			free_pgtable_page(pte);
5261 		}
5262 		dmar_domain->agaw--;
5263 	}
5264 
5265 	return 0;
5266 }
5267 
5268 static int intel_iommu_attach_device(struct iommu_domain *domain,
5269 				     struct device *dev)
5270 {
5271 	int ret;
5272 
5273 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5274 	    device_is_rmrr_locked(dev)) {
5275 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5276 		return -EPERM;
5277 	}
5278 
5279 	if (is_aux_domain(dev, domain))
5280 		return -EPERM;
5281 
5282 	/* normally dev is not mapped */
5283 	if (unlikely(domain_context_mapped(dev))) {
5284 		struct dmar_domain *old_domain;
5285 
5286 		old_domain = find_domain(dev);
5287 		if (old_domain)
5288 			dmar_remove_one_dev_info(dev);
5289 	}
5290 
5291 	ret = prepare_domain_attach_device(domain, dev);
5292 	if (ret)
5293 		return ret;
5294 
5295 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5296 }
5297 
5298 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5299 					 struct device *dev)
5300 {
5301 	int ret;
5302 
5303 	if (!is_aux_domain(dev, domain))
5304 		return -EPERM;
5305 
5306 	ret = prepare_domain_attach_device(domain, dev);
5307 	if (ret)
5308 		return ret;
5309 
5310 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5311 }
5312 
/*
 * Detach @dev from its primary domain by removing its domain info. The
 * @domain argument is not used.
 */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}
5318 
/* Detach @dev from the auxiliary domain @domain. */
static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
5324 
/*
 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
 * VT-d granularity. Invalidation is typically included in the unmap operation
 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
 * owns the first level page tables. Invalidations of translation caches in the
 * guest are trapped and passed down to the host.
 *
 * vIOMMU in the guest will only expose first level page tables, therefore
 * we do not support IOTLB granularity for requests without PASID (second
 * level).
 *
 * The table is looked up as [cache type][granularity]:
 * X (row):    indexed by iommu cache type
 * Y (column): indexed by enum iommu_inv_granularity
 *
 * For example, the VT-d granularity encoding for the IOTLB cache type with
 * page selective granularity within PASID is the last entry of the first
 * row. An entry of -EINVAL marks a combination that is not supported.
 */

static const int
inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/*
	 * PASID based IOTLB invalidation: PASID selective (per PASID),
	 * page selective (address granularity)
	 */
	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs */
	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
	/* PASID cache */
	{-EINVAL, -EINVAL, -EINVAL}
};
5354 
5355 static inline int to_vtd_granularity(int type, int granu)
5356 {
5357 	return inv_type_granu_table[type][granu];
5358 }
5359 
5360 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5361 {
5362 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5363 
5364 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5365 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
5366 	 * granu size in contiguous memory.
5367 	 */
5368 	return order_base_2(nr_pages);
5369 }
5370 
5371 #ifdef CONFIG_INTEL_IOMMU_SVM
5372 static int
5373 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5374 			   struct iommu_cache_invalidate_info *inv_info)
5375 {
5376 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5377 	struct device_domain_info *info;
5378 	struct intel_iommu *iommu;
5379 	unsigned long flags;
5380 	int cache_type;
5381 	u8 bus, devfn;
5382 	u16 did, sid;
5383 	int ret = 0;
5384 	u64 size = 0;
5385 
5386 	if (!inv_info || !dmar_domain ||
5387 	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5388 		return -EINVAL;
5389 
5390 	if (!dev || !dev_is_pci(dev))
5391 		return -ENODEV;
5392 
5393 	iommu = device_to_iommu(dev, &bus, &devfn);
5394 	if (!iommu)
5395 		return -ENODEV;
5396 
5397 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5398 		return -EINVAL;
5399 
5400 	spin_lock_irqsave(&device_domain_lock, flags);
5401 	spin_lock(&iommu->lock);
5402 	info = get_domain_info(dev);
5403 	if (!info) {
5404 		ret = -EINVAL;
5405 		goto out_unlock;
5406 	}
5407 	did = dmar_domain->iommu_did[iommu->seq_id];
5408 	sid = PCI_DEVID(bus, devfn);
5409 
5410 	/* Size is only valid in address selective invalidation */
5411 	if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
5412 		size = to_vtd_size(inv_info->addr_info.granule_size,
5413 				   inv_info->addr_info.nb_granules);
5414 
5415 	for_each_set_bit(cache_type,
5416 			 (unsigned long *)&inv_info->cache,
5417 			 IOMMU_CACHE_INV_TYPE_NR) {
5418 		int granu = 0;
5419 		u64 pasid = 0;
5420 
5421 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5422 		if (granu == -EINVAL) {
5423 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5424 					   cache_type, inv_info->granularity);
5425 			break;
5426 		}
5427 
5428 		/*
5429 		 * PASID is stored in different locations based on the
5430 		 * granularity.
5431 		 */
5432 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5433 		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5434 			pasid = inv_info->pasid_info.pasid;
5435 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5436 			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5437 			pasid = inv_info->addr_info.pasid;
5438 
5439 		switch (BIT(cache_type)) {
5440 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5441 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5442 			    size &&
5443 			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5444 				pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
5445 						   inv_info->addr_info.addr, size);
5446 				ret = -ERANGE;
5447 				goto out_unlock;
5448 			}
5449 
5450 			/*
5451 			 * If granu is PASID-selective, address is ignored.
5452 			 * We use npages = -1 to indicate that.
5453 			 */
5454 			qi_flush_piotlb(iommu, did, pasid,
5455 					mm_to_dma_pfn(inv_info->addr_info.addr),
5456 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5457 					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5458 
5459 			/*
5460 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5461 			 * in the guest may assume IOTLB flush is inclusive,
5462 			 * which is more efficient.
5463 			 */
5464 			if (info->ats_enabled)
5465 				qi_flush_dev_iotlb_pasid(iommu, sid,
5466 						info->pfsid, pasid,
5467 						info->ats_qdep,
5468 						inv_info->addr_info.addr,
5469 						size, granu);
5470 			break;
5471 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5472 			if (info->ats_enabled)
5473 				qi_flush_dev_iotlb_pasid(iommu, sid,
5474 						info->pfsid, pasid,
5475 						info->ats_qdep,
5476 						inv_info->addr_info.addr,
5477 						size, granu);
5478 			else
5479 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5480 			break;
5481 		default:
5482 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5483 					    cache_type);
5484 			ret = -EINVAL;
5485 		}
5486 	}
5487 out_unlock:
5488 	spin_unlock(&iommu->lock);
5489 	spin_unlock_irqrestore(&device_domain_lock, flags);
5490 
5491 	return ret;
5492 }
5493 #endif
5494 
5495 static int intel_iommu_map(struct iommu_domain *domain,
5496 			   unsigned long iova, phys_addr_t hpa,
5497 			   size_t size, int iommu_prot, gfp_t gfp)
5498 {
5499 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5500 	u64 max_addr;
5501 	int prot = 0;
5502 	int ret;
5503 
5504 	if (iommu_prot & IOMMU_READ)
5505 		prot |= DMA_PTE_READ;
5506 	if (iommu_prot & IOMMU_WRITE)
5507 		prot |= DMA_PTE_WRITE;
5508 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5509 		prot |= DMA_PTE_SNP;
5510 
5511 	max_addr = iova + size;
5512 	if (dmar_domain->max_addr < max_addr) {
5513 		u64 end;
5514 
5515 		/* check if minimum agaw is sufficient for mapped address */
5516 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5517 		if (end < max_addr) {
5518 			pr_err("%s: iommu width (%d) is not "
5519 			       "sufficient for the mapped address (%llx)\n",
5520 			       __func__, dmar_domain->gaw, max_addr);
5521 			return -EFAULT;
5522 		}
5523 		dmar_domain->max_addr = max_addr;
5524 	}
5525 	/* Round up size to next multiple of PAGE_SIZE, if it and
5526 	   the low bits of hpa would take us onto the next page */
5527 	size = aligned_nrpages(hpa, size);
5528 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5529 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5530 	return ret;
5531 }
5532 
5533 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5534 				unsigned long iova, size_t size,
5535 				struct iommu_iotlb_gather *gather)
5536 {
5537 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5538 	struct page *freelist = NULL;
5539 	unsigned long start_pfn, last_pfn;
5540 	unsigned int npages;
5541 	int iommu_id, level = 0;
5542 
5543 	/* Cope with horrid API which requires us to unmap more than the
5544 	   size argument if it happens to be a large-page mapping. */
5545 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5546 
5547 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5548 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5549 
5550 	start_pfn = iova >> VTD_PAGE_SHIFT;
5551 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5552 
5553 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5554 
5555 	npages = last_pfn - start_pfn + 1;
5556 
5557 	for_each_domain_iommu(iommu_id, dmar_domain)
5558 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5559 				      start_pfn, npages, !freelist, 0);
5560 
5561 	dma_free_pagelist(freelist);
5562 
5563 	if (dmar_domain->max_addr == iova + size)
5564 		dmar_domain->max_addr = iova;
5565 
5566 	return size;
5567 }
5568 
5569 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5570 					    dma_addr_t iova)
5571 {
5572 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573 	struct dma_pte *pte;
5574 	int level = 0;
5575 	u64 phys = 0;
5576 
5577 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5578 	if (pte && dma_pte_present(pte))
5579 		phys = dma_pte_addr(pte) +
5580 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5581 						VTD_PAGE_SHIFT) - 1));
5582 
5583 	return phys;
5584 }
5585 
5586 static inline bool scalable_mode_support(void)
5587 {
5588 	struct dmar_drhd_unit *drhd;
5589 	struct intel_iommu *iommu;
5590 	bool ret = true;
5591 
5592 	rcu_read_lock();
5593 	for_each_active_iommu(iommu, drhd) {
5594 		if (!sm_supported(iommu)) {
5595 			ret = false;
5596 			break;
5597 		}
5598 	}
5599 	rcu_read_unlock();
5600 
5601 	return ret;
5602 }
5603 
5604 static inline bool iommu_pasid_support(void)
5605 {
5606 	struct dmar_drhd_unit *drhd;
5607 	struct intel_iommu *iommu;
5608 	bool ret = true;
5609 
5610 	rcu_read_lock();
5611 	for_each_active_iommu(iommu, drhd) {
5612 		if (!pasid_supported(iommu)) {
5613 			ret = false;
5614 			break;
5615 		}
5616 	}
5617 	rcu_read_unlock();
5618 
5619 	return ret;
5620 }
5621 
5622 static inline bool nested_mode_support(void)
5623 {
5624 	struct dmar_drhd_unit *drhd;
5625 	struct intel_iommu *iommu;
5626 	bool ret = true;
5627 
5628 	rcu_read_lock();
5629 	for_each_active_iommu(iommu, drhd) {
5630 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5631 			ret = false;
5632 			break;
5633 		}
5634 	}
5635 	rcu_read_unlock();
5636 
5637 	return ret;
5638 }
5639 
5640 static bool intel_iommu_capable(enum iommu_cap cap)
5641 {
5642 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5643 		return domain_update_iommu_snooping(NULL) == 1;
5644 	if (cap == IOMMU_CAP_INTR_REMAP)
5645 		return irq_remapping_enabled == 1;
5646 
5647 	return false;
5648 }
5649 
5650 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5651 {
5652 	struct intel_iommu *iommu;
5653 	u8 bus, devfn;
5654 
5655 	iommu = device_to_iommu(dev, &bus, &devfn);
5656 	if (!iommu)
5657 		return ERR_PTR(-ENODEV);
5658 
5659 	if (translation_pre_enabled(iommu))
5660 		dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5661 
5662 	return &iommu->iommu;
5663 }
5664 
5665 static void intel_iommu_release_device(struct device *dev)
5666 {
5667 	struct intel_iommu *iommu;
5668 	u8 bus, devfn;
5669 
5670 	iommu = device_to_iommu(dev, &bus, &devfn);
5671 	if (!iommu)
5672 		return;
5673 
5674 	dmar_remove_one_dev_info(dev);
5675 
5676 	set_dma_ops(dev, NULL);
5677 }
5678 
5679 static void intel_iommu_probe_finalize(struct device *dev)
5680 {
5681 	struct iommu_domain *domain;
5682 
5683 	domain = iommu_get_domain_for_dev(dev);
5684 	if (device_needs_bounce(dev))
5685 		set_dma_ops(dev, &bounce_dma_ops);
5686 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5687 		set_dma_ops(dev, &intel_dma_ops);
5688 	else
5689 		set_dma_ops(dev, NULL);
5690 }
5691 
5692 static void intel_iommu_get_resv_regions(struct device *device,
5693 					 struct list_head *head)
5694 {
5695 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5696 	struct iommu_resv_region *reg;
5697 	struct dmar_rmrr_unit *rmrr;
5698 	struct device *i_dev;
5699 	int i;
5700 
5701 	down_read(&dmar_global_lock);
5702 	for_each_rmrr_units(rmrr) {
5703 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5704 					  i, i_dev) {
5705 			struct iommu_resv_region *resv;
5706 			enum iommu_resv_type type;
5707 			size_t length;
5708 
5709 			if (i_dev != device &&
5710 			    !is_downstream_to_pci_bridge(device, i_dev))
5711 				continue;
5712 
5713 			length = rmrr->end_address - rmrr->base_address + 1;
5714 
5715 			type = device_rmrr_is_relaxable(device) ?
5716 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5717 
5718 			resv = iommu_alloc_resv_region(rmrr->base_address,
5719 						       length, prot, type);
5720 			if (!resv)
5721 				break;
5722 
5723 			list_add_tail(&resv->list, head);
5724 		}
5725 	}
5726 	up_read(&dmar_global_lock);
5727 
5728 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5729 	if (dev_is_pci(device)) {
5730 		struct pci_dev *pdev = to_pci_dev(device);
5731 
5732 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5733 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5734 						   IOMMU_RESV_DIRECT_RELAXABLE);
5735 			if (reg)
5736 				list_add_tail(&reg->list, head);
5737 		}
5738 	}
5739 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5740 
5741 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5742 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5743 				      0, IOMMU_RESV_MSI);
5744 	if (!reg)
5745 		return;
5746 	list_add_tail(&reg->list, head);
5747 }
5748 
5749 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5750 {
5751 	struct device_domain_info *info;
5752 	struct context_entry *context;
5753 	struct dmar_domain *domain;
5754 	unsigned long flags;
5755 	u64 ctx_lo;
5756 	int ret;
5757 
5758 	domain = find_domain(dev);
5759 	if (!domain)
5760 		return -EINVAL;
5761 
5762 	spin_lock_irqsave(&device_domain_lock, flags);
5763 	spin_lock(&iommu->lock);
5764 
5765 	ret = -EINVAL;
5766 	info = get_domain_info(dev);
5767 	if (!info || !info->pasid_supported)
5768 		goto out;
5769 
5770 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5771 	if (WARN_ON(!context))
5772 		goto out;
5773 
5774 	ctx_lo = context[0].lo;
5775 
5776 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5777 		ctx_lo |= CONTEXT_PASIDE;
5778 		context[0].lo = ctx_lo;
5779 		wmb();
5780 		iommu->flush.flush_context(iommu,
5781 					   domain->iommu_did[iommu->seq_id],
5782 					   PCI_DEVID(info->bus, info->devfn),
5783 					   DMA_CCMD_MASK_NOBIT,
5784 					   DMA_CCMD_DEVICE_INVL);
5785 	}
5786 
5787 	/* Enable PASID support in the device, if it wasn't already */
5788 	if (!info->pasid_enabled)
5789 		iommu_enable_dev_iotlb(info);
5790 
5791 	ret = 0;
5792 
5793  out:
5794 	spin_unlock(&iommu->lock);
5795 	spin_unlock_irqrestore(&device_domain_lock, flags);
5796 
5797 	return ret;
5798 }
5799 
5800 static void intel_iommu_apply_resv_region(struct device *dev,
5801 					  struct iommu_domain *domain,
5802 					  struct iommu_resv_region *region)
5803 {
5804 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5805 	unsigned long start, end;
5806 
5807 	start = IOVA_PFN(region->start);
5808 	end   = IOVA_PFN(region->start + region->length - 1);
5809 
5810 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5811 }
5812 
/*
 * Return the IOMMU group for @dev: PCI devices use the PCI grouping
 * helper, everything else gets the generic one.
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
5819 
5820 #ifdef CONFIG_INTEL_IOMMU_SVM
5821 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5822 {
5823 	struct intel_iommu *iommu;
5824 	u8 bus, devfn;
5825 
5826 	if (iommu_dummy(dev)) {
5827 		dev_warn(dev,
5828 			 "No IOMMU translation for device; cannot enable SVM\n");
5829 		return NULL;
5830 	}
5831 
5832 	iommu = device_to_iommu(dev, &bus, &devfn);
5833 	if ((!iommu)) {
5834 		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5835 		return NULL;
5836 	}
5837 
5838 	return iommu;
5839 }
5840 #endif /* CONFIG_INTEL_IOMMU_SVM */
5841 
5842 static int intel_iommu_enable_auxd(struct device *dev)
5843 {
5844 	struct device_domain_info *info;
5845 	struct intel_iommu *iommu;
5846 	unsigned long flags;
5847 	u8 bus, devfn;
5848 	int ret;
5849 
5850 	iommu = device_to_iommu(dev, &bus, &devfn);
5851 	if (!iommu || dmar_disabled)
5852 		return -EINVAL;
5853 
5854 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5855 		return -EINVAL;
5856 
5857 	ret = intel_iommu_enable_pasid(iommu, dev);
5858 	if (ret)
5859 		return -ENODEV;
5860 
5861 	spin_lock_irqsave(&device_domain_lock, flags);
5862 	info = get_domain_info(dev);
5863 	info->auxd_enabled = 1;
5864 	spin_unlock_irqrestore(&device_domain_lock, flags);
5865 
5866 	return 0;
5867 }
5868 
5869 static int intel_iommu_disable_auxd(struct device *dev)
5870 {
5871 	struct device_domain_info *info;
5872 	unsigned long flags;
5873 
5874 	spin_lock_irqsave(&device_domain_lock, flags);
5875 	info = get_domain_info(dev);
5876 	if (!WARN_ON(!info))
5877 		info->auxd_enabled = 0;
5878 	spin_unlock_irqrestore(&device_domain_lock, flags);
5879 
5880 	return 0;
5881 }
5882 
5883 /*
5884  * A PCI express designated vendor specific extended capability is defined
5885  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5886  * for system software and tools to detect endpoint devices supporting the
5887  * Intel scalable IO virtualization without host driver dependency.
5888  *
5889  * Returns the address of the matching extended capability structure within
5890  * the device's PCI configuration space or 0 if the device does not support
5891  * it.
5892  */
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5894 {
5895 	int pos;
5896 	u16 vendor, id;
5897 
5898 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5899 	while (pos) {
5900 		pci_read_config_word(pdev, pos + 4, &vendor);
5901 		pci_read_config_word(pdev, pos + 8, &id);
5902 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5903 			return pos;
5904 
5905 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5906 	}
5907 
5908 	return 0;
5909 }
5910 
5911 static bool
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5913 {
5914 	if (feat == IOMMU_DEV_FEAT_AUX) {
5915 		int ret;
5916 
5917 		if (!dev_is_pci(dev) || dmar_disabled ||
5918 		    !scalable_mode_support() || !iommu_pasid_support())
5919 			return false;
5920 
5921 		ret = pci_pasid_features(to_pci_dev(dev));
5922 		if (ret < 0)
5923 			return false;
5924 
5925 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5926 	}
5927 
5928 	if (feat == IOMMU_DEV_FEAT_SVA) {
5929 		struct device_domain_info *info = get_domain_info(dev);
5930 
5931 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5932 			info->pasid_supported && info->pri_supported &&
5933 			info->ats_supported;
5934 	}
5935 
5936 	return false;
5937 }
5938 
5939 static int
5940 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5941 {
5942 	if (feat == IOMMU_DEV_FEAT_AUX)
5943 		return intel_iommu_enable_auxd(dev);
5944 
5945 	if (feat == IOMMU_DEV_FEAT_SVA) {
5946 		struct device_domain_info *info = get_domain_info(dev);
5947 
5948 		if (!info)
5949 			return -EINVAL;
5950 
5951 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5952 			return 0;
5953 	}
5954 
5955 	return -ENODEV;
5956 }
5957 
5958 static int
5959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5960 {
5961 	if (feat == IOMMU_DEV_FEAT_AUX)
5962 		return intel_iommu_disable_auxd(dev);
5963 
5964 	return -ENODEV;
5965 }
5966 
5967 static bool
5968 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5969 {
5970 	struct device_domain_info *info = get_domain_info(dev);
5971 
5972 	if (feat == IOMMU_DEV_FEAT_AUX)
5973 		return scalable_mode_support() && info && info->auxd_enabled;
5974 
5975 	return false;
5976 }
5977 
5978 static int
5979 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5980 {
5981 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5982 
5983 	return dmar_domain->default_pasid > 0 ?
5984 			dmar_domain->default_pasid : -EINVAL;
5985 }
5986 
/*
 * Report whether attaching @dev is deferred. The answer depends on @dev
 * only; @domain is not consulted.
 */
static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return attach_deferred(dev);
}
5992 
5993 static int
5994 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5995 			    enum iommu_attr attr, void *data)
5996 {
5997 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5998 	unsigned long flags;
5999 	int ret = 0;
6000 
6001 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6002 		return -EINVAL;
6003 
6004 	switch (attr) {
6005 	case DOMAIN_ATTR_NESTING:
6006 		spin_lock_irqsave(&device_domain_lock, flags);
6007 		if (nested_mode_support() &&
6008 		    list_empty(&dmar_domain->devices)) {
6009 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6010 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6011 		} else {
6012 			ret = -ENODEV;
6013 		}
6014 		spin_unlock_irqrestore(&device_domain_lock, flags);
6015 		break;
6016 	default:
6017 		ret = -EINVAL;
6018 		break;
6019 	}
6020 
6021 	return ret;
6022 }
6023 
/*
 * IOMMU operations implemented by this driver. The SVA/guest-PASID related
 * callbacks are only provided when CONFIG_INTEL_IOMMU_SVM is enabled.
 */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
#endif
};
6060 
/*
 * PCI header fixup: turn off IOMMU mapping for graphics by clearing
 * dmar_map_gfx. Registered below for the Intel device IDs known to
 * misbehave.
 */
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6101 
/*
 * PCI header fixup: force the write-buffer flush behaviour by setting
 * rwbf_quirk for the chipsets registered below.
 */
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6119 
6120 #define GGC 0x52
6121 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6122 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6123 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6124 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6125 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6126 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6127 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6128 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6129 
6130 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6131 {
6132 	unsigned short ggc;
6133 
6134 	if (pci_read_config_word(dev, GGC, &ggc))
6135 		return;
6136 
6137 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6138 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6139 		dmar_map_gfx = 0;
6140 	} else if (dmar_map_gfx) {
6141 		/* we have to ensure the gfx device is idle before we flush */
6142 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6143 		intel_iommu_strict = 1;
6144        }
6145 }
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6150 
6151 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6152    ISOCH DMAR unit for the Azalia sound device, but not give it any
6153    TLB entries, which causes it to deadlock. Check for that.  We do
6154    this in a function called from init_dmars(), instead of in a PCI
6155    quirk, because we don't want to print the obnoxious "BIOS broken"
6156    message if VT-d is actually disabled.
6157 */
6158 static void __init check_tylersburg_isoch(void)
6159 {
6160 	struct pci_dev *pdev;
6161 	uint32_t vtisochctrl;
6162 
6163 	/* If there's no Azalia in the system anyway, forget it. */
6164 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6165 	if (!pdev)
6166 		return;
6167 	pci_dev_put(pdev);
6168 
6169 	/* System Management Registers. Might be hidden, in which case
6170 	   we can't do the sanity check. But that's OK, because the
6171 	   known-broken BIOSes _don't_ actually hide it, so far. */
6172 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6173 	if (!pdev)
6174 		return;
6175 
6176 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6177 		pci_dev_put(pdev);
6178 		return;
6179 	}
6180 
6181 	pci_dev_put(pdev);
6182 
6183 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6184 	if (vtisochctrl & 1)
6185 		return;
6186 
6187 	/* Drop all bits other than the number of TLB entries */
6188 	vtisochctrl &= 0x1c;
6189 
6190 	/* If we have the recommended number of TLB entries (16), fine. */
6191 	if (vtisochctrl == 0x10)
6192 		return;
6193 
6194 	/* Zero TLB entries? You get to ride the short bus to school. */
6195 	if (!vtisochctrl) {
6196 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6197 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6198 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6199 		     dmi_get_system_info(DMI_BIOS_VERSION),
6200 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6201 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6202 		return;
6203 	}
6204 
6205 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6206 	       vtisochctrl);
6207 }
6208