xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 0760aad038b5a032c31ea124feed63d88627d2f1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers. The argument is parenthesized at every
 * use so that any pointer-valued expression can be passed safely.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/*
 * Largest page frame number / byte address expressible with @gaw address
 * bits. @gaw is parenthesized so that compound expressions expand correctly.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB (every bit from bit 12 up
 * is set in the mask below).
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
105 
/* Page-table level count for @agaw: always two more than the agaw value. */
static inline int agaw_to_level(int agaw)
{
	int level = agaw;

	level += 2;
	return level;
}
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
/* PFN mask that clears the bits below the index field of @level. */
static inline unsigned long level_mask(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return ~0UL << shift;
}
135 
/* Number of PFNs spanned by a single entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return 1UL << shift;
}
140 
/* Round @pfn up to the next boundary of an entry at @level. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long span = level_size(level);
	unsigned long mask = level_mask(level);

	return (pfn + span - 1) & mask;
}
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
/* VT-d page frame number of the first VT-d page inside @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page that backs kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
170 
/*
 * global iommu list, set NULL for ignored DMAR units.
 * Indexed with the same index as dmar_domain::iommu_refcnt (see the users
 * of for_each_domain_iommu() in this file).
 */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set by the "tboot_noforce" option of intel_iommu=. */
int intel_iommu_tboot_noforce;
/* Set by the "off" option of intel_iommu=. */
static int no_platform_optin;

/* Number of root entries that fit into one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
289 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Iterate @idx over every iommu index for which @domain holds a reference
 * (non-zero iommu_refcnt). @domain is parenthesized so that any pointer
 * expression may be passed.
 */
#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if ((domain)->iommu_refcnt[idx])
302 
/* One reserved memory region (RMRR) unit, linked on dmar_rmrr_units. */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address */
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
311 
/* One ATSR unit, linked on dmar_atsr_units. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
319 
/* Global lists of the ATSR and RMRR units defined above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

/* Iterate @rmrr over every entry on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/*
 * Upper bound (exclusive) for iommu indices: used as the loop limit in
 * for_each_domain_iommu() and as the bounds check before indexing g_iommus.
 */
static int g_num_of_iommus;
328 
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334 				     struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 					    dma_addr_t iova);
337 
/*
 * Build-time default; overridden by the "on"/"off" options of intel_iommu=
 * (see intel_iommu_setup()).
 */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

/* Build-time default; set to 1 by the "sm_on" option of intel_iommu=. */
#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;		/* cleared by "igfx_off" */
static int dmar_forcedac;		/* set by "forcedac" */
static int intel_iommu_strict;		/* set by "strict" */
static int intel_iommu_superpage = 1;	/* cleared by "sp_off" */
static int iommu_identity_mapping;
static int intel_no_bounce;		/* set by "nobounce" */
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

/*
 * Sentinel values stored in a device's iommu private pointer. They are not
 * real device_domain_info objects; get_domain_info() maps both to NULL.
 */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
369 struct device_domain_info *get_domain_info(struct device *dev)
370 {
371 	struct device_domain_info *info;
372 
373 	if (!dev)
374 		return NULL;
375 
376 	info = dev_iommu_priv_get(dev);
377 	if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
378 		     info == DEFER_DEVICE_DOMAIN_INFO))
379 		return NULL;
380 
381 	return info;
382 }
383 
/* device_domain_lock is held while device_domain_list is walked below. */
DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/*
 * True for an untrusted PCI device unless bounce buffering was disabled
 * with the "nobounce" option (intel_no_bounce).
 */
#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)
389 
390 /*
391  * Iterate over elements in device_domain_list and call the specified
392  * callback @fn against each element.
393  */
394 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
395 				     void *data), void *data)
396 {
397 	int ret = 0;
398 	unsigned long flags;
399 	struct device_domain_info *info;
400 
401 	spin_lock_irqsave(&device_domain_lock, flags);
402 	list_for_each_entry(info, &device_domain_list, global) {
403 		ret = fn(info, data);
404 		if (ret) {
405 			spin_unlock_irqrestore(&device_domain_lock, flags);
406 			return ret;
407 		}
408 	}
409 	spin_unlock_irqrestore(&device_domain_lock, flags);
410 
411 	return 0;
412 }
413 
414 const struct iommu_ops intel_iommu_ops;
415 
416 static bool translation_pre_enabled(struct intel_iommu *iommu)
417 {
418 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
419 }
420 
421 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
422 {
423 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
424 }
425 
426 static void init_translation_status(struct intel_iommu *iommu)
427 {
428 	u32 gsts;
429 
430 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
431 	if (gsts & DMA_GSTS_TES)
432 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
433 }
434 
435 static int __init intel_iommu_setup(char *str)
436 {
437 	if (!str)
438 		return -EINVAL;
439 	while (*str) {
440 		if (!strncmp(str, "on", 2)) {
441 			dmar_disabled = 0;
442 			pr_info("IOMMU enabled\n");
443 		} else if (!strncmp(str, "off", 3)) {
444 			dmar_disabled = 1;
445 			no_platform_optin = 1;
446 			pr_info("IOMMU disabled\n");
447 		} else if (!strncmp(str, "igfx_off", 8)) {
448 			dmar_map_gfx = 0;
449 			pr_info("Disable GFX device mapping\n");
450 		} else if (!strncmp(str, "forcedac", 8)) {
451 			pr_info("Forcing DAC for PCI devices\n");
452 			dmar_forcedac = 1;
453 		} else if (!strncmp(str, "strict", 6)) {
454 			pr_info("Disable batched IOTLB flush\n");
455 			intel_iommu_strict = 1;
456 		} else if (!strncmp(str, "sp_off", 6)) {
457 			pr_info("Disable supported super page\n");
458 			intel_iommu_superpage = 0;
459 		} else if (!strncmp(str, "sm_on", 5)) {
460 			pr_info("Intel-IOMMU: scalable mode supported\n");
461 			intel_iommu_sm = 1;
462 		} else if (!strncmp(str, "tboot_noforce", 13)) {
463 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
464 			intel_iommu_tboot_noforce = 1;
465 		} else if (!strncmp(str, "nobounce", 8)) {
466 			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
467 			intel_no_bounce = 1;
468 		}
469 
470 		str += strcspn(str, ",");
471 		while (*str == ',')
472 			str++;
473 	}
474 	return 0;
475 }
476 __setup("intel_iommu=", intel_iommu_setup);
477 
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480 
481 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 {
483 	struct dmar_domain **domains;
484 	int idx = did >> 8;
485 
486 	domains = iommu->domains[idx];
487 	if (!domains)
488 		return NULL;
489 
490 	return domains[did & 0xff];
491 }
492 
493 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494 			     struct dmar_domain *domain)
495 {
496 	struct dmar_domain **domains;
497 	int idx = did >> 8;
498 
499 	if (!iommu->domains[idx]) {
500 		size_t size = 256 * sizeof(struct dmar_domain *);
501 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 	}
503 
504 	domains = iommu->domains[idx];
505 	if (WARN_ON(!domains))
506 		return;
507 	else
508 		domains[did & 0xff] = domain;
509 }
510 
511 void *alloc_pgtable_page(int node)
512 {
513 	struct page *page;
514 	void *vaddr = NULL;
515 
516 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 	if (page)
518 		vaddr = page_address(page);
519 	return vaddr;
520 }
521 
/* Release a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
526 
527 static inline void *alloc_domain_mem(void)
528 {
529 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 }
531 
532 static void free_domain_mem(void *vaddr)
533 {
534 	kmem_cache_free(iommu_domain_cache, vaddr);
535 }
536 
537 static inline void * alloc_devinfo_mem(void)
538 {
539 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 }
541 
542 static inline void free_devinfo_mem(void *vaddr)
543 {
544 	kmem_cache_free(iommu_devinfo_cache, vaddr);
545 }
546 
547 static inline int domain_type_is_si(struct dmar_domain *domain)
548 {
549 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 }
551 
552 static inline bool domain_use_first_level(struct dmar_domain *domain)
553 {
554 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 }
556 
557 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 				       unsigned long pfn)
559 {
560 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561 
562 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 }
564 
565 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566 {
567 	unsigned long sagaw;
568 	int agaw = -1;
569 
570 	sagaw = cap_sagaw(iommu->cap);
571 	for (agaw = width_to_agaw(max_gaw);
572 	     agaw >= 0; agaw--) {
573 		if (test_bit(agaw, &sagaw))
574 			break;
575 	}
576 
577 	return agaw;
578 }
579 
580 /*
581  * Calculate max SAGAW for each iommu.
582  */
583 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584 {
585 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586 }
587 
588 /*
589  * calculate agaw for each iommu.
590  * "SAGAW" may be different across iommus, use a default agaw, and
591  * get a supported less agaw for iommus that don't support the default agaw.
592  */
593 int iommu_calculate_agaw(struct intel_iommu *iommu)
594 {
595 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 }
597 
598 /* This functionin only returns single iommu in a domain */
599 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600 {
601 	int iommu_id;
602 
603 	/* si_domain and vm domain should not get here. */
604 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 		return NULL;
606 
607 	for_each_domain_iommu(iommu_id, domain)
608 		break;
609 
610 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 		return NULL;
612 
613 	return g_iommus[iommu_id];
614 }
615 
616 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617 {
618 	return sm_supported(iommu) ?
619 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 }
621 
622 static void domain_update_iommu_coherency(struct dmar_domain *domain)
623 {
624 	struct dmar_drhd_unit *drhd;
625 	struct intel_iommu *iommu;
626 	bool found = false;
627 	int i;
628 
629 	domain->iommu_coherency = 1;
630 
631 	for_each_domain_iommu(i, domain) {
632 		found = true;
633 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
634 			domain->iommu_coherency = 0;
635 			break;
636 		}
637 	}
638 	if (found)
639 		return;
640 
641 	/* No hardware attached; use lowest common denominator */
642 	rcu_read_lock();
643 	for_each_active_iommu(iommu, drhd) {
644 		if (!iommu_paging_structure_coherency(iommu)) {
645 			domain->iommu_coherency = 0;
646 			break;
647 		}
648 	}
649 	rcu_read_unlock();
650 }
651 
652 static int domain_update_iommu_snooping(struct intel_iommu *skip)
653 {
654 	struct dmar_drhd_unit *drhd;
655 	struct intel_iommu *iommu;
656 	int ret = 1;
657 
658 	rcu_read_lock();
659 	for_each_active_iommu(iommu, drhd) {
660 		if (iommu != skip) {
661 			if (!ecap_sc_support(iommu->ecap)) {
662 				ret = 0;
663 				break;
664 			}
665 		}
666 	}
667 	rcu_read_unlock();
668 
669 	return ret;
670 }
671 
672 static int domain_update_iommu_superpage(struct dmar_domain *domain,
673 					 struct intel_iommu *skip)
674 {
675 	struct dmar_drhd_unit *drhd;
676 	struct intel_iommu *iommu;
677 	int mask = 0x3;
678 
679 	if (!intel_iommu_superpage) {
680 		return 0;
681 	}
682 
683 	/* set iommu_superpage to the smallest common denominator */
684 	rcu_read_lock();
685 	for_each_active_iommu(iommu, drhd) {
686 		if (iommu != skip) {
687 			if (domain && domain_use_first_level(domain)) {
688 				if (!cap_fl1gp_support(iommu->cap))
689 					mask = 0x1;
690 			} else {
691 				mask &= cap_super_page_val(iommu->cap);
692 			}
693 
694 			if (!mask)
695 				break;
696 		}
697 	}
698 	rcu_read_unlock();
699 
700 	return fls(mask);
701 }
702 
703 /* Some capabilities may be different across iommus */
704 static void domain_update_iommu_cap(struct dmar_domain *domain)
705 {
706 	domain_update_iommu_coherency(domain);
707 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
708 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
709 }
710 
711 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
712 					 u8 devfn, int alloc)
713 {
714 	struct root_entry *root = &iommu->root_entry[bus];
715 	struct context_entry *context;
716 	u64 *entry;
717 
718 	entry = &root->lo;
719 	if (sm_supported(iommu)) {
720 		if (devfn >= 0x80) {
721 			devfn -= 0x80;
722 			entry = &root->hi;
723 		}
724 		devfn *= 2;
725 	}
726 	if (*entry & 1)
727 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
728 	else {
729 		unsigned long phy_addr;
730 		if (!alloc)
731 			return NULL;
732 
733 		context = alloc_pgtable_page(iommu->node);
734 		if (!context)
735 			return NULL;
736 
737 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
738 		phy_addr = virt_to_phys((void *)context);
739 		*entry = phy_addr | 1;
740 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
741 	}
742 	return &context[devfn];
743 }
744 
745 static int iommu_dummy(struct device *dev)
746 {
747 	return dev_iommu_priv_get(dev) == DUMMY_DEVICE_DOMAIN_INFO;
748 }
749 
750 static bool attach_deferred(struct device *dev)
751 {
752 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
753 }
754 
755 /**
756  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
757  *				 sub-hierarchy of a candidate PCI-PCI bridge
758  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
759  * @bridge: the candidate PCI-PCI bridge
760  *
761  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
762  */
763 static bool
764 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
765 {
766 	struct pci_dev *pdev, *pbridge;
767 
768 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
769 		return false;
770 
771 	pdev = to_pci_dev(dev);
772 	pbridge = to_pci_dev(bridge);
773 
774 	if (pbridge->subordinate &&
775 	    pbridge->subordinate->number <= pdev->bus->number &&
776 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
777 		return true;
778 
779 	return false;
780 }
781 
/*
 * Find the iommu responsible for @dev and, optionally, the bus/devfn to use
 * for it.
 *
 * @dev:   device to look up; NULL or a device marked with the DUMMY sentinel
 *         yields NULL.
 * @bus:   if both @bus and @devfn are non-NULL, receives the bus number.
 * @devfn: if both @bus and @devfn are non-NULL, receives the devfn.
 *
 * The active iommus are walked under rcu_read_lock(). A match is found when
 * the device (or, for a PCI VF, its PF) appears in a unit's device scope,
 * when it sits below a bridge listed in the scope, or when the unit is
 * flagged include_all and the device is PCI. For scope matches of non-VF
 * devices the bus/devfn come from the scope table entry; in every other
 * matching case they come from the PCI device itself.
 *
 * Returns the matching iommu, or NULL when none matches.
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev || iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		/* PCI devices can only match units in their own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		/* Also entered by goto from the scope loop above. */
		got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ran to completion: nothing matched. */
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}
848 
849 static void domain_flush_cache(struct dmar_domain *domain,
850 			       void *addr, int size)
851 {
852 	if (!domain->iommu_coherency)
853 		clflush_cache_range(addr, size);
854 }
855 
856 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
857 {
858 	struct context_entry *context;
859 	int ret = 0;
860 	unsigned long flags;
861 
862 	spin_lock_irqsave(&iommu->lock, flags);
863 	context = iommu_context_addr(iommu, bus, devfn, 0);
864 	if (context)
865 		ret = context_present(context);
866 	spin_unlock_irqrestore(&iommu->lock, flags);
867 	return ret;
868 }
869 
870 static void free_context_table(struct intel_iommu *iommu)
871 {
872 	int i;
873 	unsigned long flags;
874 	struct context_entry *context;
875 
876 	spin_lock_irqsave(&iommu->lock, flags);
877 	if (!iommu->root_entry) {
878 		goto out;
879 	}
880 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
881 		context = iommu_context_addr(iommu, i, 0, 0);
882 		if (context)
883 			free_pgtable_page(context);
884 
885 		if (!sm_supported(iommu))
886 			continue;
887 
888 		context = iommu_context_addr(iommu, i, 0x80, 0);
889 		if (context)
890 			free_pgtable_page(context);
891 
892 	}
893 	free_pgtable_page(iommu->root_entry);
894 	iommu->root_entry = NULL;
895 out:
896 	spin_unlock_irqrestore(&iommu->lock, flags);
897 }
898 
/*
 * Walk the page table of @domain from the top level down and return the PTE
 * that covers @pfn.
 *
 * @target_level is in/out:
 *  - non-zero on entry: descend to exactly that level, allocating any
 *    missing intermediate tables on the way;
 *  - zero on entry: stop at the first PTE that is a superpage or not
 *    present (nothing is allocated), and store the level reached in
 *    *target_level.
 *
 * Returns NULL when @pfn is outside the domain's address width or when a
 * page-table page cannot be allocated.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup-only mode: stop at the first leaf or hole. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			/* Flush the new table before it becomes reachable. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
			/* Install only if the slot is still empty (value 0). */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	/* Report back the level at which a lookup-only walk stopped. */
	if (!*target_level)
		*target_level = level;

	return pte;
}
954 
955 /* return address's pte at specific level */
956 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
957 					 unsigned long pfn,
958 					 int level, int *large_page)
959 {
960 	struct dma_pte *parent, *pte;
961 	int total = agaw_to_level(domain->agaw);
962 	int offset;
963 
964 	parent = domain->pgd;
965 	while (level <= total) {
966 		offset = pfn_level_offset(pfn, total);
967 		pte = &parent[offset];
968 		if (level == total)
969 			return pte;
970 
971 		if (!dma_pte_present(pte)) {
972 			*large_page = total;
973 			break;
974 		}
975 
976 		if (dma_pte_superpage(pte)) {
977 			*large_page = total;
978 			return pte;
979 		}
980 
981 		parent = phys_to_virt(dma_pte_addr(pte));
982 		total--;
983 	}
984 	return NULL;
985 }
986 
/*
 * Clear the leaf PTEs covering [@start_pfn, @last_pfn] in @domain.
 * The caller is expected to follow up with a TLB flush.
 *
 * Both bounds must be within the domain's address width and ordered;
 * violations hit BUG_ON(). Page-table pages themselves are not freed here.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/*
			 * Nothing mapped here: skip to the next boundary one
			 * level above where the walk found the hole.
			 */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* Clear consecutive entries until the range or the table page ends. */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* Flush exactly the span of entries just cleared. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	/* NOTE(review): the start_pfn test presumably stops the loop if start_pfn wrapped to 0. */
	} while (start_pfn && start_pfn <= last_pfn);
}
1018 
/*
 * Recursively free page-table pages below @retain_level that are entirely
 * covered by [@start_pfn, @last_pfn].
 *
 * @level:        level of the table that @pte points into.
 * @retain_level: tables referenced from entries at this level or above are
 *                kept.
 * @pte:          first entry of the table at @level.
 * @pfn:          first pfn covered by that table.
 *
 * Entries that are not present or that are superpages are skipped.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	/* Start at whichever is later: the table's base or the range start. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		/* Descend first so lower tables are handled before this one. */
		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1057 
1058 /*
1059  * clear last level (leaf) ptes and free page table pages below the
1060  * level we wish to keep intact.
1061  */
1062 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1063 				   unsigned long start_pfn,
1064 				   unsigned long last_pfn,
1065 				   int retain_level)
1066 {
1067 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1068 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1069 	BUG_ON(start_pfn > last_pfn);
1070 
1071 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1072 
1073 	/* We don't need lock here; nobody else touches the iova range */
1074 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1075 			   domain->pgd, 0, start_pfn, last_pfn);
1076 
1077 	/* free pgd */
1078 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1079 		free_pgtable_page(domain->pgd);
1080 		domain->pgd = NULL;
1081 	}
1082 }
1083 
1084 /* When a page at a given level is being unlinked from its parent, we don't
1085    need to *modify* it at all. All we need to do is make a list of all the
1086    pages which can be freed just as soon as we've flushed the IOTLB and we
1087    know the hardware page-walk will no longer touch them.
1088    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1089    be freed. */
1090 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1091 					    int level, struct dma_pte *pte,
1092 					    struct page *freelist)
1093 {
1094 	struct page *pg;
1095 
1096 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1097 	pg->freelist = freelist;
1098 	freelist = pg;
1099 
1100 	if (level == 1)
1101 		return freelist;
1102 
1103 	pte = page_address(pg);
1104 	do {
1105 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1106 			freelist = dma_pte_list_pagetables(domain, level - 1,
1107 							   pte, freelist);
1108 		pte++;
1109 	} while (!first_pte_in_page(pte));
1110 
1111 	return freelist;
1112 }
1113 
/*
 * Clear the entries of one page-table level that fall inside
 * [start_pfn, last_pfn], recursing into lower levels for entries that are
 * only partially covered.
 *
 * @pte:      first entry of the table being processed
 * @pfn:      pfn corresponding to the start of that table
 * @freelist: list of page-table pages collected so far
 *
 * Entries whose whole span lies inside the range are cleared and the
 * tables they pointed to are added to the freelist (not freed here).
 * The cleared entries of this table are flushed with a single
 * domain_flush_cache() call covering first_pte..last_pte.
 *
 * Returns the updated freelist head.
 */
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	/* Start at the first entry of this table that intersects the range */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		/* First pfn covered by this entry */
		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			/* Remember the span of cleared entries for the flush below */
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* One flush for the whole run of entries cleared in this table */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
1162 
1163 /* We can't just free the pages because the IOMMU may still be walking
1164    the page tables, and may have cached the intermediate levels. The
1165    pages can only be freed after the IOTLB flush has been done. */
1166 static struct page *domain_unmap(struct dmar_domain *domain,
1167 				 unsigned long start_pfn,
1168 				 unsigned long last_pfn)
1169 {
1170 	struct page *freelist;
1171 
1172 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 	BUG_ON(start_pfn > last_pfn);
1175 
1176 	/* we don't need lock here; nobody else touches the iova range */
1177 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178 				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1179 
1180 	/* free pgd */
1181 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182 		struct page *pgd_page = virt_to_page(domain->pgd);
1183 		pgd_page->freelist = freelist;
1184 		freelist = pgd_page;
1185 
1186 		domain->pgd = NULL;
1187 	}
1188 
1189 	return freelist;
1190 }
1191 
1192 static void dma_free_pagelist(struct page *freelist)
1193 {
1194 	struct page *pg;
1195 
1196 	while ((pg = freelist)) {
1197 		freelist = pg->freelist;
1198 		free_pgtable_page(page_address(pg));
1199 	}
1200 }
1201 
/*
 * Free a page list passed as an opaque unsigned long; 'data' is
 * interpreted as the struct page * heading the list.
 */
static void iova_entry_free(unsigned long data)
{
	dma_free_pagelist((struct page *)data);
}
1208 
1209 /* iommu handling */
1210 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1211 {
1212 	struct root_entry *root;
1213 	unsigned long flags;
1214 
1215 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1216 	if (!root) {
1217 		pr_err("Allocating root entry for %s failed\n",
1218 			iommu->name);
1219 		return -ENOMEM;
1220 	}
1221 
1222 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1223 
1224 	spin_lock_irqsave(&iommu->lock, flags);
1225 	iommu->root_entry = root;
1226 	spin_unlock_irqrestore(&iommu->lock, flags);
1227 
1228 	return 0;
1229 }
1230 
/*
 * Program the root table address into the hardware.
 *
 * The physical address of iommu->root_entry (tagged with DMA_RTADDR_SMT
 * when sm_supported() is true) is written to DMAR_RTADDR_REG, then the
 * "set root table pointer" command is issued through DMAR_GCMD_REG and
 * the function polls DMAR_GSTS_REG until DMA_GSTS_RTPS is set.
 * Everything runs under iommu->register_lock with interrupts disabled.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	/* SRTP is a one-shot command: it is OR-ed in, not stored in gcmd */
	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1252 
/*
 * Issue a write-buffer flush command and wait for it to finish.
 *
 * Nothing is done unless rwbf_quirk is set or the capability register
 * reports cap_rwbf(). Otherwise DMA_GCMD_WBF is written to
 * DMAR_GCMD_REG and DMAR_GSTS_REG is polled until DMA_GSTS_WBFS reads
 * back clear, all under iommu->register_lock.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* WBF is a one-shot command: it is OR-ed in, not stored in gcmd */
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1270 
/*
 * Invalidate the context cache through the register interface.
 *
 * @did:           domain id (used for domain and device granularity)
 * @source_id:     source id (used for device granularity)
 * @function_mask: function mask (used for device granularity)
 * @type:          DMA_CCMD_GLOBAL_INVL, DMA_CCMD_DOMAIN_INVL or
 *                 DMA_CCMD_DEVICE_INVL; any other value hits BUG()
 *
 * The command is written to DMAR_CCMD_REG with DMA_CCMD_ICC set and the
 * register is polled until ICC reads back clear. Nothing is returned.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1304 
/*
 * Invalidate the IOTLB through the register interface.
 *
 * @did:        domain id (used for DSI and PSI flushes)
 * @addr:       address for a PSI flush; the IH bit is passed in as part
 *              of it
 * @size_order: address-mask order for a PSI flush, OR-ed into the value
 *              written to the IVA register
 * @type:       DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH or
 *              DMA_TLB_PSI_FLUSH; any other value hits BUG()
 *
 * The command is written to the IOTLB register (tlb_offset + 8) with
 * DMA_TLB_IVT set and polled until IVT reads back clear. The granularity
 * the hardware reports back is then compared with the requested one and
 * a message is logged on mismatch. Nothing is returned.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1361 
1362 static struct device_domain_info *
1363 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1364 			 u8 bus, u8 devfn)
1365 {
1366 	struct device_domain_info *info;
1367 
1368 	assert_spin_locked(&device_domain_lock);
1369 
1370 	if (!iommu->qi)
1371 		return NULL;
1372 
1373 	list_for_each_entry(info, &domain->devices, link)
1374 		if (info->iommu == iommu && info->bus == bus &&
1375 		    info->devfn == devfn) {
1376 			if (info->ats_supported && info->dev)
1377 				return info;
1378 			break;
1379 		}
1380 
1381 	return NULL;
1382 }
1383 
1384 static void domain_update_iotlb(struct dmar_domain *domain)
1385 {
1386 	struct device_domain_info *info;
1387 	bool has_iotlb_device = false;
1388 
1389 	assert_spin_locked(&device_domain_lock);
1390 
1391 	list_for_each_entry(info, &domain->devices, link) {
1392 		struct pci_dev *pdev;
1393 
1394 		if (!info->dev || !dev_is_pci(info->dev))
1395 			continue;
1396 
1397 		pdev = to_pci_dev(info->dev);
1398 		if (pdev->ats_enabled) {
1399 			has_iotlb_device = true;
1400 			break;
1401 		}
1402 	}
1403 
1404 	domain->has_iotlb_device = has_iotlb_device;
1405 }
1406 
/*
 * Enable the device-side translation features for the PCI device
 * described by @info: PASID and PRI (only with CONFIG_INTEL_IOMMU_SVM)
 * and then ATS. The matching *_enabled flags in @info are set for each
 * feature that was successfully switched on, and info->pfsid is
 * initialised on the way.
 *
 * A NULL @info or a non-PCI device is silently ignored.
 * Caller must hold device_domain_lock.
 */
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	/*
	 * PRI is enabled only if it is supported, the PASID-in-response
	 * check passes (skipped when PASID is not enabled), and both the
	 * PRI reset and the PRI enable succeed.
	 */
	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	/* ATS last; on success refresh the domain's has_iotlb_device flag */
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
1453 
/*
 * Undo iommu_enable_dev_iotlb(): disable ATS, and with
 * CONFIG_INTEL_IOMMU_SVM also PRI and PASID, for each feature whose
 * *_enabled flag is set in @info, clearing the flag afterwards.
 *
 * Non-PCI devices are ignored. Unlike the enable path, @info is
 * dereferenced unconditionally and so must not be NULL.
 * Caller must hold device_domain_lock.
 */
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		/* Refresh the domain's has_iotlb_device flag */
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}
1481 
1482 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1483 				  u64 addr, unsigned mask)
1484 {
1485 	u16 sid, qdep;
1486 	unsigned long flags;
1487 	struct device_domain_info *info;
1488 
1489 	if (!domain->has_iotlb_device)
1490 		return;
1491 
1492 	spin_lock_irqsave(&device_domain_lock, flags);
1493 	list_for_each_entry(info, &domain->devices, link) {
1494 		if (!info->ats_enabled)
1495 			continue;
1496 
1497 		sid = info->bus << 8 | info->devfn;
1498 		qdep = info->ats_qdep;
1499 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1500 				qdep, addr, mask);
1501 	}
1502 	spin_unlock_irqrestore(&device_domain_lock, flags);
1503 }
1504 
1505 static void domain_flush_piotlb(struct intel_iommu *iommu,
1506 				struct dmar_domain *domain,
1507 				u64 addr, unsigned long npages, bool ih)
1508 {
1509 	u16 did = domain->iommu_did[iommu->seq_id];
1510 
1511 	if (domain->default_pasid)
1512 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1513 				addr, npages, ih);
1514 
1515 	if (!list_empty(&domain->devices))
1516 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1517 }
1518 
1519 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1520 				  struct dmar_domain *domain,
1521 				  unsigned long pfn, unsigned int pages,
1522 				  int ih, int map)
1523 {
1524 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1525 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1526 	u16 did = domain->iommu_did[iommu->seq_id];
1527 
1528 	BUG_ON(pages == 0);
1529 
1530 	if (ih)
1531 		ih = 1 << 6;
1532 
1533 	if (domain_use_first_level(domain)) {
1534 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1535 	} else {
1536 		/*
1537 		 * Fallback to domain selective flush if no PSI support or
1538 		 * the size is too big. PSI requires page size to be 2 ^ x,
1539 		 * and the base address is naturally aligned to the size.
1540 		 */
1541 		if (!cap_pgsel_inv(iommu->cap) ||
1542 		    mask > cap_max_amask_val(iommu->cap))
1543 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1544 							DMA_TLB_DSI_FLUSH);
1545 		else
1546 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1547 							DMA_TLB_PSI_FLUSH);
1548 	}
1549 
1550 	/*
1551 	 * In caching mode, changes of pages from non-present to present require
1552 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1553 	 */
1554 	if (!cap_caching_mode(iommu->cap) || !map)
1555 		iommu_flush_dev_iotlb(domain, addr, mask);
1556 }
1557 
1558 /* Notification for newly created mappings */
1559 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1560 					struct dmar_domain *domain,
1561 					unsigned long pfn, unsigned int pages)
1562 {
1563 	/*
1564 	 * It's a non-present to present mapping. Only flush if caching mode
1565 	 * and second level.
1566 	 */
1567 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1568 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1569 	else
1570 		iommu_flush_write_buffer(iommu);
1571 }
1572 
1573 static void iommu_flush_iova(struct iova_domain *iovad)
1574 {
1575 	struct dmar_domain *domain;
1576 	int idx;
1577 
1578 	domain = container_of(iovad, struct dmar_domain, iovad);
1579 
1580 	for_each_domain_iommu(idx, domain) {
1581 		struct intel_iommu *iommu = g_iommus[idx];
1582 		u16 did = domain->iommu_did[iommu->seq_id];
1583 
1584 		if (domain_use_first_level(domain))
1585 			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1586 		else
1587 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1588 						 DMA_TLB_DSI_FLUSH);
1589 
1590 		if (!cap_caching_mode(iommu->cap))
1591 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1592 					      0, MAX_AGAW_PFN_WIDTH);
1593 	}
1594 }
1595 
/*
 * Turn off the protected memory regions of @iommu.
 *
 * Does nothing when the capability register reports neither a protected
 * low nor a protected high memory region. Otherwise DMA_PMEN_EPM is
 * cleared in DMAR_PMEN_REG and the register is polled until the status
 * bit DMA_PMEN_PRS reads back clear, all under iommu->register_lock.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* Read-modify-write so the other bits of the register are kept */
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1615 
/*
 * Enable DMA translation on @iommu: set DMA_GCMD_TE in the cached
 * command value iommu->gcmd, write it to DMAR_GCMD_REG and poll
 * DMAR_GSTS_REG until DMA_GSTS_TES is set. Runs under
 * iommu->register_lock.
 */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* TE is a persistent bit, so it is recorded in iommu->gcmd */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1631 
/*
 * Disable DMA translation on @iommu: clear DMA_GCMD_TE in iommu->gcmd,
 * write it to DMAR_GCMD_REG and poll DMAR_GSTS_REG until DMA_GSTS_TES
 * reads back clear. Runs under iommu->register_lock.
 *
 * The request is skipped entirely when iommu_skip_te_disable is set,
 * the unit is dedicated to graphics (drhd->gfx_dedicated) and the
 * hardware reports read or write draining.
 * NOTE(review): the reason for this exception is not visible in this
 * part of the file - presumably a hardware quirk; confirm where
 * iommu_skip_te_disable is set.
 */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1651 
1652 static int iommu_init_domains(struct intel_iommu *iommu)
1653 {
1654 	u32 ndomains, nlongs;
1655 	size_t size;
1656 
1657 	ndomains = cap_ndoms(iommu->cap);
1658 	pr_debug("%s: Number of Domains supported <%d>\n",
1659 		 iommu->name, ndomains);
1660 	nlongs = BITS_TO_LONGS(ndomains);
1661 
1662 	spin_lock_init(&iommu->lock);
1663 
1664 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1665 	if (!iommu->domain_ids) {
1666 		pr_err("%s: Allocating domain id array failed\n",
1667 		       iommu->name);
1668 		return -ENOMEM;
1669 	}
1670 
1671 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1672 	iommu->domains = kzalloc(size, GFP_KERNEL);
1673 
1674 	if (iommu->domains) {
1675 		size = 256 * sizeof(struct dmar_domain *);
1676 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1677 	}
1678 
1679 	if (!iommu->domains || !iommu->domains[0]) {
1680 		pr_err("%s: Allocating domain array failed\n",
1681 		       iommu->name);
1682 		kfree(iommu->domain_ids);
1683 		kfree(iommu->domains);
1684 		iommu->domain_ids = NULL;
1685 		iommu->domains    = NULL;
1686 		return -ENOMEM;
1687 	}
1688 
1689 	/*
1690 	 * If Caching mode is set, then invalid translations are tagged
1691 	 * with domain-id 0, hence we need to pre-allocate it. We also
1692 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1693 	 * make sure it is not used for a real domain.
1694 	 */
1695 	set_bit(0, iommu->domain_ids);
1696 
1697 	/*
1698 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1699 	 * entry for first-level or pass-through translation modes should
1700 	 * be programmed with a domain id different from those used for
1701 	 * second-level or nested translation. We reserve a domain id for
1702 	 * this purpose.
1703 	 */
1704 	if (sm_supported(iommu))
1705 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1706 
1707 	return 0;
1708 }
1709 
1710 static void disable_dmar_iommu(struct intel_iommu *iommu)
1711 {
1712 	struct device_domain_info *info, *tmp;
1713 	unsigned long flags;
1714 
1715 	if (!iommu->domains || !iommu->domain_ids)
1716 		return;
1717 
1718 	spin_lock_irqsave(&device_domain_lock, flags);
1719 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1720 		if (info->iommu != iommu)
1721 			continue;
1722 
1723 		if (!info->dev || !info->domain)
1724 			continue;
1725 
1726 		__dmar_remove_one_dev_info(info);
1727 	}
1728 	spin_unlock_irqrestore(&device_domain_lock, flags);
1729 
1730 	if (iommu->gcmd & DMA_GCMD_TE)
1731 		iommu_disable_translation(iommu);
1732 }
1733 
1734 static void free_dmar_iommu(struct intel_iommu *iommu)
1735 {
1736 	if ((iommu->domains) && (iommu->domain_ids)) {
1737 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1738 		int i;
1739 
1740 		for (i = 0; i < elems; i++)
1741 			kfree(iommu->domains[i]);
1742 		kfree(iommu->domains);
1743 		kfree(iommu->domain_ids);
1744 		iommu->domains = NULL;
1745 		iommu->domain_ids = NULL;
1746 	}
1747 
1748 	g_iommus[iommu->seq_id] = NULL;
1749 
1750 	/* free context mapping */
1751 	free_context_table(iommu);
1752 
1753 #ifdef CONFIG_INTEL_IOMMU_SVM
1754 	if (pasid_supported(iommu)) {
1755 		if (ecap_prs(iommu->ecap))
1756 			intel_svm_finish_prq(iommu);
1757 	}
1758 	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1759 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1760 
1761 #endif
1762 }
1763 
1764 /*
1765  * Check and return whether first level is used by default for
1766  * DMA translation.
1767  */
1768 static bool first_level_by_default(void)
1769 {
1770 	struct dmar_drhd_unit *drhd;
1771 	struct intel_iommu *iommu;
1772 	static int first_level_support = -1;
1773 
1774 	if (likely(first_level_support != -1))
1775 		return first_level_support;
1776 
1777 	first_level_support = 1;
1778 
1779 	rcu_read_lock();
1780 	for_each_active_iommu(iommu, drhd) {
1781 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1782 			first_level_support = 0;
1783 			break;
1784 		}
1785 	}
1786 	rcu_read_unlock();
1787 
1788 	return first_level_support;
1789 }
1790 
1791 static struct dmar_domain *alloc_domain(int flags)
1792 {
1793 	struct dmar_domain *domain;
1794 
1795 	domain = alloc_domain_mem();
1796 	if (!domain)
1797 		return NULL;
1798 
1799 	memset(domain, 0, sizeof(*domain));
1800 	domain->nid = NUMA_NO_NODE;
1801 	domain->flags = flags;
1802 	if (first_level_by_default())
1803 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1804 	domain->has_iotlb_device = false;
1805 	INIT_LIST_HEAD(&domain->devices);
1806 
1807 	return domain;
1808 }
1809 
1810 /* Must be called with iommu->lock */
1811 static int domain_attach_iommu(struct dmar_domain *domain,
1812 			       struct intel_iommu *iommu)
1813 {
1814 	unsigned long ndomains;
1815 	int num;
1816 
1817 	assert_spin_locked(&device_domain_lock);
1818 	assert_spin_locked(&iommu->lock);
1819 
1820 	domain->iommu_refcnt[iommu->seq_id] += 1;
1821 	domain->iommu_count += 1;
1822 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1823 		ndomains = cap_ndoms(iommu->cap);
1824 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1825 
1826 		if (num >= ndomains) {
1827 			pr_err("%s: No free domain ids\n", iommu->name);
1828 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1829 			domain->iommu_count -= 1;
1830 			return -ENOSPC;
1831 		}
1832 
1833 		set_bit(num, iommu->domain_ids);
1834 		set_iommu_domain(iommu, num, domain);
1835 
1836 		domain->iommu_did[iommu->seq_id] = num;
1837 		domain->nid			 = iommu->node;
1838 
1839 		domain_update_iommu_cap(domain);
1840 	}
1841 
1842 	return 0;
1843 }
1844 
1845 static int domain_detach_iommu(struct dmar_domain *domain,
1846 			       struct intel_iommu *iommu)
1847 {
1848 	int num, count;
1849 
1850 	assert_spin_locked(&device_domain_lock);
1851 	assert_spin_locked(&iommu->lock);
1852 
1853 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1854 	count = --domain->iommu_count;
1855 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1856 		num = domain->iommu_did[iommu->seq_id];
1857 		clear_bit(num, iommu->domain_ids);
1858 		set_iommu_domain(iommu, num, NULL);
1859 
1860 		domain_update_iommu_cap(domain);
1861 		domain->iommu_did[iommu->seq_id] = 0;
1862 	}
1863 
1864 	return count;
1865 }
1866 
1867 static struct iova_domain reserved_iova_list;
1868 static struct lock_class_key reserved_rbtree_key;
1869 
1870 static int dmar_init_reserved_ranges(void)
1871 {
1872 	struct pci_dev *pdev = NULL;
1873 	struct iova *iova;
1874 	int i;
1875 
1876 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1877 
1878 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1879 		&reserved_rbtree_key);
1880 
1881 	/* IOAPIC ranges shouldn't be accessed by DMA */
1882 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1883 		IOVA_PFN(IOAPIC_RANGE_END));
1884 	if (!iova) {
1885 		pr_err("Reserve IOAPIC range failed\n");
1886 		return -ENODEV;
1887 	}
1888 
1889 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1890 	for_each_pci_dev(pdev) {
1891 		struct resource *r;
1892 
1893 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1894 			r = &pdev->resource[i];
1895 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1896 				continue;
1897 			iova = reserve_iova(&reserved_iova_list,
1898 					    IOVA_PFN(r->start),
1899 					    IOVA_PFN(r->end));
1900 			if (!iova) {
1901 				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1902 				return -ENODEV;
1903 			}
1904 		}
1905 	}
1906 	return 0;
1907 }
1908 
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int remainder = (gaw - 12) % 9;
	int agaw = gaw;

	if (remainder != 0)
		agaw += 9 - remainder;

	return agaw > 64 ? 64 : agaw;
}
1922 
1923 static void domain_exit(struct dmar_domain *domain)
1924 {
1925 
1926 	/* Remove associated devices and clear attached or cached domains */
1927 	domain_remove_dev_info(domain);
1928 
1929 	/* destroy iovas */
1930 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1931 		put_iova_domain(&domain->iovad);
1932 
1933 	if (domain->pgd) {
1934 		struct page *freelist;
1935 
1936 		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1937 		dma_free_pagelist(freelist);
1938 	}
1939 
1940 	free_domain_mem(domain);
1941 }
1942 
1943 /*
1944  * Get the PASID directory size for scalable mode context entry.
1945  * Value of X in the PDTS field of a scalable mode context entry
1946  * indicates PASID directory with 2^(X + 7) entries.
1947  */
1948 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1949 {
1950 	int pds, max_pde;
1951 
1952 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1953 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1954 	if (pds < 7)
1955 		return 0;
1956 
1957 	return pds - 7;
1958 }
1959 
1960 /*
1961  * Set the RID_PASID field of a scalable mode context entry. The
1962  * IOMMU hardware will use the PASID value set in this field for
1963  * DMA translations of DMA requests without PASID.
1964  */
1965 static inline void
1966 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1967 {
1968 	context->hi |= pasid & ((1 << 20) - 1);
1969 }
1970 
1971 /*
1972  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1973  * entry.
1974  */
1975 static inline void context_set_sm_dte(struct context_entry *context)
1976 {
1977 	context->lo |= (1 << 2);
1978 }
1979 
1980 /*
1981  * Set the PRE(Page Request Enable) field of a scalable mode context
1982  * entry.
1983  */
1984 static inline void context_set_sm_pre(struct context_entry *context)
1985 {
1986 	context->lo |= (1 << 4);
1987 }
1988 
/*
 * Convert value to context PASID directory size field coding: the low
 * three bits of 'pds' are placed at bit position 9.
 */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
1991 
/*
 * Install the context entry that binds (bus, devfn) on @iommu to
 * @domain.
 *
 * @table: PASID table of the device; used only when the IOMMU is in
 *         scalable mode
 *
 * If the context entry is already present nothing is changed. Otherwise
 * the entry is rebuilt from scratch: in scalable mode it points at the
 * PASID directory, in legacy mode it carries the domain id, address
 * width and either the page-table root or pass-through. After the entry
 * is made present the required cache/write-buffer flushes are issued
 * and the device-side IOTLB features are enabled.
 *
 * Takes device_domain_lock and then iommu->lock.
 *
 * Returns 0 on success (including "already present"), or -ENOMEM when
 * the context table cannot be obtained or the domain's page table is
 * not deep enough for the level skipping done in legacy mode.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* An entry that is already present is left untouched */
	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		/* NOTE(review): table is dereferenced below even if this warns */
		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	/* Push the entry out to memory if the IOMMU does not snoop CPU caches */
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	/* info may still be NULL here; iommu_enable_dev_iotlb() copes with that */
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
2140 
/*
 * Argument bundle handed to domain_context_mapping_cb() through the opaque
 * pointer of pci_for_each_dma_alias(). The callback forwards all three
 * members unchanged to domain_context_mapping_one().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain whose context entry is set up */
	struct intel_iommu *iommu;	/* IOMMU unit the context entry belongs to */
	struct pasid_table *table;	/* PASID table obtained for the device */
};
2146 
2147 static int domain_context_mapping_cb(struct pci_dev *pdev,
2148 				     u16 alias, void *opaque)
2149 {
2150 	struct domain_context_mapping_data *data = opaque;
2151 
2152 	return domain_context_mapping_one(data->domain, data->iommu,
2153 					  data->table, PCI_BUS_NUM(alias),
2154 					  alias & 0xff);
2155 }
2156 
2157 static int
2158 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2159 {
2160 	struct domain_context_mapping_data data;
2161 	struct pasid_table *table;
2162 	struct intel_iommu *iommu;
2163 	u8 bus, devfn;
2164 
2165 	iommu = device_to_iommu(dev, &bus, &devfn);
2166 	if (!iommu)
2167 		return -ENODEV;
2168 
2169 	table = intel_pasid_get_table(dev);
2170 
2171 	if (!dev_is_pci(dev))
2172 		return domain_context_mapping_one(domain, iommu, table,
2173 						  bus, devfn);
2174 
2175 	data.domain = domain;
2176 	data.iommu = iommu;
2177 	data.table = table;
2178 
2179 	return pci_for_each_dma_alias(to_pci_dev(dev),
2180 				      &domain_context_mapping_cb, &data);
2181 }
2182 
2183 static int domain_context_mapped_cb(struct pci_dev *pdev,
2184 				    u16 alias, void *opaque)
2185 {
2186 	struct intel_iommu *iommu = opaque;
2187 
2188 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2189 }
2190 
2191 static int domain_context_mapped(struct device *dev)
2192 {
2193 	struct intel_iommu *iommu;
2194 	u8 bus, devfn;
2195 
2196 	iommu = device_to_iommu(dev, &bus, &devfn);
2197 	if (!iommu)
2198 		return -ENODEV;
2199 
2200 	if (!dev_is_pci(dev))
2201 		return device_context_mapped(iommu, bus, devfn);
2202 
2203 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2204 				       domain_context_mapped_cb, iommu);
2205 }
2206 
2207 /* Returns a number of VTD pages, but aligned to MM page size */
2208 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2209 					    size_t size)
2210 {
2211 	host_addr &= ~PAGE_MASK;
2212 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2213 }
2214 
2215 /* Return largest possible superpage level for a given mapping */
2216 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2217 					  unsigned long iov_pfn,
2218 					  unsigned long phy_pfn,
2219 					  unsigned long pages)
2220 {
2221 	int support, level = 1;
2222 	unsigned long pfnmerge;
2223 
2224 	support = domain->iommu_superpage;
2225 
2226 	/* To use a large page, the virtual *and* physical addresses
2227 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2228 	   of them will mean we have to use smaller pages. So just
2229 	   merge them and check both at once. */
2230 	pfnmerge = iov_pfn | phy_pfn;
2231 
2232 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2233 		pages >>= VTD_STRIDE_SHIFT;
2234 		if (!pages)
2235 			break;
2236 		pfnmerge >>= VTD_STRIDE_SHIFT;
2237 		level++;
2238 		support--;
2239 	}
2240 	return level;
2241 }
2242 
/*
 * Write the page-table entries that map @nr_pages VT-d pages starting at
 * IOVA pfn @iov_pfn in @domain.
 *
 * With @sg == NULL the range is backed by physically contiguous memory
 * starting at @phys_pfn. With a scatterlist, the incoming @phys_pfn is not
 * used: the backing address is taken from each scatterlist entry in turn,
 * and the entry's dma_address/dma_length are filled in as it is consumed.
 *
 * Superpage entries are used whenever hardware_largepage_caps() reports a
 * level above 1 for the current position.
 *
 * Returns 0 on success, -EINVAL when @prot grants neither read nor write,
 * and -ENOMEM when pfn_to_dma_pte() cannot supply a PTE. Entries written
 * before an -ENOMEM are not undone here.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t pteval;
	unsigned long sg_res = 0;	/* pages left in the current sg entry / range */
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;	/* VT-d pages covered by one PTE at largepage_lvl */
	u64 attr;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	/* Only the read/write/snoop bits of @prot end up in the PTE. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	if (domain_use_first_level(domain))
		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;

	if (!sg) {
		/* Contiguous case: treat the whole range as one "sg entry". */
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			/* Start consuming the next scatterlist entry. */
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | attr;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			/* (Re)locate the PTE slot and decide on the page size. */
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is a large page */
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/*
			 * The slot was already populated: the old value stays
			 * in place, the problem is reported, and the loop
			 * carries on. The mapping dump is limited to the
			 * first five occurrences.
			 */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		/* Advance all cursors by the span of the PTE just written. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		/* Current entry exhausted but more to map: move to the next one. */
		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
2360 
2361 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2362 			  struct scatterlist *sg, unsigned long phys_pfn,
2363 			  unsigned long nr_pages, int prot)
2364 {
2365 	int iommu_id, ret;
2366 	struct intel_iommu *iommu;
2367 
2368 	/* Do the real mapping first */
2369 	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2370 	if (ret)
2371 		return ret;
2372 
2373 	for_each_domain_iommu(iommu_id, domain) {
2374 		iommu = g_iommus[iommu_id];
2375 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2376 	}
2377 
2378 	return 0;
2379 }
2380 
/*
 * Scatterlist flavour of domain_mapping(): the physical pfn argument is
 * passed as 0 and the backing pages come from @sg.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	const unsigned long no_phys_pfn = 0;

	return domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages, prot);
}
2387 
2388 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2389 				     unsigned long phys_pfn, unsigned long nr_pages,
2390 				     int prot)
2391 {
2392 	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2393 }
2394 
2395 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2396 {
2397 	unsigned long flags;
2398 	struct context_entry *context;
2399 	u16 did_old;
2400 
2401 	if (!iommu)
2402 		return;
2403 
2404 	spin_lock_irqsave(&iommu->lock, flags);
2405 	context = iommu_context_addr(iommu, bus, devfn, 0);
2406 	if (!context) {
2407 		spin_unlock_irqrestore(&iommu->lock, flags);
2408 		return;
2409 	}
2410 	did_old = context_domain_id(context);
2411 	context_clear_entry(context);
2412 	__iommu_flush_cache(iommu, context, sizeof(*context));
2413 	spin_unlock_irqrestore(&iommu->lock, flags);
2414 	iommu->flush.flush_context(iommu,
2415 				   did_old,
2416 				   (((u16)bus) << 8) | devfn,
2417 				   DMA_CCMD_MASK_NOBIT,
2418 				   DMA_CCMD_DEVICE_INVL);
2419 	iommu->flush.flush_iotlb(iommu,
2420 				 did_old,
2421 				 0,
2422 				 0,
2423 				 DMA_TLB_DSI_FLUSH);
2424 }
2425 
2426 static inline void unlink_domain_info(struct device_domain_info *info)
2427 {
2428 	assert_spin_locked(&device_domain_lock);
2429 	list_del(&info->link);
2430 	list_del(&info->global);
2431 	if (info->dev)
2432 		dev_iommu_priv_set(info->dev, NULL);
2433 }
2434 
2435 static void domain_remove_dev_info(struct dmar_domain *domain)
2436 {
2437 	struct device_domain_info *info, *tmp;
2438 	unsigned long flags;
2439 
2440 	spin_lock_irqsave(&device_domain_lock, flags);
2441 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2442 		__dmar_remove_one_dev_info(info);
2443 	spin_unlock_irqrestore(&device_domain_lock, flags);
2444 }
2445 
2446 struct dmar_domain *find_domain(struct device *dev)
2447 {
2448 	struct device_domain_info *info;
2449 
2450 	if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2451 		return NULL;
2452 
2453 	/* No lock here, assumes no domain exit in normal case */
2454 	info = get_domain_info(dev);
2455 	if (likely(info))
2456 		return info->domain;
2457 
2458 	return NULL;
2459 }
2460 
2461 static void do_deferred_attach(struct device *dev)
2462 {
2463 	struct iommu_domain *domain;
2464 
2465 	dev_iommu_priv_set(dev, NULL);
2466 	domain = iommu_get_domain_for_dev(dev);
2467 	if (domain)
2468 		intel_iommu_attach_device(domain, dev);
2469 }
2470 
2471 static inline struct device_domain_info *
2472 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2473 {
2474 	struct device_domain_info *info;
2475 
2476 	list_for_each_entry(info, &device_domain_list, global)
2477 		if (info->segment == segment && info->bus == bus &&
2478 		    info->devfn == devfn)
2479 			return info;
2480 
2481 	return NULL;
2482 }
2483 
2484 static int domain_setup_first_level(struct intel_iommu *iommu,
2485 				    struct dmar_domain *domain,
2486 				    struct device *dev,
2487 				    int pasid)
2488 {
2489 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2490 	struct dma_pte *pgd = domain->pgd;
2491 	int agaw, level;
2492 
2493 	/*
2494 	 * Skip top levels of page tables for iommu which has
2495 	 * less agaw than default. Unnecessary for PT mode.
2496 	 */
2497 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2498 		pgd = phys_to_virt(dma_pte_addr(pgd));
2499 		if (!dma_pte_present(pgd))
2500 			return -ENOMEM;
2501 	}
2502 
2503 	level = agaw_to_level(agaw);
2504 	if (level != 4 && level != 5)
2505 		return -EINVAL;
2506 
2507 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2508 
2509 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2510 					     domain->iommu_did[iommu->seq_id],
2511 					     flags);
2512 }
2513 
2514 static bool dev_is_real_dma_subdevice(struct device *dev)
2515 {
2516 	return dev && dev_is_pci(dev) &&
2517 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2518 }
2519 
/*
 * Create the device_domain_info that binds a device to @domain and set up
 * the translation structures for it.
 *
 * @iommu:  IOMMU unit the device sits behind
 * @bus:    bus number recorded in the info (unless @dev is a real-DMA
 *          subdevice, in which case the PCI device's own numbers are used)
 * @devfn:  device/function number, same caveat as @bus
 * @dev:    the device; may be NULL
 * @domain: domain to attach to
 *
 * Returns @domain on success. If the device (or another info with the same
 * segment/bus/devfn) already has a domain, that existing domain is returned
 * instead and nothing is changed apart from updating the matching info's
 * dev pointer; the caller must then free the domain it passed in. Returns
 * NULL on allocation or setup failure.
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	if (!dev_is_real_dma_subdevice(dev)) {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	} else {
		/* Subdevice: identify it by its own PCI address. */
		struct pci_dev *pdev = to_pci_dev(dev);

		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	}

	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	info->pasid_table = NULL;
	info->auxd_enabled = 0;
	INIT_LIST_HEAD(&info->auxiliary_domains);

	/* Record which of ATS/PASID/PRI both the IOMMU and the device support. */
	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);
				/* Bit 0 marks "supported"; the rest are the feature bits. */
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			/* PRI is only considered when ATS is supported as well. */
			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	if (!found) {
		/* Fall back to a lookup by segment/bus/devfn. */
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
						       info->devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	/* iommu->lock nests inside device_domain_lock here. */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	/* Publish the info: per-domain list, global list, device private data. */
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev_iommu_priv_set(dev, info);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}

		/* Setup the PASID entry for requests without PASID: */
		spin_lock(&iommu->lock);
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, PASID_RID2PASID);
		else if (domain_use_first_level(domain))
			ret = domain_setup_first_level(iommu, domain, dev,
					PASID_RID2PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, PASID_RID2PASID);
		spin_unlock(&iommu->lock);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}
	}

	/* From here on failures are undone via dmar_remove_one_dev_info(). */
	if (dev && domain_context_mapping(domain, dev)) {
		dev_err(dev, "Domain context map failed\n");
		dmar_remove_one_dev_info(dev);
		return NULL;
	}

	return domain;
}
2650 
2651 static int iommu_domain_identity_map(struct dmar_domain *domain,
2652 				     unsigned long first_vpfn,
2653 				     unsigned long last_vpfn)
2654 {
2655 	/*
2656 	 * RMRR range might have overlap with physical memory range,
2657 	 * clear it first
2658 	 */
2659 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2660 
2661 	return __domain_mapping(domain, first_vpfn, NULL,
2662 				first_vpfn, last_vpfn - first_vpfn + 1,
2663 				DMA_PTE_READ|DMA_PTE_WRITE);
2664 }
2665 
/* Forward declaration: si_domain_init() below calls this before its definition is seen. */
static int md_domain_init(struct dmar_domain *domain, int guest_width);
2667 
2668 static int __init si_domain_init(int hw)
2669 {
2670 	struct dmar_rmrr_unit *rmrr;
2671 	struct device *dev;
2672 	int i, nid, ret;
2673 
2674 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2675 	if (!si_domain)
2676 		return -EFAULT;
2677 
2678 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2679 		domain_exit(si_domain);
2680 		return -EFAULT;
2681 	}
2682 
2683 	if (hw)
2684 		return 0;
2685 
2686 	for_each_online_node(nid) {
2687 		unsigned long start_pfn, end_pfn;
2688 		int i;
2689 
2690 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2691 			ret = iommu_domain_identity_map(si_domain,
2692 					mm_to_dma_pfn(start_pfn),
2693 					mm_to_dma_pfn(end_pfn));
2694 			if (ret)
2695 				return ret;
2696 		}
2697 	}
2698 
2699 	/*
2700 	 * Identity map the RMRRs so that devices with RMRRs could also use
2701 	 * the si_domain.
2702 	 */
2703 	for_each_rmrr_units(rmrr) {
2704 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2705 					  i, dev) {
2706 			unsigned long long start = rmrr->base_address;
2707 			unsigned long long end = rmrr->end_address;
2708 
2709 			if (WARN_ON(end < start ||
2710 				    end >> agaw_to_width(si_domain->agaw)))
2711 				continue;
2712 
2713 			ret = iommu_domain_identity_map(si_domain,
2714 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2715 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2716 			if (ret)
2717 				return ret;
2718 		}
2719 	}
2720 
2721 	return 0;
2722 }
2723 
2724 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2725 {
2726 	struct dmar_domain *ndomain;
2727 	struct intel_iommu *iommu;
2728 	u8 bus, devfn;
2729 
2730 	iommu = device_to_iommu(dev, &bus, &devfn);
2731 	if (!iommu)
2732 		return -ENODEV;
2733 
2734 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2735 	if (ndomain != domain)
2736 		return -EBUSY;
2737 
2738 	return 0;
2739 }
2740 
2741 static bool device_has_rmrr(struct device *dev)
2742 {
2743 	struct dmar_rmrr_unit *rmrr;
2744 	struct device *tmp;
2745 	int i;
2746 
2747 	rcu_read_lock();
2748 	for_each_rmrr_units(rmrr) {
2749 		/*
2750 		 * Return TRUE if this RMRR contains the device that
2751 		 * is passed in.
2752 		 */
2753 		for_each_active_dev_scope(rmrr->devices,
2754 					  rmrr->devices_cnt, i, tmp)
2755 			if (tmp == dev ||
2756 			    is_downstream_to_pci_bridge(dev, tmp)) {
2757 				rcu_read_unlock();
2758 				return true;
2759 			}
2760 	}
2761 	rcu_read_unlock();
2762 	return false;
2763 }
2764 
2765 /**
2766  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2767  * is relaxable (ie. is allowed to be not enforced under some conditions)
2768  * @dev: device handle
2769  *
2770  * We assume that PCI USB devices with RMRRs have them largely
2771  * for historical reasons and that the RMRR space is not actively used post
2772  * boot.  This exclusion may change if vendors begin to abuse it.
2773  *
2774  * The same exception is made for graphics devices, with the requirement that
2775  * any use of the RMRR regions will be torn down before assigning the device
2776  * to a guest.
2777  *
2778  * Return: true if the RMRR is relaxable, false otherwise
2779  */
2780 static bool device_rmrr_is_relaxable(struct device *dev)
2781 {
2782 	struct pci_dev *pdev;
2783 
2784 	if (!dev_is_pci(dev))
2785 		return false;
2786 
2787 	pdev = to_pci_dev(dev);
2788 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2789 		return true;
2790 	else
2791 		return false;
2792 }
2793 
2794 /*
2795  * There are a couple cases where we need to restrict the functionality of
2796  * devices associated with RMRRs.  The first is when evaluating a device for
2797  * identity mapping because problems exist when devices are moved in and out
2798  * of domains and their respective RMRR information is lost.  This means that
2799  * a device with associated RMRRs will never be in a "passthrough" domain.
2800  * The second is use of the device through the IOMMU API.  This interface
2801  * expects to have full control of the IOVA space for the device.  We cannot
2802  * satisfy both the requirement that RMRR access is maintained and have an
2803  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2804  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2805  * We therefore prevent devices associated with an RMRR from participating in
2806  * the IOMMU API, which eliminates them from device assignment.
2807  *
2808  * In both cases, devices which have relaxable RMRRs are not concerned by this
2809  * restriction. See device_rmrr_is_relaxable comment.
2810  */
2811 static bool device_is_rmrr_locked(struct device *dev)
2812 {
2813 	if (!device_has_rmrr(dev))
2814 		return false;
2815 
2816 	if (device_rmrr_is_relaxable(dev))
2817 		return false;
2818 
2819 	return true;
2820 }
2821 
2822 /*
2823  * Return the required default domain type for a specific device.
2824  *
2825  * @dev: the device in query
2826  * @startup: true if this is during early boot
2827  *
2828  * Returns:
2829  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2830  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2831  *  - 0: both identity and dynamic domains work for this device
2832  */
2833 static int device_def_domain_type(struct device *dev)
2834 {
2835 	if (dev_is_pci(dev)) {
2836 		struct pci_dev *pdev = to_pci_dev(dev);
2837 
2838 		/*
2839 		 * Prevent any device marked as untrusted from getting
2840 		 * placed into the statically identity mapping domain.
2841 		 */
2842 		if (pdev->untrusted)
2843 			return IOMMU_DOMAIN_DMA;
2844 
2845 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2846 			return IOMMU_DOMAIN_IDENTITY;
2847 
2848 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2849 			return IOMMU_DOMAIN_IDENTITY;
2850 	}
2851 
2852 	return 0;
2853 }
2854 
2855 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2856 {
2857 	/*
2858 	 * Start from the sane iommu hardware state.
2859 	 * If the queued invalidation is already initialized by us
2860 	 * (for example, while enabling interrupt-remapping) then
2861 	 * we got the things already rolling from a sane state.
2862 	 */
2863 	if (!iommu->qi) {
2864 		/*
2865 		 * Clear any previous faults.
2866 		 */
2867 		dmar_fault(-1, iommu);
2868 		/*
2869 		 * Disable queued invalidation if supported and already enabled
2870 		 * before OS handover.
2871 		 */
2872 		dmar_disable_qi(iommu);
2873 	}
2874 
2875 	if (dmar_enable_qi(iommu)) {
2876 		/*
2877 		 * Queued Invalidate not enabled, use Register Based Invalidate
2878 		 */
2879 		iommu->flush.flush_context = __iommu_flush_context;
2880 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2881 		pr_info("%s: Using Register based invalidation\n",
2882 			iommu->name);
2883 	} else {
2884 		iommu->flush.flush_context = qi_flush_context;
2885 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2886 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2887 	}
2888 }
2889 
/*
 * Copy the context table(s) referenced by one root entry of a previous
 * kernel into freshly allocated pages.
 *
 * @iommu:  IOMMU whose tables are being taken over
 * @old_re: root entry to read; a local copy is made before use
 * @tbl:    array that receives the pointers to the new context tables. The
 *          slot used is @bus in the legacy format, or @bus * 2 (lower
 *          table) and @bus * 2 + 1 (upper table) in the extended format.
 * @bus:    bus number the root entry belongs to
 * @ext:    true when the old tables use the extended format, in which
 *          entries are indexed at devfn * 2 and devfns 0x80-0xff come from
 *          the upper context table pointer
 *
 * Every present entry is copied with PASIDs disabled and the "copied"
 * marker set, and its domain ID is reserved in iommu->domain_ids.
 *
 * Returns 0 on success (including "nothing to copy") or -ENOMEM when the
 * old table cannot be mapped or a new page cannot be allocated.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/* idx wraps to 0 at the start of each source table. */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				/* The next table goes into the upper slot. */
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		/* Reserve the old kernel's domain ID so it is not handed out again. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	/* Store the table that was still being filled when the loop ended. */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2991 
2992 static int copy_translation_tables(struct intel_iommu *iommu)
2993 {
2994 	struct context_entry **ctxt_tbls;
2995 	struct root_entry *old_rt;
2996 	phys_addr_t old_rt_phys;
2997 	int ctxt_table_entries;
2998 	unsigned long flags;
2999 	u64 rtaddr_reg;
3000 	int bus, ret;
3001 	bool new_ext, ext;
3002 
3003 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3004 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3005 	new_ext    = !!ecap_ecs(iommu->ecap);
3006 
3007 	/*
3008 	 * The RTT bit can only be changed when translation is disabled,
3009 	 * but disabling translation means to open a window for data
3010 	 * corruption. So bail out and don't copy anything if we would
3011 	 * have to change the bit.
3012 	 */
3013 	if (new_ext != ext)
3014 		return -EINVAL;
3015 
3016 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3017 	if (!old_rt_phys)
3018 		return -EINVAL;
3019 
3020 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3021 	if (!old_rt)
3022 		return -ENOMEM;
3023 
3024 	/* This is too big for the stack - allocate it from slab */
3025 	ctxt_table_entries = ext ? 512 : 256;
3026 	ret = -ENOMEM;
3027 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3028 	if (!ctxt_tbls)
3029 		goto out_unmap;
3030 
3031 	for (bus = 0; bus < 256; bus++) {
3032 		ret = copy_context_table(iommu, &old_rt[bus],
3033 					 ctxt_tbls, bus, ext);
3034 		if (ret) {
3035 			pr_err("%s: Failed to copy context table for bus %d\n",
3036 				iommu->name, bus);
3037 			continue;
3038 		}
3039 	}
3040 
3041 	spin_lock_irqsave(&iommu->lock, flags);
3042 
3043 	/* Context tables are copied, now write them to the root_entry table */
3044 	for (bus = 0; bus < 256; bus++) {
3045 		int idx = ext ? bus * 2 : bus;
3046 		u64 val;
3047 
3048 		if (ctxt_tbls[idx]) {
3049 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3050 			iommu->root_entry[bus].lo = val;
3051 		}
3052 
3053 		if (!ext || !ctxt_tbls[idx + 1])
3054 			continue;
3055 
3056 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3057 		iommu->root_entry[bus].hi = val;
3058 	}
3059 
3060 	spin_unlock_irqrestore(&iommu->lock, flags);
3061 
3062 	kfree(ctxt_tbls);
3063 
3064 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3065 
3066 	ret = 0;
3067 
3068 out_unmap:
3069 	memunmap(old_rt);
3070 
3071 	return ret;
3072 }
3073 
3074 #ifdef CONFIG_INTEL_IOMMU_SVM
3075 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3076 {
3077 	struct intel_iommu *iommu = data;
3078 	ioasid_t ioasid;
3079 
3080 	if (!iommu)
3081 		return INVALID_IOASID;
3082 	/*
3083 	 * VT-d virtual command interface always uses the full 20 bit
3084 	 * PASID range. Host can partition guest PASID range based on
3085 	 * policies but it is out of guest's control.
3086 	 */
3087 	if (min < PASID_MIN || max > intel_pasid_max_id)
3088 		return INVALID_IOASID;
3089 
3090 	if (vcmd_alloc_pasid(iommu, &ioasid))
3091 		return INVALID_IOASID;
3092 
3093 	return ioasid;
3094 }
3095 
3096 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3097 {
3098 	struct intel_iommu *iommu = data;
3099 
3100 	if (!iommu)
3101 		return;
3102 	/*
3103 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3104 	 * We can only free the PASID when all the devices are unbound.
3105 	 */
3106 	if (ioasid_find(NULL, ioasid, NULL)) {
3107 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3108 		return;
3109 	}
3110 	vcmd_free_pasid(iommu, ioasid);
3111 }
3112 
3113 static void register_pasid_allocator(struct intel_iommu *iommu)
3114 {
3115 	/*
3116 	 * If we are running in the host, no need for custom allocator
3117 	 * in that PASIDs are allocated from the host system-wide.
3118 	 */
3119 	if (!cap_caching_mode(iommu->cap))
3120 		return;
3121 
3122 	if (!sm_supported(iommu)) {
3123 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3124 		return;
3125 	}
3126 
3127 	/*
3128 	 * Register a custom PASID allocator if we are running in a guest,
3129 	 * guest PASID must be obtained via virtual command interface.
3130 	 * There can be multiple vIOMMUs in each guest but only one allocator
3131 	 * is active. All vIOMMU allocators will eventually be calling the same
3132 	 * host allocator.
3133 	 */
3134 	if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3135 		return;
3136 
3137 	pr_info("Register custom PASID allocator\n");
3138 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3139 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3140 	iommu->pasid_allocator.pdata = (void *)iommu;
3141 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3142 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3143 		/*
3144 		 * Disable scalable mode on this IOMMU if there
3145 		 * is no custom allocator. Mixing SM capable vIOMMU
3146 		 * and non-SM vIOMMU are not supported.
3147 		 */
3148 		intel_iommu_sm = 0;
3149 	}
3150 }
3151 #endif
3152 
/*
 * init_dmars - boot-time setup of all DMAR units
 *
 * Counts the DRHD units, allocates the global g_iommus array (always sized
 * for DMAR_UNITS_SUPPORTED entries to leave room for hot-added units), and
 * then, for every IOMMU that is not ignored:
 *   - narrows intel_pasid_max_id to what the unit supports,
 *   - initialises queued invalidation, domain bookkeeping and a root entry,
 *   - tries to copy the translation tables if translation was left enabled
 *     (only kept when running as a kdump kernel).
 * Afterwards the root entries are programmed and the caches flushed, the
 * static identity domain is set up, and fault interrupts (plus the page
 * request queue when CONFIG_INTEL_IOMMU_SVM is set) are enabled.
 *
 * Translation itself is not enabled by this function.
 *
 * NOTE(review): the up_write()/down_write() pair below implies the caller
 * holds dmar_global_lock for writing - confirm against the call site.
 *
 * Return: 0 on success, otherwise the error code of the failing step. On
 * failure after g_iommus was allocated, every active IOMMU is disabled and
 * freed and g_iommus is released.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path all other access are read
		 * only
		 */
		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
	}

	/* Preallocate enough resources for IOMMU hot-addition */
	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
		g_num_of_iommus = DMAR_UNITS_SUPPORTED;

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			GFP_KERNEL);
	if (!g_iommus) {
		pr_err("Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	for_each_iommu(iommu, drhd) {
		/* Ignored units only get translation switched off. */
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		g_iommus[iommu->seq_id] = iommu;

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/*
		 * Translation left enabled is only preserved for a kdump
		 * kernel; otherwise turn it off and forget about it.
		 */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Hardware pass-through is only usable if every unit has it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
#ifdef CONFIG_INTEL_IOMMU_SVM
		register_pasid_allocator(iommu);
#endif
		iommu_set_root_entry(iommu);
		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	/* Graphics devices are identity mapped when gfx mapping is off. */
	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   flush the write buffer
	 *   enable the page request queue (SVM only)
	 *   enable the fault interrupt
	 *
	 * Translation is not enabled in this loop.
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Undo the per-unit setup for everything that is still active. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	kfree(g_iommus);

error:
	return ret;
}
3342 
3343 /* This takes a number of _MM_ pages, not VTD pages */
3344 static unsigned long intel_alloc_iova(struct device *dev,
3345 				     struct dmar_domain *domain,
3346 				     unsigned long nrpages, uint64_t dma_mask)
3347 {
3348 	unsigned long iova_pfn;
3349 
3350 	/*
3351 	 * Restrict dma_mask to the width that the iommu can handle.
3352 	 * First-level translation restricts the input-address to a
3353 	 * canonical address (i.e., address bits 63:N have the same
3354 	 * value as address bit [N-1], where N is 48-bits with 4-level
3355 	 * paging and 57-bits with 5-level paging). Hence, skip bit
3356 	 * [N-1].
3357 	 */
3358 	if (domain_use_first_level(domain))
3359 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3360 				 dma_mask);
3361 	else
3362 		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3363 				 dma_mask);
3364 
3365 	/* Ensure we reserve the whole size-aligned region */
3366 	nrpages = __roundup_pow_of_two(nrpages);
3367 
3368 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3369 		/*
3370 		 * First try to allocate an io virtual address in
3371 		 * DMA_BIT_MASK(32) and if that fails then try allocating
3372 		 * from higher range
3373 		 */
3374 		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3375 					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3376 		if (iova_pfn)
3377 			return iova_pfn;
3378 	}
3379 	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3380 				   IOVA_PFN(dma_mask), true);
3381 	if (unlikely(!iova_pfn)) {
3382 		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3383 			     nrpages);
3384 		return 0;
3385 	}
3386 
3387 	return iova_pfn;
3388 }
3389 
3390 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3391 				     size_t size, int dir, u64 dma_mask)
3392 {
3393 	struct dmar_domain *domain;
3394 	phys_addr_t start_paddr;
3395 	unsigned long iova_pfn;
3396 	int prot = 0;
3397 	int ret;
3398 	struct intel_iommu *iommu;
3399 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3400 
3401 	BUG_ON(dir == DMA_NONE);
3402 
3403 	if (unlikely(attach_deferred(dev)))
3404 		do_deferred_attach(dev);
3405 
3406 	domain = find_domain(dev);
3407 	if (!domain)
3408 		return DMA_MAPPING_ERROR;
3409 
3410 	iommu = domain_get_iommu(domain);
3411 	size = aligned_nrpages(paddr, size);
3412 
3413 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3414 	if (!iova_pfn)
3415 		goto error;
3416 
3417 	/*
3418 	 * Check if DMAR supports zero-length reads on write only
3419 	 * mappings..
3420 	 */
3421 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3422 			!cap_zlr(iommu->cap))
3423 		prot |= DMA_PTE_READ;
3424 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3425 		prot |= DMA_PTE_WRITE;
3426 	/*
3427 	 * paddr - (paddr + size) might be partial page, we should map the whole
3428 	 * page.  Note: if two part of one page are separately mapped, we
3429 	 * might have two guest_addr mapping to the same host paddr, but this
3430 	 * is not a big problem
3431 	 */
3432 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3433 				 mm_to_dma_pfn(paddr_pfn), size, prot);
3434 	if (ret)
3435 		goto error;
3436 
3437 	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3438 	start_paddr += paddr & ~PAGE_MASK;
3439 
3440 	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3441 
3442 	return start_paddr;
3443 
3444 error:
3445 	if (iova_pfn)
3446 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3447 	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3448 		size, (unsigned long long)paddr, dir);
3449 	return DMA_MAPPING_ERROR;
3450 }
3451 
3452 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3453 				 unsigned long offset, size_t size,
3454 				 enum dma_data_direction dir,
3455 				 unsigned long attrs)
3456 {
3457 	return __intel_map_single(dev, page_to_phys(page) + offset,
3458 				  size, dir, *dev->dma_mask);
3459 }
3460 
3461 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3462 				     size_t size, enum dma_data_direction dir,
3463 				     unsigned long attrs)
3464 {
3465 	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3466 }
3467 
3468 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3469 {
3470 	struct dmar_domain *domain;
3471 	unsigned long start_pfn, last_pfn;
3472 	unsigned long nrpages;
3473 	unsigned long iova_pfn;
3474 	struct intel_iommu *iommu;
3475 	struct page *freelist;
3476 	struct pci_dev *pdev = NULL;
3477 
3478 	domain = find_domain(dev);
3479 	BUG_ON(!domain);
3480 
3481 	iommu = domain_get_iommu(domain);
3482 
3483 	iova_pfn = IOVA_PFN(dev_addr);
3484 
3485 	nrpages = aligned_nrpages(dev_addr, size);
3486 	start_pfn = mm_to_dma_pfn(iova_pfn);
3487 	last_pfn = start_pfn + nrpages - 1;
3488 
3489 	if (dev_is_pci(dev))
3490 		pdev = to_pci_dev(dev);
3491 
3492 	freelist = domain_unmap(domain, start_pfn, last_pfn);
3493 	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3494 			!has_iova_flush_queue(&domain->iovad)) {
3495 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3496 				      nrpages, !freelist, 0);
3497 		/* free iova */
3498 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3499 		dma_free_pagelist(freelist);
3500 	} else {
3501 		queue_iova(&domain->iovad, iova_pfn, nrpages,
3502 			   (unsigned long)freelist);
3503 		/*
3504 		 * queue up the release of the unmap to save the 1/6th of the
3505 		 * cpu used up by the iotlb flush operation...
3506 		 */
3507 	}
3508 
3509 	trace_unmap_single(dev, dev_addr, size);
3510 }
3511 
/*
 * .unmap_page implementation: thin wrapper around intel_unmap().
 * @dir and @attrs are accepted for the interface but not used.
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     unsigned long attrs)
{
	intel_unmap(dev, dev_addr, size);
}
3518 
/*
 * .unmap_resource implementation: thin wrapper around intel_unmap().
 * @dir and @attrs are accepted for the interface but not used.
 */
static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	intel_unmap(dev, dev_addr, size);
}
3524 
3525 static void *intel_alloc_coherent(struct device *dev, size_t size,
3526 				  dma_addr_t *dma_handle, gfp_t flags,
3527 				  unsigned long attrs)
3528 {
3529 	struct page *page = NULL;
3530 	int order;
3531 
3532 	if (unlikely(attach_deferred(dev)))
3533 		do_deferred_attach(dev);
3534 
3535 	size = PAGE_ALIGN(size);
3536 	order = get_order(size);
3537 
3538 	if (gfpflags_allow_blocking(flags)) {
3539 		unsigned int count = size >> PAGE_SHIFT;
3540 
3541 		page = dma_alloc_from_contiguous(dev, count, order,
3542 						 flags & __GFP_NOWARN);
3543 	}
3544 
3545 	if (!page)
3546 		page = alloc_pages(flags, order);
3547 	if (!page)
3548 		return NULL;
3549 	memset(page_address(page), 0, size);
3550 
3551 	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3552 					 DMA_BIDIRECTIONAL,
3553 					 dev->coherent_dma_mask);
3554 	if (*dma_handle != DMA_MAPPING_ERROR)
3555 		return page_address(page);
3556 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3557 		__free_pages(page, order);
3558 
3559 	return NULL;
3560 }
3561 
3562 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3563 				dma_addr_t dma_handle, unsigned long attrs)
3564 {
3565 	int order;
3566 	struct page *page = virt_to_page(vaddr);
3567 
3568 	size = PAGE_ALIGN(size);
3569 	order = get_order(size);
3570 
3571 	intel_unmap(dev, dma_handle, size);
3572 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3573 		__free_pages(page, order);
3574 }
3575 
3576 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3577 			   int nelems, enum dma_data_direction dir,
3578 			   unsigned long attrs)
3579 {
3580 	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3581 	unsigned long nrpages = 0;
3582 	struct scatterlist *sg;
3583 	int i;
3584 
3585 	for_each_sg(sglist, sg, nelems, i) {
3586 		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3587 	}
3588 
3589 	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3590 
3591 	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3592 }
3593 
3594 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3595 			enum dma_data_direction dir, unsigned long attrs)
3596 {
3597 	int i;
3598 	struct dmar_domain *domain;
3599 	size_t size = 0;
3600 	int prot = 0;
3601 	unsigned long iova_pfn;
3602 	int ret;
3603 	struct scatterlist *sg;
3604 	unsigned long start_vpfn;
3605 	struct intel_iommu *iommu;
3606 
3607 	BUG_ON(dir == DMA_NONE);
3608 
3609 	if (unlikely(attach_deferred(dev)))
3610 		do_deferred_attach(dev);
3611 
3612 	domain = find_domain(dev);
3613 	if (!domain)
3614 		return 0;
3615 
3616 	iommu = domain_get_iommu(domain);
3617 
3618 	for_each_sg(sglist, sg, nelems, i)
3619 		size += aligned_nrpages(sg->offset, sg->length);
3620 
3621 	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3622 				*dev->dma_mask);
3623 	if (!iova_pfn) {
3624 		sglist->dma_length = 0;
3625 		return 0;
3626 	}
3627 
3628 	/*
3629 	 * Check if DMAR supports zero-length reads on write only
3630 	 * mappings..
3631 	 */
3632 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3633 			!cap_zlr(iommu->cap))
3634 		prot |= DMA_PTE_READ;
3635 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3636 		prot |= DMA_PTE_WRITE;
3637 
3638 	start_vpfn = mm_to_dma_pfn(iova_pfn);
3639 
3640 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3641 	if (unlikely(ret)) {
3642 		dma_pte_free_pagetable(domain, start_vpfn,
3643 				       start_vpfn + size - 1,
3644 				       agaw_to_level(domain->agaw) + 1);
3645 		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3646 		return 0;
3647 	}
3648 
3649 	for_each_sg(sglist, sg, nelems, i)
3650 		trace_map_sg(dev, i + 1, nelems, sg);
3651 
3652 	return nelems;
3653 }
3654 
/*
 * .get_required_mask implementation: always reports a 32-bit mask,
 * independent of @dev.
 */
static u64 intel_get_required_mask(struct device *dev)
{
	return DMA_BIT_MASK(32);
}
3659 
3660 static const struct dma_map_ops intel_dma_ops = {
3661 	.alloc = intel_alloc_coherent,
3662 	.free = intel_free_coherent,
3663 	.map_sg = intel_map_sg,
3664 	.unmap_sg = intel_unmap_sg,
3665 	.map_page = intel_map_page,
3666 	.unmap_page = intel_unmap_page,
3667 	.map_resource = intel_map_resource,
3668 	.unmap_resource = intel_unmap_resource,
3669 	.dma_supported = dma_direct_supported,
3670 	.mmap = dma_common_mmap,
3671 	.get_sgtable = dma_common_get_sgtable,
3672 	.get_required_mask = intel_get_required_mask,
3673 };
3674 
3675 static void
3676 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3677 		   enum dma_data_direction dir, enum dma_sync_target target)
3678 {
3679 	struct dmar_domain *domain;
3680 	phys_addr_t tlb_addr;
3681 
3682 	domain = find_domain(dev);
3683 	if (WARN_ON(!domain))
3684 		return;
3685 
3686 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3687 	if (is_swiotlb_buffer(tlb_addr))
3688 		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3689 }
3690 
/*
 * bounce_map_single - map a buffer for DMA, bouncing partial pages
 * @dev: device that will perform the DMA
 * @paddr: physical start address of the buffer
 * @size: length of the buffer in bytes
 * @dir: DMA direction, must not be DMA_NONE
 * @attrs: DMA attributes, passed on to swiotlb
 * @dma_mask: address mask the allocated IOVA has to satisfy
 *
 * When @paddr and @size are both VTD_PAGE_SIZE aligned the buffer is mapped
 * in place. Otherwise a swiotlb slot of the size rounded up to
 * VTD_PAGE_SIZE is mapped instead, with its unused part zeroed.
 *
 * Return: the IOVA of the mapping (iova_pfn << PAGE_SHIFT), or
 * DMA_MAPPING_ERROR on failure.
 */
static dma_addr_t
bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
		  enum dma_data_direction dir, unsigned long attrs,
		  u64 dma_mask)
{
	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long iova_pfn;
	unsigned long nrpages;
	phys_addr_t tlb_addr;
	int prot = 0;
	int ret;

	if (unlikely(attach_deferred(dev)))
		do_deferred_attach(dev);

	domain = find_domain(dev);

	if (WARN_ON(dir == DMA_NONE || !domain))
		return DMA_MAPPING_ERROR;

	iommu = domain_get_iommu(domain);
	if (WARN_ON(!iommu))
		return DMA_MAPPING_ERROR;

	/* Offset 0: the mapped region always starts on a page boundary. */
	nrpages = aligned_nrpages(0, size);
	iova_pfn = intel_alloc_iova(dev, domain,
				    dma_to_mm_pfn(nrpages), dma_mask);
	if (!iova_pfn)
		return DMA_MAPPING_ERROR;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	/*
	 * If both the physical buffer start address and size are
	 * page aligned, we don't need to use a bounce page.
	 */
	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
		tlb_addr = swiotlb_tbl_map_single(dev,
				__phys_to_dma(dev, io_tlb_start),
				paddr, size, aligned_size, dir, attrs);
		if (tlb_addr == DMA_MAPPING_ERROR) {
			goto swiotlb_error;
		} else {
			/* Cleanup the padding area. */
			void *padding_start = phys_to_virt(tlb_addr);
			size_t padding_size = aligned_size;

			/*
			 * Zero only the tail beyond @size when the CPU sync
			 * was not skipped and the device reads the buffer -
			 * presumably swiotlb has then already filled the
			 * first @size bytes (TODO confirm). Otherwise zero
			 * the whole slot.
			 */
			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
			    (dir == DMA_TO_DEVICE ||
			     dir == DMA_BIDIRECTIONAL)) {
				padding_start += size;
				padding_size -= size;
			}

			memset(padding_start, 0, padding_size);
		}
	} else {
		tlb_addr = paddr;
	}

	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
	if (ret)
		goto mapping_error;

	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);

	return (phys_addr_t)iova_pfn << PAGE_SHIFT;

mapping_error:
	/* Only release a swiotlb slot if one was actually used. */
	if (is_swiotlb_buffer(tlb_addr))
		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
					 aligned_size, dir, attrs);
swiotlb_error:
	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
		size, (unsigned long long)paddr, dir);

	return DMA_MAPPING_ERROR;
}
3781 
3782 static void
3783 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3784 		    enum dma_data_direction dir, unsigned long attrs)
3785 {
3786 	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3787 	struct dmar_domain *domain;
3788 	phys_addr_t tlb_addr;
3789 
3790 	domain = find_domain(dev);
3791 	if (WARN_ON(!domain))
3792 		return;
3793 
3794 	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3795 	if (WARN_ON(!tlb_addr))
3796 		return;
3797 
3798 	intel_unmap(dev, dev_addr, size);
3799 	if (is_swiotlb_buffer(tlb_addr))
3800 		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3801 					 aligned_size, dir, attrs);
3802 
3803 	trace_bounce_unmap_single(dev, dev_addr, size);
3804 }
3805 
3806 static dma_addr_t
3807 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3808 		size_t size, enum dma_data_direction dir, unsigned long attrs)
3809 {
3810 	return bounce_map_single(dev, page_to_phys(page) + offset,
3811 				 size, dir, attrs, *dev->dma_mask);
3812 }
3813 
/*
 * .map_resource implementation for bounce_dma_ops: forwards to
 * bounce_map_single() with the device's streaming DMA mask.
 */
static dma_addr_t
bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
		    enum dma_data_direction dir, unsigned long attrs)
{
	return bounce_map_single(dev, phys_addr, size,
				 dir, attrs, *dev->dma_mask);
}
3821 
/*
 * .unmap_page implementation for bounce_dma_ops: forwards to
 * bounce_unmap_single().
 */
static void
bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
		  enum dma_data_direction dir, unsigned long attrs)
{
	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
}
3828 
/*
 * .unmap_resource implementation for bounce_dma_ops: forwards to
 * bounce_unmap_single().
 */
static void
bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
		      enum dma_data_direction dir, unsigned long attrs)
{
	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
}
3835 
3836 static void
3837 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3838 		enum dma_data_direction dir, unsigned long attrs)
3839 {
3840 	struct scatterlist *sg;
3841 	int i;
3842 
3843 	for_each_sg(sglist, sg, nelems, i)
3844 		bounce_unmap_page(dev, sg->dma_address,
3845 				  sg_dma_len(sg), dir, attrs);
3846 }
3847 
3848 static int
3849 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3850 	      enum dma_data_direction dir, unsigned long attrs)
3851 {
3852 	int i;
3853 	struct scatterlist *sg;
3854 
3855 	for_each_sg(sglist, sg, nelems, i) {
3856 		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3857 						  sg->offset, sg->length,
3858 						  dir, attrs);
3859 		if (sg->dma_address == DMA_MAPPING_ERROR)
3860 			goto out_unmap;
3861 		sg_dma_len(sg) = sg->length;
3862 	}
3863 
3864 	for_each_sg(sglist, sg, nelems, i)
3865 		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3866 
3867 	return nelems;
3868 
3869 out_unmap:
3870 	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3871 	return 0;
3872 }
3873 
/*
 * .sync_single_for_cpu implementation: bounce_sync_single() with
 * SYNC_FOR_CPU as the target.
 */
static void
bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
			   size_t size, enum dma_data_direction dir)
{
	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
}
3880 
/*
 * .sync_single_for_device implementation: bounce_sync_single() with
 * SYNC_FOR_DEVICE as the target.
 */
static void
bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
			      size_t size, enum dma_data_direction dir)
{
	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
}
3887 
3888 static void
3889 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3890 		       int nelems, enum dma_data_direction dir)
3891 {
3892 	struct scatterlist *sg;
3893 	int i;
3894 
3895 	for_each_sg(sglist, sg, nelems, i)
3896 		bounce_sync_single(dev, sg_dma_address(sg),
3897 				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3898 }
3899 
3900 static void
3901 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3902 			  int nelems, enum dma_data_direction dir)
3903 {
3904 	struct scatterlist *sg;
3905 	int i;
3906 
3907 	for_each_sg(sglist, sg, nelems, i)
3908 		bounce_sync_single(dev, sg_dma_address(sg),
3909 				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3910 }
3911 
3912 static const struct dma_map_ops bounce_dma_ops = {
3913 	.alloc			= intel_alloc_coherent,
3914 	.free			= intel_free_coherent,
3915 	.map_sg			= bounce_map_sg,
3916 	.unmap_sg		= bounce_unmap_sg,
3917 	.map_page		= bounce_map_page,
3918 	.unmap_page		= bounce_unmap_page,
3919 	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
3920 	.sync_single_for_device	= bounce_sync_single_for_device,
3921 	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
3922 	.sync_sg_for_device	= bounce_sync_sg_for_device,
3923 	.map_resource		= bounce_map_resource,
3924 	.unmap_resource		= bounce_unmap_resource,
3925 	.dma_supported		= dma_direct_supported,
3926 };
3927 
3928 static inline int iommu_domain_cache_init(void)
3929 {
3930 	int ret = 0;
3931 
3932 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3933 					 sizeof(struct dmar_domain),
3934 					 0,
3935 					 SLAB_HWCACHE_ALIGN,
3936 
3937 					 NULL);
3938 	if (!iommu_domain_cache) {
3939 		pr_err("Couldn't create iommu_domain cache\n");
3940 		ret = -ENOMEM;
3941 	}
3942 
3943 	return ret;
3944 }
3945 
3946 static inline int iommu_devinfo_cache_init(void)
3947 {
3948 	int ret = 0;
3949 
3950 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3951 					 sizeof(struct device_domain_info),
3952 					 0,
3953 					 SLAB_HWCACHE_ALIGN,
3954 					 NULL);
3955 	if (!iommu_devinfo_cache) {
3956 		pr_err("Couldn't create devinfo cache\n");
3957 		ret = -ENOMEM;
3958 	}
3959 
3960 	return ret;
3961 }
3962 
3963 static int __init iommu_init_mempool(void)
3964 {
3965 	int ret;
3966 	ret = iova_cache_get();
3967 	if (ret)
3968 		return ret;
3969 
3970 	ret = iommu_domain_cache_init();
3971 	if (ret)
3972 		goto domain_error;
3973 
3974 	ret = iommu_devinfo_cache_init();
3975 	if (!ret)
3976 		return ret;
3977 
3978 	kmem_cache_destroy(iommu_domain_cache);
3979 domain_error:
3980 	iova_cache_put();
3981 
3982 	return -ENOMEM;
3983 }
3984 
/*
 * Counterpart of iommu_init_mempool(): destroy both slab caches and drop
 * the reference on the IOVA cache.
 */
static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	iova_cache_put();
}
3991 
3992 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3993 {
3994 	struct dmar_drhd_unit *drhd;
3995 	u32 vtbar;
3996 	int rc;
3997 
3998 	/* We know that this device on this chipset has its own IOMMU.
3999 	 * If we find it under a different IOMMU, then the BIOS is lying
4000 	 * to us. Hope that the IOMMU for this device is actually
4001 	 * disabled, and it needs no translation...
4002 	 */
4003 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4004 	if (rc) {
4005 		/* "can't" happen */
4006 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4007 		return;
4008 	}
4009 	vtbar &= 0xffff0000;
4010 
4011 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
4012 	drhd = dmar_find_matched_drhd_unit(pdev);
4013 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4014 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4015 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4016 		dev_iommu_priv_set(&pdev->dev, DUMMY_DEVICE_DOMAIN_INFO);
4017 	}
4018 }
4019 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4020 
/*
 * Decide which DRHD units can be left out of DMA remapping.
 *
 * A unit with an explicit device scope is marked ignored when the scope
 * contains no active device. A unit whose scope consists solely of
 * graphics devices is flagged gfx_dedicated; if dmar_map_gfx is clear it is
 * also ignored and its devices get DUMMY_DEVICE_DOMAIN_INFO as IOMMU
 * private data.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			/*
			 * The loop body is a bare break: i is left at the
			 * index of the first active device, or runs up to
			 * devices_cnt when the scope has none.
			 */
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Stop at the first device that is not a PCI gfx device. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		/* The scan stopped early, so a non-gfx device is present. */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (!dmar_map_gfx) {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev_iommu_priv_set(dev, DUMMY_DEVICE_DOMAIN_INFO);
		}
	}
}
4060 
4061 #ifdef CONFIG_SUSPEND
4062 static int init_iommu_hw(void)
4063 {
4064 	struct dmar_drhd_unit *drhd;
4065 	struct intel_iommu *iommu = NULL;
4066 
4067 	for_each_active_iommu(iommu, drhd)
4068 		if (iommu->qi)
4069 			dmar_reenable_qi(iommu);
4070 
4071 	for_each_iommu(iommu, drhd) {
4072 		if (drhd->ignored) {
4073 			/*
4074 			 * we always have to disable PMRs or DMA may fail on
4075 			 * this device
4076 			 */
4077 			if (force_on)
4078 				iommu_disable_protect_mem_regions(iommu);
4079 			continue;
4080 		}
4081 
4082 		iommu_flush_write_buffer(iommu);
4083 
4084 		iommu_set_root_entry(iommu);
4085 
4086 		iommu->flush.flush_context(iommu, 0, 0, 0,
4087 					   DMA_CCMD_GLOBAL_INVL);
4088 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4089 		iommu_enable_translation(iommu);
4090 		iommu_disable_protect_mem_regions(iommu);
4091 	}
4092 
4093 	return 0;
4094 }
4095 
4096 static void iommu_flush_all(void)
4097 {
4098 	struct dmar_drhd_unit *drhd;
4099 	struct intel_iommu *iommu;
4100 
4101 	for_each_active_iommu(iommu, drhd) {
4102 		iommu->flush.flush_context(iommu, 0, 0, 0,
4103 					   DMA_CCMD_GLOBAL_INVL);
4104 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4105 					 DMA_TLB_GLOBAL_FLUSH);
4106 	}
4107 }
4108 
4109 static int iommu_suspend(void)
4110 {
4111 	struct dmar_drhd_unit *drhd;
4112 	struct intel_iommu *iommu = NULL;
4113 	unsigned long flag;
4114 
4115 	for_each_active_iommu(iommu, drhd) {
4116 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4117 						 GFP_ATOMIC);
4118 		if (!iommu->iommu_state)
4119 			goto nomem;
4120 	}
4121 
4122 	iommu_flush_all();
4123 
4124 	for_each_active_iommu(iommu, drhd) {
4125 		iommu_disable_translation(iommu);
4126 
4127 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4128 
4129 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4130 			readl(iommu->reg + DMAR_FECTL_REG);
4131 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4132 			readl(iommu->reg + DMAR_FEDATA_REG);
4133 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4134 			readl(iommu->reg + DMAR_FEADDR_REG);
4135 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4136 			readl(iommu->reg + DMAR_FEUADDR_REG);
4137 
4138 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4139 	}
4140 	return 0;
4141 
4142 nomem:
4143 	for_each_active_iommu(iommu, drhd)
4144 		kfree(iommu->iommu_state);
4145 
4146 	return -ENOMEM;
4147 }
4148 
4149 static void iommu_resume(void)
4150 {
4151 	struct dmar_drhd_unit *drhd;
4152 	struct intel_iommu *iommu = NULL;
4153 	unsigned long flag;
4154 
4155 	if (init_iommu_hw()) {
4156 		if (force_on)
4157 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4158 		else
4159 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4160 		return;
4161 	}
4162 
4163 	for_each_active_iommu(iommu, drhd) {
4164 
4165 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4166 
4167 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4168 			iommu->reg + DMAR_FECTL_REG);
4169 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4170 			iommu->reg + DMAR_FEDATA_REG);
4171 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4172 			iommu->reg + DMAR_FEADDR_REG);
4173 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4174 			iommu->reg + DMAR_FEUADDR_REG);
4175 
4176 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4177 	}
4178 
4179 	for_each_active_iommu(iommu, drhd)
4180 		kfree(iommu->iommu_state);
4181 }
4182 
4183 static struct syscore_ops iommu_syscore_ops = {
4184 	.resume		= iommu_resume,
4185 	.suspend	= iommu_suspend,
4186 };
4187 
/* Register the IOMMU suspend/resume callbacks with the syscore layer. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
4192 
4193 #else
/* Without CONFIG_SUSPEND there are no suspend/resume hooks to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
4196 
4197 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4198 {
4199 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4200 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4201 	    rmrr->end_address <= rmrr->base_address ||
4202 	    arch_rmrr_sanity_check(rmrr))
4203 		return -EINVAL;
4204 
4205 	return 0;
4206 }
4207 
4208 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4209 {
4210 	struct acpi_dmar_reserved_memory *rmrr;
4211 	struct dmar_rmrr_unit *rmrru;
4212 
4213 	rmrr = (struct acpi_dmar_reserved_memory *)header;
4214 	if (rmrr_sanity_check(rmrr)) {
4215 		pr_warn(FW_BUG
4216 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4217 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4218 			   rmrr->base_address, rmrr->end_address,
4219 			   dmi_get_system_info(DMI_BIOS_VENDOR),
4220 			   dmi_get_system_info(DMI_BIOS_VERSION),
4221 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4222 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4223 	}
4224 
4225 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4226 	if (!rmrru)
4227 		goto out;
4228 
4229 	rmrru->hdr = header;
4230 
4231 	rmrru->base_address = rmrr->base_address;
4232 	rmrru->end_address = rmrr->end_address;
4233 
4234 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4235 				((void *)rmrr) + rmrr->header.length,
4236 				&rmrru->devices_cnt);
4237 	if (rmrru->devices_cnt && rmrru->devices == NULL)
4238 		goto free_rmrru;
4239 
4240 	list_add(&rmrru->list, &dmar_rmrr_units);
4241 
4242 	return 0;
4243 free_rmrru:
4244 	kfree(rmrru);
4245 out:
4246 	return -ENOMEM;
4247 }
4248 
4249 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4250 {
4251 	struct dmar_atsr_unit *atsru;
4252 	struct acpi_dmar_atsr *tmp;
4253 
4254 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4255 				dmar_rcu_check()) {
4256 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4257 		if (atsr->segment != tmp->segment)
4258 			continue;
4259 		if (atsr->header.length != tmp->header.length)
4260 			continue;
4261 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4262 			return atsru;
4263 	}
4264 
4265 	return NULL;
4266 }
4267 
4268 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4269 {
4270 	struct acpi_dmar_atsr *atsr;
4271 	struct dmar_atsr_unit *atsru;
4272 
4273 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4274 		return 0;
4275 
4276 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4277 	atsru = dmar_find_atsr(atsr);
4278 	if (atsru)
4279 		return 0;
4280 
4281 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4282 	if (!atsru)
4283 		return -ENOMEM;
4284 
4285 	/*
4286 	 * If memory is allocated from slab by ACPI _DSM method, we need to
4287 	 * copy the memory content because the memory buffer will be freed
4288 	 * on return.
4289 	 */
4290 	atsru->hdr = (void *)(atsru + 1);
4291 	memcpy(atsru->hdr, hdr, hdr->length);
4292 	atsru->include_all = atsr->flags & 0x1;
4293 	if (!atsru->include_all) {
4294 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4295 				(void *)atsr + atsr->header.length,
4296 				&atsru->devices_cnt);
4297 		if (atsru->devices_cnt && atsru->devices == NULL) {
4298 			kfree(atsru);
4299 			return -ENOMEM;
4300 		}
4301 	}
4302 
4303 	list_add_rcu(&atsru->list, &dmar_atsr_units);
4304 
4305 	return 0;
4306 }
4307 
4308 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4309 {
4310 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4311 	kfree(atsru);
4312 }
4313 
4314 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4315 {
4316 	struct acpi_dmar_atsr *atsr;
4317 	struct dmar_atsr_unit *atsru;
4318 
4319 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4320 	atsru = dmar_find_atsr(atsr);
4321 	if (atsru) {
4322 		list_del_rcu(&atsru->list);
4323 		synchronize_rcu();
4324 		intel_iommu_free_atsr(atsru);
4325 	}
4326 
4327 	return 0;
4328 }
4329 
4330 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4331 {
4332 	int i;
4333 	struct device *dev;
4334 	struct acpi_dmar_atsr *atsr;
4335 	struct dmar_atsr_unit *atsru;
4336 
4337 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4338 	atsru = dmar_find_atsr(atsr);
4339 	if (!atsru)
4340 		return 0;
4341 
4342 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4343 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4344 					  i, dev)
4345 			return -EBUSY;
4346 	}
4347 
4348 	return 0;
4349 }
4350 
/*
 * Bring up the IOMMU belonging to @dmaru.
 *
 * Returns 0 on success, and also when the IOMMU is already present in
 * g_iommus[] or the unit is marked as ignored. Returns -ENXIO when the
 * hardware lacks a capability checked below (pass through, snooping,
 * large pages). Any other failure returns the error of the helper that
 * failed, after the IOMMU has been torn down again.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	/* Already registered: nothing to do. */
	if (g_iommus[iommu->seq_id])
		return 0;

	/* Capability checks; each one rejects the unit with -ENXIO. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	/* sp is the bit index of the required super page size, or -1. */
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	/* Publish the IOMMU before setting up its domains and root entry. */
	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	/*
	 * Install the root entry and globally invalidate the context cache
	 * and IOTLB before translation is switched on.
	 */
	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

	/* Unwind: disable_iommu falls through to out. */
disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
4429 
4430 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4431 {
4432 	int ret = 0;
4433 	struct intel_iommu *iommu = dmaru->iommu;
4434 
4435 	if (!intel_iommu_enabled)
4436 		return 0;
4437 	if (iommu == NULL)
4438 		return -EINVAL;
4439 
4440 	if (insert) {
4441 		ret = intel_iommu_add(dmaru);
4442 	} else {
4443 		disable_dmar_iommu(iommu);
4444 		free_dmar_iommu(iommu);
4445 	}
4446 
4447 	return ret;
4448 }
4449 
4450 static void intel_iommu_free_dmars(void)
4451 {
4452 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4453 	struct dmar_atsr_unit *atsru, *atsr_n;
4454 
4455 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4456 		list_del(&rmrru->list);
4457 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4458 		kfree(rmrru);
4459 	}
4460 
4461 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4462 		list_del(&atsru->list);
4463 		intel_iommu_free_atsr(atsru);
4464 	}
4465 }
4466 
4467 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4468 {
4469 	int i, ret = 1;
4470 	struct pci_bus *bus;
4471 	struct pci_dev *bridge = NULL;
4472 	struct device *tmp;
4473 	struct acpi_dmar_atsr *atsr;
4474 	struct dmar_atsr_unit *atsru;
4475 
4476 	dev = pci_physfn(dev);
4477 	for (bus = dev->bus; bus; bus = bus->parent) {
4478 		bridge = bus->self;
4479 		/* If it's an integrated device, allow ATS */
4480 		if (!bridge)
4481 			return 1;
4482 		/* Connected via non-PCIe: no ATS */
4483 		if (!pci_is_pcie(bridge) ||
4484 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4485 			return 0;
4486 		/* If we found the root port, look it up in the ATSR */
4487 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4488 			break;
4489 	}
4490 
4491 	rcu_read_lock();
4492 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4493 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4494 		if (atsr->segment != pci_domain_nr(dev->bus))
4495 			continue;
4496 
4497 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4498 			if (tmp == &bridge->dev)
4499 				goto out;
4500 
4501 		if (atsru->include_all)
4502 			goto out;
4503 	}
4504 	ret = 0;
4505 out:
4506 	rcu_read_unlock();
4507 
4508 	return ret;
4509 }
4510 
4511 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4512 {
4513 	int ret;
4514 	struct dmar_rmrr_unit *rmrru;
4515 	struct dmar_atsr_unit *atsru;
4516 	struct acpi_dmar_atsr *atsr;
4517 	struct acpi_dmar_reserved_memory *rmrr;
4518 
4519 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4520 		return 0;
4521 
4522 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4523 		rmrr = container_of(rmrru->hdr,
4524 				    struct acpi_dmar_reserved_memory, header);
4525 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4526 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4527 				((void *)rmrr) + rmrr->header.length,
4528 				rmrr->segment, rmrru->devices,
4529 				rmrru->devices_cnt);
4530 			if (ret < 0)
4531 				return ret;
4532 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4533 			dmar_remove_dev_scope(info, rmrr->segment,
4534 				rmrru->devices, rmrru->devices_cnt);
4535 		}
4536 	}
4537 
4538 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4539 		if (atsru->include_all)
4540 			continue;
4541 
4542 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4543 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4544 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4545 					(void *)atsr + atsr->header.length,
4546 					atsr->segment, atsru->devices,
4547 					atsru->devices_cnt);
4548 			if (ret > 0)
4549 				break;
4550 			else if (ret < 0)
4551 				return ret;
4552 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4553 			if (dmar_remove_dev_scope(info, atsr->segment,
4554 					atsru->devices, atsru->devices_cnt))
4555 				break;
4556 		}
4557 	}
4558 
4559 	return 0;
4560 }
4561 
4562 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4563 				       unsigned long val, void *v)
4564 {
4565 	struct memory_notify *mhp = v;
4566 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4567 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4568 			mhp->nr_pages - 1);
4569 
4570 	switch (val) {
4571 	case MEM_GOING_ONLINE:
4572 		if (iommu_domain_identity_map(si_domain,
4573 					      start_vpfn, last_vpfn)) {
4574 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4575 				start_vpfn, last_vpfn);
4576 			return NOTIFY_BAD;
4577 		}
4578 		break;
4579 
4580 	case MEM_OFFLINE:
4581 	case MEM_CANCEL_ONLINE:
4582 		{
4583 			struct dmar_drhd_unit *drhd;
4584 			struct intel_iommu *iommu;
4585 			struct page *freelist;
4586 
4587 			freelist = domain_unmap(si_domain,
4588 						start_vpfn, last_vpfn);
4589 
4590 			rcu_read_lock();
4591 			for_each_active_iommu(iommu, drhd)
4592 				iommu_flush_iotlb_psi(iommu, si_domain,
4593 					start_vpfn, mhp->nr_pages,
4594 					!freelist, 0);
4595 			rcu_read_unlock();
4596 			dma_free_pagelist(freelist);
4597 		}
4598 		break;
4599 	}
4600 
4601 	return NOTIFY_OK;
4602 }
4603 
4604 static struct notifier_block intel_iommu_memory_nb = {
4605 	.notifier_call = intel_iommu_memory_notifier,
4606 	.priority = 0
4607 };
4608 
4609 static void free_all_cpu_cached_iovas(unsigned int cpu)
4610 {
4611 	int i;
4612 
4613 	for (i = 0; i < g_num_of_iommus; i++) {
4614 		struct intel_iommu *iommu = g_iommus[i];
4615 		struct dmar_domain *domain;
4616 		int did;
4617 
4618 		if (!iommu)
4619 			continue;
4620 
4621 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4622 			domain = get_iommu_domain(iommu, (u16)did);
4623 
4624 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4625 				continue;
4626 
4627 			free_cpu_cached_iovas(cpu, &domain->iovad);
4628 		}
4629 	}
4630 }
4631 
/*
 * CPU hotplug callback for a CPU that has gone away: release the IOVAs
 * cached on its behalf. Always returns 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);

	return 0;
}
4637 
4638 static void intel_disable_iommus(void)
4639 {
4640 	struct intel_iommu *iommu = NULL;
4641 	struct dmar_drhd_unit *drhd;
4642 
4643 	for_each_iommu(iommu, drhd)
4644 		iommu_disable_translation(iommu);
4645 }
4646 
4647 void intel_iommu_shutdown(void)
4648 {
4649 	struct dmar_drhd_unit *drhd;
4650 	struct intel_iommu *iommu = NULL;
4651 
4652 	if (no_iommu || dmar_disabled)
4653 		return;
4654 
4655 	down_write(&dmar_global_lock);
4656 
4657 	/* Disable PMRs explicitly here. */
4658 	for_each_iommu(iommu, drhd)
4659 		iommu_disable_protect_mem_regions(iommu);
4660 
4661 	/* Make sure the IOMMUs are switched off */
4662 	intel_disable_iommus();
4663 
4664 	up_write(&dmar_global_lock);
4665 }
4666 
4667 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4668 {
4669 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4670 
4671 	return container_of(iommu_dev, struct intel_iommu, iommu);
4672 }
4673 
4674 static ssize_t intel_iommu_show_version(struct device *dev,
4675 					struct device_attribute *attr,
4676 					char *buf)
4677 {
4678 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4679 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4680 	return sprintf(buf, "%d:%d\n",
4681 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4682 }
4683 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4684 
4685 static ssize_t intel_iommu_show_address(struct device *dev,
4686 					struct device_attribute *attr,
4687 					char *buf)
4688 {
4689 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4691 }
4692 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4693 
4694 static ssize_t intel_iommu_show_cap(struct device *dev,
4695 				    struct device_attribute *attr,
4696 				    char *buf)
4697 {
4698 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699 	return sprintf(buf, "%llx\n", iommu->cap);
4700 }
4701 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4702 
4703 static ssize_t intel_iommu_show_ecap(struct device *dev,
4704 				    struct device_attribute *attr,
4705 				    char *buf)
4706 {
4707 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4708 	return sprintf(buf, "%llx\n", iommu->ecap);
4709 }
4710 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4711 
4712 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4713 				      struct device_attribute *attr,
4714 				      char *buf)
4715 {
4716 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4717 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4718 }
4719 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4720 
4721 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4722 					   struct device_attribute *attr,
4723 					   char *buf)
4724 {
4725 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4726 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4727 						  cap_ndoms(iommu->cap)));
4728 }
4729 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4730 
4731 static struct attribute *intel_iommu_attrs[] = {
4732 	&dev_attr_version.attr,
4733 	&dev_attr_address.attr,
4734 	&dev_attr_cap.attr,
4735 	&dev_attr_ecap.attr,
4736 	&dev_attr_domains_supported.attr,
4737 	&dev_attr_domains_used.attr,
4738 	NULL,
4739 };
4740 
4741 static struct attribute_group intel_iommu_group = {
4742 	.name = "intel-iommu",
4743 	.attrs = intel_iommu_attrs,
4744 };
4745 
4746 const struct attribute_group *intel_iommu_groups[] = {
4747 	&intel_iommu_group,
4748 	NULL,
4749 };
4750 
4751 static inline bool has_external_pci(void)
4752 {
4753 	struct pci_dev *pdev = NULL;
4754 
4755 	for_each_pci_dev(pdev)
4756 		if (pdev->external_facing)
4757 			return true;
4758 
4759 	return false;
4760 }
4761 
4762 static int __init platform_optin_force_iommu(void)
4763 {
4764 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4765 		return 0;
4766 
4767 	if (no_iommu || dmar_disabled)
4768 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4769 
4770 	/*
4771 	 * If Intel-IOMMU is disabled by default, we will apply identity
4772 	 * map for all devices except those marked as being untrusted.
4773 	 */
4774 	if (dmar_disabled)
4775 		iommu_set_default_passthrough(false);
4776 
4777 	dmar_disabled = 0;
4778 	no_iommu = 0;
4779 
4780 	return 1;
4781 }
4782 
4783 static int __init probe_acpi_namespace_devices(void)
4784 {
4785 	struct dmar_drhd_unit *drhd;
4786 	/* To avoid a -Wunused-but-set-variable warning. */
4787 	struct intel_iommu *iommu __maybe_unused;
4788 	struct device *dev;
4789 	int i, ret = 0;
4790 
4791 	for_each_active_iommu(iommu, drhd) {
4792 		for_each_active_dev_scope(drhd->devices,
4793 					  drhd->devices_cnt, i, dev) {
4794 			struct acpi_device_physical_node *pn;
4795 			struct iommu_group *group;
4796 			struct acpi_device *adev;
4797 
4798 			if (dev->bus != &acpi_bus_type)
4799 				continue;
4800 
4801 			adev = to_acpi_device(dev);
4802 			mutex_lock(&adev->physical_node_lock);
4803 			list_for_each_entry(pn,
4804 					    &adev->physical_node_list, node) {
4805 				group = iommu_group_get(pn->dev);
4806 				if (group) {
4807 					iommu_group_put(group);
4808 					continue;
4809 				}
4810 
4811 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4812 				ret = iommu_probe_device(pn->dev);
4813 				if (ret)
4814 					break;
4815 			}
4816 			mutex_unlock(&adev->physical_node_lock);
4817 
4818 			if (ret)
4819 				return ret;
4820 		}
4821 	}
4822 
4823 	return 0;
4824 }
4825 
/*
 * Boot-time initialization of the Intel IOMMU driver.
 *
 * Sets up the mempool, parses the DMAR table and the device scopes and
 * registers the DMAR bus notifier. Unless the IOMMU is disabled (no_iommu
 * or dmar_disabled) it then initializes the DMAR units via init_dmars(),
 * registers every active IOMMU with the IOMMU core and sysfs, installs the
 * PCI bus ops, memory and CPU hotplug hooks, and finally enables
 * translation.
 *
 * When force_on is set, the failures checked below panic instead of
 * returning an error.
 *
 * Returns 0 on success, -ENOMEM if the mempool cannot be set up, the error
 * from init_dmars() if that fails, and -ENODEV for every other early exit
 * (including the "IOMMU disabled" case).
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = tboot_force_iommu() || platform_optin_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		/* ret is still -ENODEV here. */
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	if (dmar_map_gfx)
		intel_iommu_gfx_mapped = 1;

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	/* From here on the function cannot fail any more. */
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* Expose each active IOMMU in sysfs and register it with the core. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}
	up_read(&dmar_global_lock);

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	/* The memory notifier maintains si_domain's identity map. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);

	down_read(&dmar_global_lock);
	/* A probing failure is only reported, it does not abort the init. */
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

	/* Error unwinding; both labels are entered with the write lock held. */
out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
4964 
4965 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4966 {
4967 	struct intel_iommu *iommu = opaque;
4968 
4969 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4970 	return 0;
4971 }
4972 
4973 /*
4974  * NB - intel-iommu lacks any sort of reference counting for the users of
4975  * dependent devices.  If multiple endpoints have intersecting dependent
4976  * devices, unbinding the driver from any one of them will possibly leave
4977  * the others unable to operate.
4978  */
4979 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4980 {
4981 	if (!iommu || !dev || !dev_is_pci(dev))
4982 		return;
4983 
4984 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4985 }
4986 
4987 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4988 {
4989 	struct dmar_domain *domain;
4990 	struct intel_iommu *iommu;
4991 	unsigned long flags;
4992 
4993 	assert_spin_locked(&device_domain_lock);
4994 
4995 	if (WARN_ON(!info))
4996 		return;
4997 
4998 	iommu = info->iommu;
4999 	domain = info->domain;
5000 
5001 	if (info->dev) {
5002 		if (dev_is_pci(info->dev) && sm_supported(iommu))
5003 			intel_pasid_tear_down_entry(iommu, info->dev,
5004 					PASID_RID2PASID, false);
5005 
5006 		iommu_disable_dev_iotlb(info);
5007 		if (!dev_is_real_dma_subdevice(info->dev))
5008 			domain_context_clear(iommu, info->dev);
5009 		intel_pasid_free_table(info->dev);
5010 	}
5011 
5012 	unlink_domain_info(info);
5013 
5014 	spin_lock_irqsave(&iommu->lock, flags);
5015 	domain_detach_iommu(domain, iommu);
5016 	spin_unlock_irqrestore(&iommu->lock, flags);
5017 
5018 	free_devinfo_mem(info);
5019 }
5020 
5021 static void dmar_remove_one_dev_info(struct device *dev)
5022 {
5023 	struct device_domain_info *info;
5024 	unsigned long flags;
5025 
5026 	spin_lock_irqsave(&device_domain_lock, flags);
5027 	info = get_domain_info(dev);
5028 	if (info)
5029 		__dmar_remove_one_dev_info(info);
5030 	spin_unlock_irqrestore(&device_domain_lock, flags);
5031 }
5032 
5033 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5034 {
5035 	int adjust_width;
5036 
5037 	/* calculate AGAW */
5038 	domain->gaw = guest_width;
5039 	adjust_width = guestwidth_to_adjustwidth(guest_width);
5040 	domain->agaw = width_to_agaw(adjust_width);
5041 
5042 	domain->iommu_coherency = 0;
5043 	domain->iommu_snooping = 0;
5044 	domain->iommu_superpage = 0;
5045 	domain->max_addr = 0;
5046 
5047 	/* always allocate the top pgd */
5048 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5049 	if (!domain->pgd)
5050 		return -ENOMEM;
5051 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5052 	return 0;
5053 }
5054 
5055 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5056 {
5057 	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5058 	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5059 
5060 	if (!intel_iommu_strict &&
5061 	    init_iova_flush_queue(&dmar_domain->iovad,
5062 				  iommu_flush_iova, iova_entry_free))
5063 		pr_info("iova flush queue initialization failed\n");
5064 }
5065 
5066 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5067 {
5068 	struct dmar_domain *dmar_domain;
5069 	struct iommu_domain *domain;
5070 
5071 	switch (type) {
5072 	case IOMMU_DOMAIN_DMA:
5073 	case IOMMU_DOMAIN_UNMANAGED:
5074 		dmar_domain = alloc_domain(0);
5075 		if (!dmar_domain) {
5076 			pr_err("Can't allocate dmar_domain\n");
5077 			return NULL;
5078 		}
5079 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5080 			pr_err("Domain initialization failed\n");
5081 			domain_exit(dmar_domain);
5082 			return NULL;
5083 		}
5084 
5085 		if (type == IOMMU_DOMAIN_DMA)
5086 			intel_init_iova_domain(dmar_domain);
5087 
5088 		domain_update_iommu_cap(dmar_domain);
5089 
5090 		domain = &dmar_domain->domain;
5091 		domain->geometry.aperture_start = 0;
5092 		domain->geometry.aperture_end   =
5093 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5094 		domain->geometry.force_aperture = true;
5095 
5096 		return domain;
5097 	case IOMMU_DOMAIN_IDENTITY:
5098 		return &si_domain->domain;
5099 	default:
5100 		return NULL;
5101 	}
5102 
5103 	return NULL;
5104 }
5105 
5106 static void intel_iommu_domain_free(struct iommu_domain *domain)
5107 {
5108 	if (domain != &si_domain->domain)
5109 		domain_exit(to_dmar_domain(domain));
5110 }
5111 
5112 /*
5113  * Check whether a @domain could be attached to the @dev through the
5114  * aux-domain attach/detach APIs.
5115  */
5116 static inline bool
5117 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5118 {
5119 	struct device_domain_info *info = get_domain_info(dev);
5120 
5121 	return info && info->auxd_enabled &&
5122 			domain->type == IOMMU_DOMAIN_UNMANAGED;
5123 }
5124 
5125 static void auxiliary_link_device(struct dmar_domain *domain,
5126 				  struct device *dev)
5127 {
5128 	struct device_domain_info *info = get_domain_info(dev);
5129 
5130 	assert_spin_locked(&device_domain_lock);
5131 	if (WARN_ON(!info))
5132 		return;
5133 
5134 	domain->auxd_refcnt++;
5135 	list_add(&domain->auxd, &info->auxiliary_domains);
5136 }
5137 
5138 static void auxiliary_unlink_device(struct dmar_domain *domain,
5139 				    struct device *dev)
5140 {
5141 	struct device_domain_info *info = get_domain_info(dev);
5142 
5143 	assert_spin_locked(&device_domain_lock);
5144 	if (WARN_ON(!info))
5145 		return;
5146 
5147 	list_del(&domain->auxd);
5148 	domain->auxd_refcnt--;
5149 
5150 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5151 		ioasid_free(domain->default_pasid);
5152 }
5153 
/*
 * Attach @domain to @dev as an auxiliary domain.
 *
 * Allocates the domain's default PASID if it has none yet, attaches the
 * domain to the device's IOMMU, programs the PASID entry (first or second
 * level, depending on the domain) and links the domain to the device.
 *
 * Returns 0 on success, -ENODEV if the device has no IOMMU or no PASID can
 * be allocated, otherwise the error from the attach or PASID setup step.
 */
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return -ENODEV;

	/* The default PASID is allocated once and then reused. */
	if (domain->default_pasid <= 0) {
		int pasid;

		/* No private data needed for the default pasid */
		pasid = ioasid_alloc(NULL, PASID_MIN,
				     pci_max_pasids(to_pci_dev(dev)) - 1,
				     NULL);
		if (pasid == INVALID_IOASID) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	/* Lock order: device_domain_lock first, then iommu->lock. */
	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	if (domain_use_first_level(domain))
		ret = domain_setup_first_level(iommu, domain, dev,
					       domain->default_pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev,
						     domain->default_pasid);
	if (ret)
		goto table_failed;
	spin_unlock(&iommu->lock);

	/* Still under device_domain_lock, as the helper asserts. */
	auxiliary_link_device(domain, dev);

	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

	/* Error unwinding; both labels are entered with both locks held. */
table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);
	/*
	 * Give the default PASID back if no device uses the domain.
	 * NOTE(review): default_pasid keeps its value after ioasid_free(),
	 * so a later attach would skip the allocation above - confirm that
	 * this is intended.
	 */
	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		ioasid_free(domain->default_pasid);

	return ret;
}
5216 
5217 static void aux_domain_remove_dev(struct dmar_domain *domain,
5218 				  struct device *dev)
5219 {
5220 	struct device_domain_info *info;
5221 	struct intel_iommu *iommu;
5222 	unsigned long flags;
5223 
5224 	if (!is_aux_domain(dev, &domain->domain))
5225 		return;
5226 
5227 	spin_lock_irqsave(&device_domain_lock, flags);
5228 	info = get_domain_info(dev);
5229 	iommu = info->iommu;
5230 
5231 	auxiliary_unlink_device(domain, dev);
5232 
5233 	spin_lock(&iommu->lock);
5234 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5235 	domain_detach_iommu(domain, iommu);
5236 	spin_unlock(&iommu->lock);
5237 
5238 	spin_unlock_irqrestore(&device_domain_lock, flags);
5239 }
5240 
5241 static int prepare_domain_attach_device(struct iommu_domain *domain,
5242 					struct device *dev)
5243 {
5244 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5245 	struct intel_iommu *iommu;
5246 	int addr_width;
5247 
5248 	iommu = device_to_iommu(dev, NULL, NULL);
5249 	if (!iommu)
5250 		return -ENODEV;
5251 
5252 	/* check if this iommu agaw is sufficient for max mapped address */
5253 	addr_width = agaw_to_width(iommu->agaw);
5254 	if (addr_width > cap_mgaw(iommu->cap))
5255 		addr_width = cap_mgaw(iommu->cap);
5256 
5257 	if (dmar_domain->max_addr > (1LL << addr_width)) {
5258 		dev_err(dev, "%s: iommu width (%d) is not "
5259 		        "sufficient for the mapped address (%llx)\n",
5260 		        __func__, addr_width, dmar_domain->max_addr);
5261 		return -EFAULT;
5262 	}
5263 	dmar_domain->gaw = addr_width;
5264 
5265 	/*
5266 	 * Knock out extra levels of page tables if necessary
5267 	 */
5268 	while (iommu->agaw < dmar_domain->agaw) {
5269 		struct dma_pte *pte;
5270 
5271 		pte = dmar_domain->pgd;
5272 		if (dma_pte_present(pte)) {
5273 			dmar_domain->pgd = (struct dma_pte *)
5274 				phys_to_virt(dma_pte_addr(pte));
5275 			free_pgtable_page(pte);
5276 		}
5277 		dmar_domain->agaw--;
5278 	}
5279 
5280 	return 0;
5281 }
5282 
5283 static int intel_iommu_attach_device(struct iommu_domain *domain,
5284 				     struct device *dev)
5285 {
5286 	int ret;
5287 
5288 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5289 	    device_is_rmrr_locked(dev)) {
5290 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5291 		return -EPERM;
5292 	}
5293 
5294 	if (is_aux_domain(dev, domain))
5295 		return -EPERM;
5296 
5297 	/* normally dev is not mapped */
5298 	if (unlikely(domain_context_mapped(dev))) {
5299 		struct dmar_domain *old_domain;
5300 
5301 		old_domain = find_domain(dev);
5302 		if (old_domain)
5303 			dmar_remove_one_dev_info(dev);
5304 	}
5305 
5306 	ret = prepare_domain_attach_device(domain, dev);
5307 	if (ret)
5308 		return ret;
5309 
5310 	return domain_add_dev_info(to_dmar_domain(domain), dev);
5311 }
5312 
5313 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5314 					 struct device *dev)
5315 {
5316 	int ret;
5317 
5318 	if (!is_aux_domain(dev, domain))
5319 		return -EPERM;
5320 
5321 	ret = prepare_domain_attach_device(domain, dev);
5322 	if (ret)
5323 		return ret;
5324 
5325 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5326 }
5327 
/*
 * Detach @dev from its current domain by removing its device-domain info.
 * @domain is not consulted.
 */
static void
intel_iommu_detach_device(struct iommu_domain *domain, struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}
5333 
/* Detach @dev from the auxiliary domain @domain. */
static void
intel_iommu_aux_detach_device(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *aux_domain = to_dmar_domain(domain);

	aux_domain_remove_dev(aux_domain, dev);
}
5339 
/*
 * 2D array for converting and sanitizing the generic IOMMU TLB invalidation
 * granularity into the VT-d granularity encoding. Invalidation is typically
 * part of the unmap operation, as a result of a DMA or VFIO unmap. For
 * assigned devices, however, the guest owns the first level page tables:
 * invalidations of translation caches in the guest are trapped and passed
 * down to the host.
 *
 * The vIOMMU in the guest only exposes first level page tables, therefore
 * IOTLB granularity for requests without PASID (second level) is not
 * supported.
 *
 * Indexing:
 * X: iommu cache type (one row per cache type)
 * Y: enum iommu_inv_granularity (one column per granularity)
 *
 * For example, the VT-d granularity encoding for the IOTLB cache type with
 * page selective granularity within a PASID is found at
 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR].
 *
 * An entry of -EINVAL marks a cache type / granularity pair that is not
 * supported.
 * NOTE(review): the column order is presumably domain, PASID, address --
 * confirm against enum iommu_inv_granularity.
 */

static const int
inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/*
	 * PASID based IOTLB invalidation: PASID selective (per PASID),
	 * page selective (address granularity)
	 */
	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs: only the last granularity is supported */
	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
	/* PASID cache: no granularity is supported */
	{-EINVAL, -EINVAL, -EINVAL}
};
5369 
5370 static inline int to_vtd_granularity(int type, int granu)
5371 {
5372 	return inv_type_granu_table[type][granu];
5373 }
5374 
5375 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5376 {
5377 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5378 
5379 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5380 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
5381 	 * granu size in contiguous memory.
5382 	 */
5383 	return order_base_2(nr_pages);
5384 }
5385 
5386 #ifdef CONFIG_INTEL_IOMMU_SVM
5387 static int
5388 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5389 			   struct iommu_cache_invalidate_info *inv_info)
5390 {
5391 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5392 	struct device_domain_info *info;
5393 	struct intel_iommu *iommu;
5394 	unsigned long flags;
5395 	int cache_type;
5396 	u8 bus, devfn;
5397 	u16 did, sid;
5398 	int ret = 0;
5399 	u64 size = 0;
5400 
5401 	if (!inv_info || !dmar_domain ||
5402 	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5403 		return -EINVAL;
5404 
5405 	if (!dev || !dev_is_pci(dev))
5406 		return -ENODEV;
5407 
5408 	iommu = device_to_iommu(dev, &bus, &devfn);
5409 	if (!iommu)
5410 		return -ENODEV;
5411 
5412 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5413 		return -EINVAL;
5414 
5415 	spin_lock_irqsave(&device_domain_lock, flags);
5416 	spin_lock(&iommu->lock);
5417 	info = get_domain_info(dev);
5418 	if (!info) {
5419 		ret = -EINVAL;
5420 		goto out_unlock;
5421 	}
5422 	did = dmar_domain->iommu_did[iommu->seq_id];
5423 	sid = PCI_DEVID(bus, devfn);
5424 
5425 	/* Size is only valid in address selective invalidation */
5426 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5427 		size = to_vtd_size(inv_info->addr_info.granule_size,
5428 				   inv_info->addr_info.nb_granules);
5429 
5430 	for_each_set_bit(cache_type,
5431 			 (unsigned long *)&inv_info->cache,
5432 			 IOMMU_CACHE_INV_TYPE_NR) {
5433 		int granu = 0;
5434 		u64 pasid = 0;
5435 		u64 addr = 0;
5436 
5437 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5438 		if (granu == -EINVAL) {
5439 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5440 					   cache_type, inv_info->granularity);
5441 			break;
5442 		}
5443 
5444 		/*
5445 		 * PASID is stored in different locations based on the
5446 		 * granularity.
5447 		 */
5448 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5449 		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5450 			pasid = inv_info->pasid_info.pasid;
5451 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5452 			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5453 			pasid = inv_info->addr_info.pasid;
5454 
5455 		switch (BIT(cache_type)) {
5456 		case IOMMU_CACHE_INV_TYPE_IOTLB:
5457 			/* HW will ignore LSB bits based on address mask */
5458 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5459 			    size &&
5460 			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5461 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5462 						   inv_info->addr_info.addr, size);
5463 			}
5464 
5465 			/*
5466 			 * If granu is PASID-selective, address is ignored.
5467 			 * We use npages = -1 to indicate that.
5468 			 */
5469 			qi_flush_piotlb(iommu, did, pasid,
5470 					mm_to_dma_pfn(inv_info->addr_info.addr),
5471 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5472 					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5473 
5474 			if (!info->ats_enabled)
5475 				break;
5476 			/*
5477 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5478 			 * in the guest may assume IOTLB flush is inclusive,
5479 			 * which is more efficient.
5480 			 */
5481 			fallthrough;
5482 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5483 			/*
5484 			 * PASID based device TLB invalidation does not support
5485 			 * IOMMU_INV_GRANU_PASID granularity but only supports
5486 			 * IOMMU_INV_GRANU_ADDR.
5487 			 * The equivalent of that is we set the size to be the
5488 			 * entire range of 64 bit. User only provides PASID info
5489 			 * without address info. So we set addr to 0.
5490 			 */
5491 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5492 				size = 64 - VTD_PAGE_SHIFT;
5493 				addr = 0;
5494 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5495 				addr = inv_info->addr_info.addr;
5496 			}
5497 
5498 			if (info->ats_enabled)
5499 				qi_flush_dev_iotlb_pasid(iommu, sid,
5500 						info->pfsid, pasid,
5501 						info->ats_qdep, addr,
5502 						size);
5503 			else
5504 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5505 			break;
5506 		default:
5507 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5508 					    cache_type);
5509 			ret = -EINVAL;
5510 		}
5511 	}
5512 out_unlock:
5513 	spin_unlock(&iommu->lock);
5514 	spin_unlock_irqrestore(&device_domain_lock, flags);
5515 
5516 	return ret;
5517 }
5518 #endif
5519 
5520 static int intel_iommu_map(struct iommu_domain *domain,
5521 			   unsigned long iova, phys_addr_t hpa,
5522 			   size_t size, int iommu_prot, gfp_t gfp)
5523 {
5524 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5525 	u64 max_addr;
5526 	int prot = 0;
5527 	int ret;
5528 
5529 	if (iommu_prot & IOMMU_READ)
5530 		prot |= DMA_PTE_READ;
5531 	if (iommu_prot & IOMMU_WRITE)
5532 		prot |= DMA_PTE_WRITE;
5533 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5534 		prot |= DMA_PTE_SNP;
5535 
5536 	max_addr = iova + size;
5537 	if (dmar_domain->max_addr < max_addr) {
5538 		u64 end;
5539 
5540 		/* check if minimum agaw is sufficient for mapped address */
5541 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5542 		if (end < max_addr) {
5543 			pr_err("%s: iommu width (%d) is not "
5544 			       "sufficient for the mapped address (%llx)\n",
5545 			       __func__, dmar_domain->gaw, max_addr);
5546 			return -EFAULT;
5547 		}
5548 		dmar_domain->max_addr = max_addr;
5549 	}
5550 	/* Round up size to next multiple of PAGE_SIZE, if it and
5551 	   the low bits of hpa would take us onto the next page */
5552 	size = aligned_nrpages(hpa, size);
5553 	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5554 				 hpa >> VTD_PAGE_SHIFT, size, prot);
5555 	return ret;
5556 }
5557 
5558 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5559 				unsigned long iova, size_t size,
5560 				struct iommu_iotlb_gather *gather)
5561 {
5562 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5563 	struct page *freelist = NULL;
5564 	unsigned long start_pfn, last_pfn;
5565 	unsigned int npages;
5566 	int iommu_id, level = 0;
5567 
5568 	/* Cope with horrid API which requires us to unmap more than the
5569 	   size argument if it happens to be a large-page mapping. */
5570 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5571 
5572 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5573 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5574 
5575 	start_pfn = iova >> VTD_PAGE_SHIFT;
5576 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5577 
5578 	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5579 
5580 	npages = last_pfn - start_pfn + 1;
5581 
5582 	for_each_domain_iommu(iommu_id, dmar_domain)
5583 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5584 				      start_pfn, npages, !freelist, 0);
5585 
5586 	dma_free_pagelist(freelist);
5587 
5588 	if (dmar_domain->max_addr == iova + size)
5589 		dmar_domain->max_addr = iova;
5590 
5591 	return size;
5592 }
5593 
5594 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5595 					    dma_addr_t iova)
5596 {
5597 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5598 	struct dma_pte *pte;
5599 	int level = 0;
5600 	u64 phys = 0;
5601 
5602 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5603 	if (pte && dma_pte_present(pte))
5604 		phys = dma_pte_addr(pte) +
5605 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5606 						VTD_PAGE_SHIFT) - 1));
5607 
5608 	return phys;
5609 }
5610 
5611 static inline bool scalable_mode_support(void)
5612 {
5613 	struct dmar_drhd_unit *drhd;
5614 	struct intel_iommu *iommu;
5615 	bool ret = true;
5616 
5617 	rcu_read_lock();
5618 	for_each_active_iommu(iommu, drhd) {
5619 		if (!sm_supported(iommu)) {
5620 			ret = false;
5621 			break;
5622 		}
5623 	}
5624 	rcu_read_unlock();
5625 
5626 	return ret;
5627 }
5628 
5629 static inline bool iommu_pasid_support(void)
5630 {
5631 	struct dmar_drhd_unit *drhd;
5632 	struct intel_iommu *iommu;
5633 	bool ret = true;
5634 
5635 	rcu_read_lock();
5636 	for_each_active_iommu(iommu, drhd) {
5637 		if (!pasid_supported(iommu)) {
5638 			ret = false;
5639 			break;
5640 		}
5641 	}
5642 	rcu_read_unlock();
5643 
5644 	return ret;
5645 }
5646 
5647 static inline bool nested_mode_support(void)
5648 {
5649 	struct dmar_drhd_unit *drhd;
5650 	struct intel_iommu *iommu;
5651 	bool ret = true;
5652 
5653 	rcu_read_lock();
5654 	for_each_active_iommu(iommu, drhd) {
5655 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5656 			ret = false;
5657 			break;
5658 		}
5659 	}
5660 	rcu_read_unlock();
5661 
5662 	return ret;
5663 }
5664 
5665 static bool intel_iommu_capable(enum iommu_cap cap)
5666 {
5667 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5668 		return domain_update_iommu_snooping(NULL) == 1;
5669 	if (cap == IOMMU_CAP_INTR_REMAP)
5670 		return irq_remapping_enabled == 1;
5671 
5672 	return false;
5673 }
5674 
5675 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5676 {
5677 	struct intel_iommu *iommu;
5678 
5679 	iommu = device_to_iommu(dev, NULL, NULL);
5680 	if (!iommu)
5681 		return ERR_PTR(-ENODEV);
5682 
5683 	if (translation_pre_enabled(iommu))
5684 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5685 
5686 	return &iommu->iommu;
5687 }
5688 
5689 static void intel_iommu_release_device(struct device *dev)
5690 {
5691 	struct intel_iommu *iommu;
5692 
5693 	iommu = device_to_iommu(dev, NULL, NULL);
5694 	if (!iommu)
5695 		return;
5696 
5697 	dmar_remove_one_dev_info(dev);
5698 
5699 	set_dma_ops(dev, NULL);
5700 }
5701 
5702 static void intel_iommu_probe_finalize(struct device *dev)
5703 {
5704 	struct iommu_domain *domain;
5705 
5706 	domain = iommu_get_domain_for_dev(dev);
5707 	if (device_needs_bounce(dev))
5708 		set_dma_ops(dev, &bounce_dma_ops);
5709 	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5710 		set_dma_ops(dev, &intel_dma_ops);
5711 	else
5712 		set_dma_ops(dev, NULL);
5713 }
5714 
5715 static void intel_iommu_get_resv_regions(struct device *device,
5716 					 struct list_head *head)
5717 {
5718 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5719 	struct iommu_resv_region *reg;
5720 	struct dmar_rmrr_unit *rmrr;
5721 	struct device *i_dev;
5722 	int i;
5723 
5724 	down_read(&dmar_global_lock);
5725 	for_each_rmrr_units(rmrr) {
5726 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5727 					  i, i_dev) {
5728 			struct iommu_resv_region *resv;
5729 			enum iommu_resv_type type;
5730 			size_t length;
5731 
5732 			if (i_dev != device &&
5733 			    !is_downstream_to_pci_bridge(device, i_dev))
5734 				continue;
5735 
5736 			length = rmrr->end_address - rmrr->base_address + 1;
5737 
5738 			type = device_rmrr_is_relaxable(device) ?
5739 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5740 
5741 			resv = iommu_alloc_resv_region(rmrr->base_address,
5742 						       length, prot, type);
5743 			if (!resv)
5744 				break;
5745 
5746 			list_add_tail(&resv->list, head);
5747 		}
5748 	}
5749 	up_read(&dmar_global_lock);
5750 
5751 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5752 	if (dev_is_pci(device)) {
5753 		struct pci_dev *pdev = to_pci_dev(device);
5754 
5755 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5756 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5757 						   IOMMU_RESV_DIRECT_RELAXABLE);
5758 			if (reg)
5759 				list_add_tail(&reg->list, head);
5760 		}
5761 	}
5762 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5763 
5764 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5765 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5766 				      0, IOMMU_RESV_MSI);
5767 	if (!reg)
5768 		return;
5769 	list_add_tail(&reg->list, head);
5770 }
5771 
5772 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5773 {
5774 	struct device_domain_info *info;
5775 	struct context_entry *context;
5776 	struct dmar_domain *domain;
5777 	unsigned long flags;
5778 	u64 ctx_lo;
5779 	int ret;
5780 
5781 	domain = find_domain(dev);
5782 	if (!domain)
5783 		return -EINVAL;
5784 
5785 	spin_lock_irqsave(&device_domain_lock, flags);
5786 	spin_lock(&iommu->lock);
5787 
5788 	ret = -EINVAL;
5789 	info = get_domain_info(dev);
5790 	if (!info || !info->pasid_supported)
5791 		goto out;
5792 
5793 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5794 	if (WARN_ON(!context))
5795 		goto out;
5796 
5797 	ctx_lo = context[0].lo;
5798 
5799 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5800 		ctx_lo |= CONTEXT_PASIDE;
5801 		context[0].lo = ctx_lo;
5802 		wmb();
5803 		iommu->flush.flush_context(iommu,
5804 					   domain->iommu_did[iommu->seq_id],
5805 					   PCI_DEVID(info->bus, info->devfn),
5806 					   DMA_CCMD_MASK_NOBIT,
5807 					   DMA_CCMD_DEVICE_INVL);
5808 	}
5809 
5810 	/* Enable PASID support in the device, if it wasn't already */
5811 	if (!info->pasid_enabled)
5812 		iommu_enable_dev_iotlb(info);
5813 
5814 	ret = 0;
5815 
5816  out:
5817 	spin_unlock(&iommu->lock);
5818 	spin_unlock_irqrestore(&device_domain_lock, flags);
5819 
5820 	return ret;
5821 }
5822 
5823 static void intel_iommu_apply_resv_region(struct device *dev,
5824 					  struct iommu_domain *domain,
5825 					  struct iommu_resv_region *region)
5826 {
5827 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5828 	unsigned long start, end;
5829 
5830 	start = IOVA_PFN(region->start);
5831 	end   = IOVA_PFN(region->start + region->length - 1);
5832 
5833 	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5834 }
5835 
/*
 * Pick the IOMMU group for @dev: PCI devices use pci_device_group(), all
 * others use generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
5842 
5843 static int intel_iommu_enable_auxd(struct device *dev)
5844 {
5845 	struct device_domain_info *info;
5846 	struct intel_iommu *iommu;
5847 	unsigned long flags;
5848 	int ret;
5849 
5850 	iommu = device_to_iommu(dev, NULL, NULL);
5851 	if (!iommu || dmar_disabled)
5852 		return -EINVAL;
5853 
5854 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5855 		return -EINVAL;
5856 
5857 	ret = intel_iommu_enable_pasid(iommu, dev);
5858 	if (ret)
5859 		return -ENODEV;
5860 
5861 	spin_lock_irqsave(&device_domain_lock, flags);
5862 	info = get_domain_info(dev);
5863 	info->auxd_enabled = 1;
5864 	spin_unlock_irqrestore(&device_domain_lock, flags);
5865 
5866 	return 0;
5867 }
5868 
5869 static int intel_iommu_disable_auxd(struct device *dev)
5870 {
5871 	struct device_domain_info *info;
5872 	unsigned long flags;
5873 
5874 	spin_lock_irqsave(&device_domain_lock, flags);
5875 	info = get_domain_info(dev);
5876 	if (!WARN_ON(!info))
5877 		info->auxd_enabled = 0;
5878 	spin_unlock_irqrestore(&device_domain_lock, flags);
5879 
5880 	return 0;
5881 }
5882 
5883 /*
5884  * A PCI express designated vendor specific extended capability is defined
5885  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5886  * for system software and tools to detect endpoint devices supporting the
5887  * Intel scalable IO virtualization without host driver dependency.
5888  *
5889  * Returns the address of the matching extended capability structure within
5890  * the device's PCI configuration space or 0 if the device does not support
5891  * it.
5892  */
5893 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5894 {
5895 	int pos;
5896 	u16 vendor, id;
5897 
5898 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5899 	while (pos) {
5900 		pci_read_config_word(pdev, pos + 4, &vendor);
5901 		pci_read_config_word(pdev, pos + 8, &id);
5902 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5903 			return pos;
5904 
5905 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5906 	}
5907 
5908 	return 0;
5909 }
5910 
5911 static bool
5912 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5913 {
5914 	if (feat == IOMMU_DEV_FEAT_AUX) {
5915 		int ret;
5916 
5917 		if (!dev_is_pci(dev) || dmar_disabled ||
5918 		    !scalable_mode_support() || !iommu_pasid_support())
5919 			return false;
5920 
5921 		ret = pci_pasid_features(to_pci_dev(dev));
5922 		if (ret < 0)
5923 			return false;
5924 
5925 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5926 	}
5927 
5928 	if (feat == IOMMU_DEV_FEAT_SVA) {
5929 		struct device_domain_info *info = get_domain_info(dev);
5930 
5931 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5932 			info->pasid_supported && info->pri_supported &&
5933 			info->ats_supported;
5934 	}
5935 
5936 	return false;
5937 }
5938 
5939 static int
5940 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5941 {
5942 	if (feat == IOMMU_DEV_FEAT_AUX)
5943 		return intel_iommu_enable_auxd(dev);
5944 
5945 	if (feat == IOMMU_DEV_FEAT_SVA) {
5946 		struct device_domain_info *info = get_domain_info(dev);
5947 
5948 		if (!info)
5949 			return -EINVAL;
5950 
5951 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5952 			return 0;
5953 	}
5954 
5955 	return -ENODEV;
5956 }
5957 
5958 static int
5959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5960 {
5961 	if (feat == IOMMU_DEV_FEAT_AUX)
5962 		return intel_iommu_disable_auxd(dev);
5963 
5964 	return -ENODEV;
5965 }
5966 
5967 static bool
5968 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5969 {
5970 	struct device_domain_info *info = get_domain_info(dev);
5971 
5972 	if (feat == IOMMU_DEV_FEAT_AUX)
5973 		return scalable_mode_support() && info && info->auxd_enabled;
5974 
5975 	return false;
5976 }
5977 
5978 static int
5979 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5980 {
5981 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5982 
5983 	return dmar_domain->default_pasid > 0 ?
5984 			dmar_domain->default_pasid : -EINVAL;
5985 }
5986 
5987 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5988 					   struct device *dev)
5989 {
5990 	return attach_deferred(dev);
5991 }
5992 
5993 static int
5994 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5995 			    enum iommu_attr attr, void *data)
5996 {
5997 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5998 	unsigned long flags;
5999 	int ret = 0;
6000 
6001 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6002 		return -EINVAL;
6003 
6004 	switch (attr) {
6005 	case DOMAIN_ATTR_NESTING:
6006 		spin_lock_irqsave(&device_domain_lock, flags);
6007 		if (nested_mode_support() &&
6008 		    list_empty(&dmar_domain->devices)) {
6009 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6010 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6011 		} else {
6012 			ret = -ENODEV;
6013 		}
6014 		spin_unlock_irqrestore(&device_domain_lock, flags);
6015 		break;
6016 	default:
6017 		ret = -EINVAL;
6018 		break;
6019 	}
6020 
6021 	return ret;
6022 }
6023 
6024 /*
6025  * Check that the device does not live on an external facing PCI port that is
6026  * marked as untrusted. Such devices should not be able to apply quirks and
6027  * thus not be able to bypass the IOMMU restrictions.
6028  */
6029 static bool risky_device(struct pci_dev *pdev)
6030 {
6031 	if (pdev->untrusted) {
6032 		pci_info(pdev,
6033 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6034 			 pdev->vendor, pdev->device);
6035 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6036 		return true;
6037 	}
6038 	return false;
6039 }
6040 
/*
 * IOMMU core callback table for the Intel VT-d driver. Most callbacks are
 * implemented in this file; generic_iommu_put_resv_regions is a generic
 * helper, and the intel_svm_* callbacks are only wired up when
 * CONFIG_INTEL_IOMMU_SVM is enabled.
 */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	/* Domain lifetime and attributes */
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	/* Primary and auxiliary domain attach/detach */
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	/* Page table manipulation and lookup */
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	/* Device probing and release */
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	/* Reserved regions */
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	/* Per-device feature queries and control */
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	/* Cache invalidation pass-down and shared virtual memory support */
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
};
6078 
6079 static void quirk_iommu_igfx(struct pci_dev *dev)
6080 {
6081 	if (risky_device(dev))
6082 		return;
6083 
6084 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6085 	dmar_map_gfx = 0;
6086 }
6087 
/*
 * Register quirk_iommu_igfx as a PCI header fixup for each Intel device ID
 * listed below.
 */

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6122 
6123 static void quirk_iommu_rwbf(struct pci_dev *dev)
6124 {
6125 	if (risky_device(dev))
6126 		return;
6127 
6128 	/*
6129 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6130 	 * but needs it. Same seems to hold for the desktop versions.
6131 	 */
6132 	pci_info(dev, "Forcing write-buffer flush capability\n");
6133 	rwbf_quirk = 1;
6134 }
6135 
/*
 * Register quirk_iommu_rwbf as a PCI header fixup for each Intel device ID
 * listed below.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6143 
/*
 * GGC is the PCI config-space offset of the 16-bit word read by
 * quirk_calpella_no_shadow_gtt(). The remaining macros are values and masks
 * for bits 8-11 of that word.
 * NOTE(review): judging by the names these encode the GTT memory size and
 * whether VT is enabled -- confirm against the chipset datasheet.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6153 
6154 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6155 {
6156 	unsigned short ggc;
6157 
6158 	if (risky_device(dev))
6159 		return;
6160 
6161 	if (pci_read_config_word(dev, GGC, &ggc))
6162 		return;
6163 
6164 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6165 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6166 		dmar_map_gfx = 0;
6167 	} else if (dmar_map_gfx) {
6168 		/* we have to ensure the gfx device is idle before we flush */
6169 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6170 		intel_iommu_strict = 1;
6171        }
6172 }
/*
 * Register quirk_calpella_no_shadow_gtt as a PCI header fixup for each
 * Intel device ID listed below.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6177 
6178 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6179 {
6180 	unsigned short ver;
6181 
6182 	if (!IS_GFX_DEVICE(dev))
6183 		return;
6184 
6185 	ver = (dev->device >> 8) & 0xff;
6186 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6187 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6188 	    ver != 0x9a)
6189 		return;
6190 
6191 	if (risky_device(dev))
6192 		return;
6193 
6194 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6195 	iommu_skip_te_disable = 1;
6196 }
/*
 * Registered with PCI_ANY_ID as the device ID; the hook itself filters on
 * device class and device ID.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6198 
6199 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6200    ISOCH DMAR unit for the Azalia sound device, but not give it any
6201    TLB entries, which causes it to deadlock. Check for that.  We do
6202    this in a function called from init_dmars(), instead of in a PCI
6203    quirk, because we don't want to print the obnoxious "BIOS broken"
6204    message if VT-d is actually disabled.
6205 */
6206 static void __init check_tylersburg_isoch(void)
6207 {
6208 	struct pci_dev *pdev;
6209 	uint32_t vtisochctrl;
6210 
6211 	/* If there's no Azalia in the system anyway, forget it. */
6212 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6213 	if (!pdev)
6214 		return;
6215 
6216 	if (risky_device(pdev)) {
6217 		pci_dev_put(pdev);
6218 		return;
6219 	}
6220 
6221 	pci_dev_put(pdev);
6222 
6223 	/* System Management Registers. Might be hidden, in which case
6224 	   we can't do the sanity check. But that's OK, because the
6225 	   known-broken BIOSes _don't_ actually hide it, so far. */
6226 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6227 	if (!pdev)
6228 		return;
6229 
6230 	if (risky_device(pdev)) {
6231 		pci_dev_put(pdev);
6232 		return;
6233 	}
6234 
6235 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6236 		pci_dev_put(pdev);
6237 		return;
6238 	}
6239 
6240 	pci_dev_put(pdev);
6241 
6242 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6243 	if (vtisochctrl & 1)
6244 		return;
6245 
6246 	/* Drop all bits other than the number of TLB entries */
6247 	vtisochctrl &= 0x1c;
6248 
6249 	/* If we have the recommended number of TLB entries (16), fine. */
6250 	if (vtisochctrl == 0x10)
6251 		return;
6252 
6253 	/* Zero TLB entries? You get to ride the short bus to school. */
6254 	if (!vtisochctrl) {
6255 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6256 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6257 		     dmi_get_system_info(DMI_BIOS_VENDOR),
6258 		     dmi_get_system_info(DMI_BIOS_VERSION),
6259 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6260 		iommu_identity_mapping |= IDENTMAP_AZALIA;
6261 		return;
6262 	}
6263 
6264 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6265 	       vtisochctrl);
6266 }
6267