xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 89551fdd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28 
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
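/*
 * The second-level page tables use a 9-bit stride per level on top of the
 * 12-bit page offset, so AGAW n corresponds to a (30 + 9 * n)-bit address
 * width and (n + 2) paging levels: e.g. AGAW 2 is 48-bit/4-level and
 * AGAW 3 is 57-bit/5-level, matching the conversions below.
 */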
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
128 
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131 
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136 
137 /*
138  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144 
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
170 
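/*
 * Helpers for manipulating individual fields of a context entry. The
 * 'copied' flag is software-managed state used by the kdump path to tell
 * entries inherited from the previous kernel apart from entries that this
 * kernel has programmed itself; context_present() takes it into account.
 */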
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173 	context->lo &= ~(1ULL << 11);
174 }
175 
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178 	return !!(context->lo & (1ULL << 11));
179 }
180 
181 static inline void context_set_copied(struct context_entry *context)
182 {
183 	context->hi |= (1ull << 3);
184 }
185 
186 static inline bool context_copied(struct context_entry *context)
187 {
188 	return !!(context->hi & (1ULL << 3));
189 }
190 
191 static inline bool __context_present(struct context_entry *context)
192 {
193 	return (context->lo & 1);
194 }
195 
196 bool context_present(struct context_entry *context)
197 {
198 	return context_pasid_enabled(context) ?
199 	     __context_present(context) :
200 	     __context_present(context) && !context_copied(context);
201 }
202 
203 static inline void context_set_present(struct context_entry *context)
204 {
205 	context->lo |= 1;
206 }
207 
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210 	context->lo &= (((u64)-1) << 2) | 1;
211 }
212 
213 static inline void context_set_translation_type(struct context_entry *context,
214 						unsigned long value)
215 {
216 	context->lo &= (((u64)-1) << 4) | 3;
217 	context->lo |= (value & 3) << 2;
218 }
219 
220 static inline void context_set_address_root(struct context_entry *context,
221 					    unsigned long value)
222 {
223 	context->lo &= ~VTD_PAGE_MASK;
224 	context->lo |= value & VTD_PAGE_MASK;
225 }
226 
227 static inline void context_set_address_width(struct context_entry *context,
228 					     unsigned long value)
229 {
230 	context->hi |= value & 7;
231 }
232 
233 static inline void context_set_domain_id(struct context_entry *context,
234 					 unsigned long value)
235 {
236 	context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238 
239 static inline int context_domain_id(struct context_entry *c)
240 {
241 	return (c->hi >> 8) & 0xffff;
242 }
243 
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246 	context->lo = 0;
247 	context->hi = 0;
248 }
249 
250 /*
251  * This domain is a static identity mapping domain.
252  *	1. This domain creates a static 1:1 mapping to all usable memory.
253  * 	2. It is mapped into each iommu if successful.
254  *	3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258 
259 #define for_each_domain_iommu(idx, domain)			\
260 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
261 		if (domain->iommu_refcnt[idx])
262 
263 struct dmar_rmrr_unit {
264 	struct list_head list;		/* list of rmrr units	*/
265 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
266 	u64	base_address;		/* reserved base address*/
267 	u64	end_address;		/* reserved end address */
268 	struct dmar_dev_scope *devices;	/* target devices */
269 	int	devices_cnt;		/* target device count */
270 };
271 
272 struct dmar_atsr_unit {
273 	struct list_head list;		/* list of ATSR units */
274 	struct acpi_dmar_header *hdr;	/* ACPI header */
275 	struct dmar_dev_scope *devices;	/* target devices */
276 	int devices_cnt;		/* target device count */
277 	u8 include_all:1;		/* include all ports */
278 };
279 
280 struct dmar_satc_unit {
281 	struct list_head list;		/* list of SATC units */
282 	struct acpi_dmar_header *hdr;	/* ACPI header */
283 	struct dmar_dev_scope *devices;	/* target devices */
284 	struct intel_iommu *iommu;	/* the corresponding iommu */
285 	int devices_cnt;		/* target device count */
286 	u8 atc_required:1;		/* ATS is required */
287 };
288 
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292 
293 #define for_each_rmrr_units(rmrr) \
294 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295 
296 /* number of IOMMUs registered, used to size the per-iommu arrays */
297 static int g_num_of_iommus;
298 
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302 
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305 
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308 
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313 
314 #define IDENTMAP_GFX		2
315 #define IDENTMAP_AZALIA		4
316 
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319 
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322 
323 /*
324  * Iterate over elements in device_domain_list and call the specified
325  * callback @fn against each element.
326  */
327 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
328 				     void *data), void *data)
329 {
330 	int ret = 0;
331 	unsigned long flags;
332 	struct device_domain_info *info;
333 
334 	spin_lock_irqsave(&device_domain_lock, flags);
335 	list_for_each_entry(info, &device_domain_list, global) {
336 		ret = fn(info, data);
337 		if (ret) {
338 			spin_unlock_irqrestore(&device_domain_lock, flags);
339 			return ret;
340 		}
341 	}
342 	spin_unlock_irqrestore(&device_domain_lock, flags);
343 
344 	return 0;
345 }
346 
347 const struct iommu_ops intel_iommu_ops;
348 
349 static bool translation_pre_enabled(struct intel_iommu *iommu)
350 {
351 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
352 }
353 
354 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
355 {
356 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
357 }
358 
359 static void init_translation_status(struct intel_iommu *iommu)
360 {
361 	u32 gsts;
362 
363 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
364 	if (gsts & DMA_GSTS_TES)
365 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
366 }
367 
368 static int __init intel_iommu_setup(char *str)
369 {
370 	if (!str)
371 		return -EINVAL;
372 
373 	while (*str) {
374 		if (!strncmp(str, "on", 2)) {
375 			dmar_disabled = 0;
376 			pr_info("IOMMU enabled\n");
377 		} else if (!strncmp(str, "off", 3)) {
378 			dmar_disabled = 1;
379 			no_platform_optin = 1;
380 			pr_info("IOMMU disabled\n");
381 		} else if (!strncmp(str, "igfx_off", 8)) {
382 			dmar_map_gfx = 0;
383 			pr_info("Disable GFX device mapping\n");
384 		} else if (!strncmp(str, "forcedac", 8)) {
385 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
386 			iommu_dma_forcedac = true;
387 		} else if (!strncmp(str, "strict", 6)) {
388 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
389 			iommu_set_dma_strict();
390 		} else if (!strncmp(str, "sp_off", 6)) {
391 			pr_info("Disable supported super page\n");
392 			intel_iommu_superpage = 0;
393 		} else if (!strncmp(str, "sm_on", 5)) {
394 			pr_info("Enable scalable mode if hardware supports\n");
395 			intel_iommu_sm = 1;
396 		} else if (!strncmp(str, "sm_off", 6)) {
397 			pr_info("Scalable mode is disallowed\n");
398 			intel_iommu_sm = 0;
399 		} else if (!strncmp(str, "tboot_noforce", 13)) {
400 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
401 			intel_iommu_tboot_noforce = 1;
402 		} else {
403 			pr_notice("Unknown option - '%s'\n", str);
404 		}
405 
406 		str += strcspn(str, ",");
407 		while (*str == ',')
408 			str++;
409 	}
410 
411 	return 1;
412 }
413 __setup("intel_iommu=", intel_iommu_setup);
414 
415 void *alloc_pgtable_page(int node)
416 {
417 	struct page *page;
418 	void *vaddr = NULL;
419 
420 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
421 	if (page)
422 		vaddr = page_address(page);
423 	return vaddr;
424 }
425 
426 void free_pgtable_page(void *vaddr)
427 {
428 	free_page((unsigned long)vaddr);
429 }
430 
431 static inline int domain_type_is_si(struct dmar_domain *domain)
432 {
433 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
434 }
435 
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
437 {
438 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
439 }
440 
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
442 				       unsigned long pfn)
443 {
444 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
445 
446 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
447 }
448 
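/*
 * Pick the largest AGAW that is both no wider than max_gaw and advertised
 * in the IOMMU's SAGAW capability field; returns -1 if none qualifies.
 */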
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
450 {
451 	unsigned long sagaw;
452 	int agaw;
453 
454 	sagaw = cap_sagaw(iommu->cap);
455 	for (agaw = width_to_agaw(max_gaw);
456 	     agaw >= 0; agaw--) {
457 		if (test_bit(agaw, &sagaw))
458 			break;
459 	}
460 
461 	return agaw;
462 }
463 
464 /*
465  * Calculate max SAGAW for each iommu.
466  */
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
468 {
469 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
470 }
471 
472 /*
473  * Calculate agaw for each iommu.
474  * "SAGAW" may be different across iommus; use a default agaw, and fall
475  * back to a smaller supported agaw for iommus that don't support the default.
476  */
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
478 {
479 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
480 }
481 
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
484 {
485 	int iommu_id;
486 
487 	/* si_domain and vm domain should not get here. */
488 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
489 		return NULL;
490 
491 	for_each_domain_iommu(iommu_id, domain)
492 		break;
493 
494 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
495 		return NULL;
496 
497 	return g_iommus[iommu_id];
498 }
499 
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
501 {
502 	return sm_supported(iommu) ?
503 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
504 }
505 
506 static void domain_update_iommu_coherency(struct dmar_domain *domain)
507 {
508 	struct dmar_drhd_unit *drhd;
509 	struct intel_iommu *iommu;
510 	bool found = false;
511 	int i;
512 
513 	domain->iommu_coherency = true;
514 
515 	for_each_domain_iommu(i, domain) {
516 		found = true;
517 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
518 			domain->iommu_coherency = false;
519 			break;
520 		}
521 	}
522 	if (found)
523 		return;
524 
525 	/* No hardware attached; use lowest common denominator */
526 	rcu_read_lock();
527 	for_each_active_iommu(iommu, drhd) {
528 		if (!iommu_paging_structure_coherency(iommu)) {
529 			domain->iommu_coherency = false;
530 			break;
531 		}
532 	}
533 	rcu_read_unlock();
534 }
535 
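/*
 * Returns the largest superpage level usable on every IOMMU (except @skip):
 * 0 = no superpages, 1 = 2MiB, 2 = 2MiB and 1GiB.
 */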
536 static int domain_update_iommu_superpage(struct dmar_domain *domain,
537 					 struct intel_iommu *skip)
538 {
539 	struct dmar_drhd_unit *drhd;
540 	struct intel_iommu *iommu;
541 	int mask = 0x3;
542 
543 	if (!intel_iommu_superpage)
544 		return 0;
545 
546 	/* set iommu_superpage to the smallest common denominator */
547 	rcu_read_lock();
548 	for_each_active_iommu(iommu, drhd) {
549 		if (iommu != skip) {
550 			if (domain && domain_use_first_level(domain)) {
551 				if (!cap_fl1gp_support(iommu->cap))
552 					mask = 0x1;
553 			} else {
554 				mask &= cap_super_page_val(iommu->cap);
555 			}
556 
557 			if (!mask)
558 				break;
559 		}
560 	}
561 	rcu_read_unlock();
562 
563 	return fls(mask);
564 }
565 
566 static int domain_update_device_node(struct dmar_domain *domain)
567 {
568 	struct device_domain_info *info;
569 	int nid = NUMA_NO_NODE;
570 
571 	assert_spin_locked(&device_domain_lock);
572 
573 	if (list_empty(&domain->devices))
574 		return NUMA_NO_NODE;
575 
576 	list_for_each_entry(info, &domain->devices, link) {
577 		if (!info->dev)
578 			continue;
579 
580 		/*
581 		 * There could be multiple device NUMA nodes, as devices within
582 		 * the same domain may sit behind different IOMMUs. There is no
583 		 * perfect answer in such a situation, so we use a first-come,
584 		 * first-served policy.
585 		 */
586 		nid = dev_to_node(info->dev);
587 		if (nid != NUMA_NO_NODE)
588 			break;
589 	}
590 
591 	return nid;
592 }
593 
594 static void domain_update_iotlb(struct dmar_domain *domain);
595 
596 /* Return the super pagesize bitmap if supported. */
597 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
598 {
599 	unsigned long bitmap = 0;
600 
601 	/*
602 	 * 1-level super page supports page size of 2MiB, 2-level super page
603 	 * supports page size of both 2MiB and 1GiB.
604 	 */
605 	if (domain->iommu_superpage == 1)
606 		bitmap |= SZ_2M;
607 	else if (domain->iommu_superpage == 2)
608 		bitmap |= SZ_2M | SZ_1G;
609 
610 	return bitmap;
611 }
612 
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616 	domain_update_iommu_coherency(domain);
617 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
618 
619 	/*
620 	 * If RHSA is missing, we should default to the device numa domain
621 	 * as a fallback.
622 	 */
623 	if (domain->nid == NUMA_NO_NODE)
624 		domain->nid = domain_update_device_node(domain);
625 
626 	/*
627 	 * First-level translation restricts the input-address to a
628 	 * canonical address (i.e., address bits 63:N have the same
629 	 * value as address bit [N-1], where N is 48-bits with 4-level
630 	 * paging and 57-bits with 5-level paging). Hence, skip bit
631 	 * [N-1].
632 	 */
633 	if (domain_use_first_level(domain))
634 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
635 	else
636 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
637 
638 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
639 	domain_update_iotlb(domain);
640 }
641 
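/*
 * In legacy mode a root entry points to a single 256-entry context table.
 * In scalable mode each root entry holds two table pointers (lo for devfn
 * 0-127, hi for devfn 128-255) and each scalable-mode context entry is
 * twice the size of a legacy one, hence the devfn adjustment below.
 */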
642 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
643 					 u8 devfn, int alloc)
644 {
645 	struct root_entry *root = &iommu->root_entry[bus];
646 	struct context_entry *context;
647 	u64 *entry;
648 
649 	entry = &root->lo;
650 	if (sm_supported(iommu)) {
651 		if (devfn >= 0x80) {
652 			devfn -= 0x80;
653 			entry = &root->hi;
654 		}
655 		devfn *= 2;
656 	}
657 	if (*entry & 1)
658 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
659 	else {
660 		unsigned long phy_addr;
661 		if (!alloc)
662 			return NULL;
663 
664 		context = alloc_pgtable_page(iommu->node);
665 		if (!context)
666 			return NULL;
667 
668 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
669 		phy_addr = virt_to_phys((void *)context);
670 		*entry = phy_addr | 1;
671 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
672 	}
673 	return &context[devfn];
674 }
675 
676 /**
677  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
678  *				 sub-hierarchy of a candidate PCI-PCI bridge
679  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
680  * @bridge: the candidate PCI-PCI bridge
681  *
682  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
683  */
684 static bool
685 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
686 {
687 	struct pci_dev *pdev, *pbridge;
688 
689 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
690 		return false;
691 
692 	pdev = to_pci_dev(dev);
693 	pbridge = to_pci_dev(bridge);
694 
695 	if (pbridge->subordinate &&
696 	    pbridge->subordinate->number <= pdev->bus->number &&
697 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
698 		return true;
699 
700 	return false;
701 }
702 
703 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
704 {
705 	struct dmar_drhd_unit *drhd;
706 	u32 vtbar;
707 	int rc;
708 
709 	/* We know that this device on this chipset has its own IOMMU.
710 	 * If we find it under a different IOMMU, then the BIOS is lying
711 	 * to us. Hope that the IOMMU for this device is actually
712 	 * disabled, and it needs no translation...
713 	 */
714 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
715 	if (rc) {
716 		/* "can't" happen */
717 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
718 		return false;
719 	}
720 	vtbar &= 0xffff0000;
721 
722 	/* we know that this iommu should be at offset 0xa000 from vtbar */
723 	drhd = dmar_find_matched_drhd_unit(pdev);
724 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
725 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
726 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
727 		return true;
728 	}
729 
730 	return false;
731 }
732 
733 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
734 {
735 	if (!iommu || iommu->drhd->ignored)
736 		return true;
737 
738 	if (dev_is_pci(dev)) {
739 		struct pci_dev *pdev = to_pci_dev(dev);
740 
741 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
742 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
743 		    quirk_ioat_snb_local_iommu(pdev))
744 			return true;
745 	}
746 
747 	return false;
748 }
749 
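/*
 * device_to_iommu - find the IOMMU (DRHD unit) that translates @dev by
 * walking the DMAR device scopes. On success, @bus and @devfn (if non-NULL)
 * are set to the bus/devfn under which the IOMMU sees requests from the
 * device, which is taken from the scope table for exact matches and from
 * the real DMA device for VFs, bridged devices and include_all units.
 */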
750 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
751 {
752 	struct dmar_drhd_unit *drhd = NULL;
753 	struct pci_dev *pdev = NULL;
754 	struct intel_iommu *iommu;
755 	struct device *tmp;
756 	u16 segment = 0;
757 	int i;
758 
759 	if (!dev)
760 		return NULL;
761 
762 	if (dev_is_pci(dev)) {
763 		struct pci_dev *pf_pdev;
764 
765 		pdev = pci_real_dma_dev(to_pci_dev(dev));
766 
767 		/* VFs aren't listed in scope tables; we need to look up
768 		 * the PF instead to find the IOMMU. */
769 		pf_pdev = pci_physfn(pdev);
770 		dev = &pf_pdev->dev;
771 		segment = pci_domain_nr(pdev->bus);
772 	} else if (has_acpi_companion(dev))
773 		dev = &ACPI_COMPANION(dev)->dev;
774 
775 	rcu_read_lock();
776 	for_each_iommu(iommu, drhd) {
777 		if (pdev && segment != drhd->segment)
778 			continue;
779 
780 		for_each_active_dev_scope(drhd->devices,
781 					  drhd->devices_cnt, i, tmp) {
782 			if (tmp == dev) {
783 				/* For a VF use its original BDF# not that of the PF
784 				 * which we used for the IOMMU lookup. Strictly speaking
785 				 * we could do this for all PCI devices; we only need to
786 				 * get the BDF# from the scope table for ACPI matches. */
787 				if (pdev && pdev->is_virtfn)
788 					goto got_pdev;
789 
790 				if (bus && devfn) {
791 					*bus = drhd->devices[i].bus;
792 					*devfn = drhd->devices[i].devfn;
793 				}
794 				goto out;
795 			}
796 
797 			if (is_downstream_to_pci_bridge(dev, tmp))
798 				goto got_pdev;
799 		}
800 
801 		if (pdev && drhd->include_all) {
802 got_pdev:
803 			if (bus && devfn) {
804 				*bus = pdev->bus->number;
805 				*devfn = pdev->devfn;
806 			}
807 			goto out;
808 		}
809 	}
810 	iommu = NULL;
811 out:
812 	if (iommu_is_dummy(iommu, dev))
813 		iommu = NULL;
814 
815 	rcu_read_unlock();
816 
817 	return iommu;
818 }
819 
820 static void domain_flush_cache(struct dmar_domain *domain,
821 			       void *addr, int size)
822 {
823 	if (!domain->iommu_coherency)
824 		clflush_cache_range(addr, size);
825 }
826 
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
828 {
829 	struct context_entry *context;
830 	int ret = 0;
831 	unsigned long flags;
832 
833 	spin_lock_irqsave(&iommu->lock, flags);
834 	context = iommu_context_addr(iommu, bus, devfn, 0);
835 	if (context)
836 		ret = context_present(context);
837 	spin_unlock_irqrestore(&iommu->lock, flags);
838 	return ret;
839 }
840 
841 static void free_context_table(struct intel_iommu *iommu)
842 {
843 	int i;
844 	unsigned long flags;
845 	struct context_entry *context;
846 
847 	spin_lock_irqsave(&iommu->lock, flags);
848 	if (!iommu->root_entry) {
849 		goto out;
850 	}
851 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
852 		context = iommu_context_addr(iommu, i, 0, 0);
853 		if (context)
854 			free_pgtable_page(context);
855 
856 		if (!sm_supported(iommu))
857 			continue;
858 
859 		context = iommu_context_addr(iommu, i, 0x80, 0);
860 		if (context)
861 			free_pgtable_page(context);
862 
863 	}
864 	free_pgtable_page(iommu->root_entry);
865 	iommu->root_entry = NULL;
866 out:
867 	spin_unlock_irqrestore(&iommu->lock, flags);
868 }
869 
870 #ifdef CONFIG_DMAR_DEBUG
871 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
872 {
873 	struct device_domain_info *info;
874 	struct dma_pte *parent, *pte;
875 	struct dmar_domain *domain;
876 	int offset, level;
877 
878 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
879 	if (!info || !info->domain) {
880 		pr_info("device [%02x:%02x.%d] not probed\n",
881 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
882 		return;
883 	}
884 
885 	domain = info->domain;
886 	level = agaw_to_level(domain->agaw);
887 	parent = domain->pgd;
888 	if (!parent) {
889 		pr_info("no page table setup\n");
890 		return;
891 	}
892 
893 	while (1) {
894 		offset = pfn_level_offset(pfn, level);
895 		pte = &parent[offset];
896 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
897 			pr_info("PTE not present at level %d\n", level);
898 			break;
899 		}
900 
901 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
902 
903 		if (level == 1)
904 			break;
905 
906 		parent = phys_to_virt(dma_pte_addr(pte));
907 		level--;
908 	}
909 }
910 
911 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
912 			  unsigned long long addr, u32 pasid)
913 {
914 	struct pasid_dir_entry *dir, *pde;
915 	struct pasid_entry *entries, *pte;
916 	struct context_entry *ctx_entry;
917 	struct root_entry *rt_entry;
918 	u8 devfn = source_id & 0xff;
919 	u8 bus = source_id >> 8;
920 	int i, dir_index, index;
921 
922 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
923 
924 	/* root entry dump */
925 	if (!iommu->root_entry) {
926 		pr_info("root table entry is not present\n");
927 		return;
928 	}
929 	rt_entry = &iommu->root_entry[bus];
930 
931 	if (sm_supported(iommu))
932 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
933 			rt_entry->hi, rt_entry->lo);
934 	else
935 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
936 
937 	/* context entry dump */
938 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
939 	if (!ctx_entry) {
940 		pr_info("context table entry is not present\n");
941 		return;
942 	}
943 
944 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
945 		ctx_entry->hi, ctx_entry->lo);
946 
947 	/* legacy mode does not require PASID entries */
948 	if (!sm_supported(iommu))
949 		goto pgtable_walk;
950 
951 	/* get the pointer to pasid directory entry */
952 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
953 	if (!dir) {
954 		pr_info("pasid directory entry is not present\n");
955 		return;
956 	}
957 	/* For request-without-pasid, get the pasid from context entry */
958 	if (intel_iommu_sm && pasid == INVALID_IOASID)
959 		pasid = PASID_RID2PASID;
960 
961 	dir_index = pasid >> PASID_PDE_SHIFT;
962 	pde = &dir[dir_index];
963 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
964 
965 	/* get the pointer to the pasid table entry */
966 	entries = get_pasid_table_from_pde(pde);
967 	if (!entries) {
968 		pr_info("pasid table entry is not present\n");
969 		return;
970 	}
971 	index = pasid & PASID_PTE_MASK;
972 	pte = &entries[index];
973 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
974 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
975 
976 pgtable_walk:
977 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
978 }
979 #endif
980 
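/*
 * Walk (and, if necessary, build) the page table down to @pfn. With a
 * non-zero *target_level, intermediate tables are allocated and the PTE at
 * that level is returned. With *target_level == 0, the walk stops at
 * whatever leaf currently covers @pfn and *target_level is updated to the
 * level actually reached. Returns NULL if @pfn is beyond the domain's
 * addressing capability or an allocation fails.
 */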
981 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
982 				      unsigned long pfn, int *target_level)
983 {
984 	struct dma_pte *parent, *pte;
985 	int level = agaw_to_level(domain->agaw);
986 	int offset;
987 
988 	BUG_ON(!domain->pgd);
989 
990 	if (!domain_pfn_supported(domain, pfn))
991 		/* Address beyond IOMMU's addressing capabilities. */
992 		return NULL;
993 
994 	parent = domain->pgd;
995 
996 	while (1) {
997 		void *tmp_page;
998 
999 		offset = pfn_level_offset(pfn, level);
1000 		pte = &parent[offset];
1001 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1002 			break;
1003 		if (level == *target_level)
1004 			break;
1005 
1006 		if (!dma_pte_present(pte)) {
1007 			uint64_t pteval;
1008 
1009 			tmp_page = alloc_pgtable_page(domain->nid);
1010 
1011 			if (!tmp_page)
1012 				return NULL;
1013 
1014 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1015 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1016 			if (domain_use_first_level(domain)) {
1017 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1018 				if (iommu_is_dma_domain(&domain->domain))
1019 					pteval |= DMA_FL_PTE_ACCESS;
1020 			}
1021 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1022 				/* Someone else set it while we were thinking; use theirs. */
1023 				free_pgtable_page(tmp_page);
1024 			else
1025 				domain_flush_cache(domain, pte, sizeof(*pte));
1026 		}
1027 		if (level == 1)
1028 			break;
1029 
1030 		parent = phys_to_virt(dma_pte_addr(pte));
1031 		level--;
1032 	}
1033 
1034 	if (!*target_level)
1035 		*target_level = level;
1036 
1037 	return pte;
1038 }
1039 
1040 /* return address's pte at specific level */
1041 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1042 					 unsigned long pfn,
1043 					 int level, int *large_page)
1044 {
1045 	struct dma_pte *parent, *pte;
1046 	int total = agaw_to_level(domain->agaw);
1047 	int offset;
1048 
1049 	parent = domain->pgd;
1050 	while (level <= total) {
1051 		offset = pfn_level_offset(pfn, total);
1052 		pte = &parent[offset];
1053 		if (level == total)
1054 			return pte;
1055 
1056 		if (!dma_pte_present(pte)) {
1057 			*large_page = total;
1058 			break;
1059 		}
1060 
1061 		if (dma_pte_superpage(pte)) {
1062 			*large_page = total;
1063 			return pte;
1064 		}
1065 
1066 		parent = phys_to_virt(dma_pte_addr(pte));
1067 		total--;
1068 	}
1069 	return NULL;
1070 }
1071 
1072 /* clear last level pte; a tlb flush should follow */
1073 static void dma_pte_clear_range(struct dmar_domain *domain,
1074 				unsigned long start_pfn,
1075 				unsigned long last_pfn)
1076 {
1077 	unsigned int large_page;
1078 	struct dma_pte *first_pte, *pte;
1079 
1080 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1081 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1082 	BUG_ON(start_pfn > last_pfn);
1083 
1084 	/* we don't need lock here; nobody else touches the iova range */
1085 	do {
1086 		large_page = 1;
1087 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1088 		if (!pte) {
1089 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1090 			continue;
1091 		}
1092 		do {
1093 			dma_clear_pte(pte);
1094 			start_pfn += lvl_to_nr_pages(large_page);
1095 			pte++;
1096 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1097 
1098 		domain_flush_cache(domain, first_pte,
1099 				   (void *)pte - (void *)first_pte);
1100 
1101 	} while (start_pfn && start_pfn <= last_pfn);
1102 }
1103 
1104 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1105 			       int retain_level, struct dma_pte *pte,
1106 			       unsigned long pfn, unsigned long start_pfn,
1107 			       unsigned long last_pfn)
1108 {
1109 	pfn = max(start_pfn, pfn);
1110 	pte = &pte[pfn_level_offset(pfn, level)];
1111 
1112 	do {
1113 		unsigned long level_pfn;
1114 		struct dma_pte *level_pte;
1115 
1116 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1117 			goto next;
1118 
1119 		level_pfn = pfn & level_mask(level);
1120 		level_pte = phys_to_virt(dma_pte_addr(pte));
1121 
1122 		if (level > 2) {
1123 			dma_pte_free_level(domain, level - 1, retain_level,
1124 					   level_pte, level_pfn, start_pfn,
1125 					   last_pfn);
1126 		}
1127 
1128 		/*
1129 		 * Free the page table if we're below the level we want to
1130 		 * retain and the range covers the entire table.
1131 		 */
1132 		if (level < retain_level && !(start_pfn > level_pfn ||
1133 		      last_pfn < level_pfn + level_size(level) - 1)) {
1134 			dma_clear_pte(pte);
1135 			domain_flush_cache(domain, pte, sizeof(*pte));
1136 			free_pgtable_page(level_pte);
1137 		}
1138 next:
1139 		pfn += level_size(level);
1140 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1141 }
1142 
1143 /*
1144  * clear last level (leaf) ptes and free page table pages below the
1145  * level we wish to keep intact.
1146  */
1147 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1148 				   unsigned long start_pfn,
1149 				   unsigned long last_pfn,
1150 				   int retain_level)
1151 {
1152 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1153 
1154 	/* We don't need lock here; nobody else touches the iova range */
1155 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1156 			   domain->pgd, 0, start_pfn, last_pfn);
1157 
1158 	/* free pgd */
1159 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1160 		free_pgtable_page(domain->pgd);
1161 		domain->pgd = NULL;
1162 	}
1163 }
1164 
1165 /* When a page at a given level is being unlinked from its parent, we don't
1166    need to *modify* it at all. All we need to do is make a list of all the
1167    pages which can be freed just as soon as we've flushed the IOTLB and we
1168    know the hardware page-walk will no longer touch them.
1169    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1170    be freed. */
1171 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1172 				    int level, struct dma_pte *pte,
1173 				    struct list_head *freelist)
1174 {
1175 	struct page *pg;
1176 
1177 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1178 	list_add_tail(&pg->lru, freelist);
1179 
1180 	if (level == 1)
1181 		return;
1182 
1183 	pte = page_address(pg);
1184 	do {
1185 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1186 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1187 		pte++;
1188 	} while (!first_pte_in_page(pte));
1189 }
1190 
1191 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1192 				struct dma_pte *pte, unsigned long pfn,
1193 				unsigned long start_pfn, unsigned long last_pfn,
1194 				struct list_head *freelist)
1195 {
1196 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1197 
1198 	pfn = max(start_pfn, pfn);
1199 	pte = &pte[pfn_level_offset(pfn, level)];
1200 
1201 	do {
1202 		unsigned long level_pfn = pfn & level_mask(level);
1203 
1204 		if (!dma_pte_present(pte))
1205 			goto next;
1206 
1207 		/* If range covers entire pagetable, free it */
1208 		if (start_pfn <= level_pfn &&
1209 		    last_pfn >= level_pfn + level_size(level) - 1) {
1210 			/* These subordinate page tables are going away entirely. Don't
1211 			   bother to clear them; we're just going to *free* them. */
1212 			if (level > 1 && !dma_pte_superpage(pte))
1213 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1214 
1215 			dma_clear_pte(pte);
1216 			if (!first_pte)
1217 				first_pte = pte;
1218 			last_pte = pte;
1219 		} else if (level > 1) {
1220 			/* Recurse down into a level that isn't *entirely* obsolete */
1221 			dma_pte_clear_level(domain, level - 1,
1222 					    phys_to_virt(dma_pte_addr(pte)),
1223 					    level_pfn, start_pfn, last_pfn,
1224 					    freelist);
1225 		}
1226 next:
1227 		pfn = level_pfn + level_size(level);
1228 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1229 
1230 	if (first_pte)
1231 		domain_flush_cache(domain, first_pte,
1232 				   (void *)++last_pte - (void *)first_pte);
1233 }
1234 
1235 /* We can't just free the pages because the IOMMU may still be walking
1236    the page tables, and may have cached the intermediate levels. The
1237    pages can only be freed after the IOTLB flush has been done. */
1238 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1239 			 unsigned long last_pfn, struct list_head *freelist)
1240 {
1241 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1242 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1243 	BUG_ON(start_pfn > last_pfn);
1244 
1245 	/* we don't need lock here; nobody else touches the iova range */
1246 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1247 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1248 
1249 	/* free pgd */
1250 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1251 		struct page *pgd_page = virt_to_page(domain->pgd);
1252 		list_add_tail(&pgd_page->lru, freelist);
1253 		domain->pgd = NULL;
1254 	}
1255 }
1256 
1257 /* iommu handling */
1258 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1259 {
1260 	struct root_entry *root;
1261 	unsigned long flags;
1262 
1263 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1264 	if (!root) {
1265 		pr_err("Allocating root entry for %s failed\n",
1266 			iommu->name);
1267 		return -ENOMEM;
1268 	}
1269 
1270 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1271 
1272 	spin_lock_irqsave(&iommu->lock, flags);
1273 	iommu->root_entry = root;
1274 	spin_unlock_irqrestore(&iommu->lock, flags);
1275 
1276 	return 0;
1277 }
1278 
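/*
 * Program the root (or scalable-mode root) table address, issue the Set
 * Root Table Pointer command, and then perform the global context-cache,
 * PASID-cache (scalable mode only) and IOTLB invalidations required after
 * changing the root pointer.
 */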
1279 static void iommu_set_root_entry(struct intel_iommu *iommu)
1280 {
1281 	u64 addr;
1282 	u32 sts;
1283 	unsigned long flag;
1284 
1285 	addr = virt_to_phys(iommu->root_entry);
1286 	if (sm_supported(iommu))
1287 		addr |= DMA_RTADDR_SMT;
1288 
1289 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1290 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1291 
1292 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1293 
1294 	/* Make sure hardware complete it */
1295 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1296 		      readl, (sts & DMA_GSTS_RTPS), sts);
1297 
1298 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1299 
1300 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1301 	if (sm_supported(iommu))
1302 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1303 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1304 }
1305 
1306 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1307 {
1308 	u32 val;
1309 	unsigned long flag;
1310 
1311 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1312 		return;
1313 
1314 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1315 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1316 
1317 	/* Make sure hardware complete it */
1318 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1319 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1320 
1321 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1322 }
1323 
1324 /* Invalidate the context-cache through the register-based interface */
1325 static void __iommu_flush_context(struct intel_iommu *iommu,
1326 				  u16 did, u16 source_id, u8 function_mask,
1327 				  u64 type)
1328 {
1329 	u64 val = 0;
1330 	unsigned long flag;
1331 
1332 	switch (type) {
1333 	case DMA_CCMD_GLOBAL_INVL:
1334 		val = DMA_CCMD_GLOBAL_INVL;
1335 		break;
1336 	case DMA_CCMD_DOMAIN_INVL:
1337 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1338 		break;
1339 	case DMA_CCMD_DEVICE_INVL:
1340 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1341 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1342 		break;
1343 	default:
1344 		BUG();
1345 	}
1346 	val |= DMA_CCMD_ICC;
1347 
1348 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1349 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1350 
1351 	/* Make sure hardware complete it */
1352 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1353 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1354 
1355 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1356 }
1357 
1358 /* Invalidate the IOTLB through the register-based invalidation interface */
1359 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1360 				u64 addr, unsigned int size_order, u64 type)
1361 {
1362 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1363 	u64 val = 0, val_iva = 0;
1364 	unsigned long flag;
1365 
1366 	switch (type) {
1367 	case DMA_TLB_GLOBAL_FLUSH:
1368 		/* global flush doesn't need set IVA_REG */
1369 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1370 		break;
1371 	case DMA_TLB_DSI_FLUSH:
1372 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1373 		break;
1374 	case DMA_TLB_PSI_FLUSH:
1375 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1376 		/* IH bit is passed in as part of address */
1377 		val_iva = size_order | addr;
1378 		break;
1379 	default:
1380 		BUG();
1381 	}
1382 	/* Note: set drain read/write */
1383 #if 0
1384 	/*
1385 	 * This is probably meant to be extra safe. It looks like we can
1386 	 * ignore it without any impact.
1387 	 */
1388 	if (cap_read_drain(iommu->cap))
1389 		val |= DMA_TLB_READ_DRAIN;
1390 #endif
1391 	if (cap_write_drain(iommu->cap))
1392 		val |= DMA_TLB_WRITE_DRAIN;
1393 
1394 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1395 	/* Note: Only uses first TLB reg currently */
1396 	if (val_iva)
1397 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1398 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1399 
1400 	/* Make sure hardware complete it */
1401 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1402 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1403 
1404 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1405 
1406 	/* check IOTLB invalidation granularity */
1407 	if (DMA_TLB_IAIG(val) == 0)
1408 		pr_err("Flush IOTLB failed\n");
1409 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1410 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1411 			(unsigned long long)DMA_TLB_IIRG(type),
1412 			(unsigned long long)DMA_TLB_IAIG(val));
1413 }
1414 
1415 static struct device_domain_info *
1416 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1417 			 u8 bus, u8 devfn)
1418 {
1419 	struct device_domain_info *info;
1420 
1421 	assert_spin_locked(&device_domain_lock);
1422 
1423 	if (!iommu->qi)
1424 		return NULL;
1425 
1426 	list_for_each_entry(info, &domain->devices, link)
1427 		if (info->iommu == iommu && info->bus == bus &&
1428 		    info->devfn == devfn) {
1429 			if (info->ats_supported && info->dev)
1430 				return info;
1431 			break;
1432 		}
1433 
1434 	return NULL;
1435 }
1436 
1437 static void domain_update_iotlb(struct dmar_domain *domain)
1438 {
1439 	struct device_domain_info *info;
1440 	bool has_iotlb_device = false;
1441 
1442 	assert_spin_locked(&device_domain_lock);
1443 
1444 	list_for_each_entry(info, &domain->devices, link)
1445 		if (info->ats_enabled) {
1446 			has_iotlb_device = true;
1447 			break;
1448 		}
1449 
1450 	domain->has_iotlb_device = has_iotlb_device;
1451 }
1452 
1453 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1454 {
1455 	struct pci_dev *pdev;
1456 
1457 	assert_spin_locked(&device_domain_lock);
1458 
1459 	if (!info || !dev_is_pci(info->dev))
1460 		return;
1461 
1462 	pdev = to_pci_dev(info->dev);
1463 	/* For IOMMUs that support device-IOTLB throttling (DIT), we assign the
1464 	 * PFSID in a VF's invalidation descriptors so that the IOMMU HW can
1465 	 * gauge the queue depth at the PF level. If DIT is not set, PFSID is
1466 	 * treated as reserved and should be set to 0.
1467 	 */
1468 	if (!ecap_dit(info->iommu->ecap))
1469 		info->pfsid = 0;
1470 	else {
1471 		struct pci_dev *pf_pdev;
1472 
1473 		/* pdev will be returned if device is not a vf */
1474 		pf_pdev = pci_physfn(pdev);
1475 		info->pfsid = pci_dev_id(pf_pdev);
1476 	}
1477 
1478 #ifdef CONFIG_INTEL_IOMMU_SVM
1479 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1480 	   the device if you enable PASID support after ATS support is
1481 	   undefined. So always enable PASID support on devices which
1482 	   have it, even if we can't yet know if we're ever going to
1483 	   use it. */
1484 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1485 		info->pasid_enabled = 1;
1486 
1487 	if (info->pri_supported &&
1488 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1489 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1490 		info->pri_enabled = 1;
1491 #endif
1492 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1493 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1494 		info->ats_enabled = 1;
1495 		domain_update_iotlb(info->domain);
1496 		info->ats_qdep = pci_ats_queue_depth(pdev);
1497 	}
1498 }
1499 
1500 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1501 {
1502 	struct pci_dev *pdev;
1503 
1504 	assert_spin_locked(&device_domain_lock);
1505 
1506 	if (!dev_is_pci(info->dev))
1507 		return;
1508 
1509 	pdev = to_pci_dev(info->dev);
1510 
1511 	if (info->ats_enabled) {
1512 		pci_disable_ats(pdev);
1513 		info->ats_enabled = 0;
1514 		domain_update_iotlb(info->domain);
1515 	}
1516 #ifdef CONFIG_INTEL_IOMMU_SVM
1517 	if (info->pri_enabled) {
1518 		pci_disable_pri(pdev);
1519 		info->pri_enabled = 0;
1520 	}
1521 	if (info->pasid_enabled) {
1522 		pci_disable_pasid(pdev);
1523 		info->pasid_enabled = 0;
1524 	}
1525 #endif
1526 }
1527 
1528 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1529 				    u64 addr, unsigned int mask)
1530 {
1531 	u16 sid, qdep;
1532 
1533 	if (!info || !info->ats_enabled)
1534 		return;
1535 
1536 	sid = info->bus << 8 | info->devfn;
1537 	qdep = info->ats_qdep;
1538 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1539 			   qdep, addr, mask);
1540 }
1541 
1542 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1543 				  u64 addr, unsigned mask)
1544 {
1545 	unsigned long flags;
1546 	struct device_domain_info *info;
1547 
1548 	if (!domain->has_iotlb_device)
1549 		return;
1550 
1551 	spin_lock_irqsave(&device_domain_lock, flags);
1552 	list_for_each_entry(info, &domain->devices, link)
1553 		__iommu_flush_dev_iotlb(info, addr, mask);
1554 
1555 	spin_unlock_irqrestore(&device_domain_lock, flags);
1556 }
1557 
1558 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1559 				  struct dmar_domain *domain,
1560 				  unsigned long pfn, unsigned int pages,
1561 				  int ih, int map)
1562 {
1563 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1564 	unsigned int mask = ilog2(aligned_pages);
1565 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1566 	u16 did = domain->iommu_did[iommu->seq_id];
1567 
1568 	BUG_ON(pages == 0);
1569 
1570 	if (ih)
1571 		ih = 1 << 6;
1572 
1573 	if (domain_use_first_level(domain)) {
1574 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1575 	} else {
1576 		unsigned long bitmask = aligned_pages - 1;
1577 
1578 		/*
1579 		 * PSI masks the low order bits of the base address. If the
1580 		 * address isn't aligned to the mask, then compute a mask value
1581 		 * needed to ensure the target range is flushed.
1582 		 */
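		/*
		 * For example (illustrative numbers): flushing pfn 9, pages 2
		 * gives aligned_pages == 2 and bitmask == 1. Since pfn is not
		 * aligned, shared_bits = ~(9 ^ 10) & ~1, whose lowest set bit
		 * is 2, so mask becomes 2 and the PSI covers pfns 8-11, a
		 * superset of the requested 9-10.
		 */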
1583 		if (unlikely(bitmask & pfn)) {
1584 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1585 
1586 			/*
1587 			 * Since end_pfn <= pfn + bitmask, the only way bits
1588 			 * higher than bitmask can differ in pfn and end_pfn is
1589 			 * by carrying. This means after masking out bitmask,
1590 			 * high bits starting with the first set bit in
1591 			 * shared_bits are all equal in both pfn and end_pfn.
1592 			 */
1593 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1594 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1595 		}
1596 
1597 		/*
1598 		 * Fall back to domain-selective flush if there is no PSI support or
1599 		 * the size is too big.
1600 		 */
1601 		if (!cap_pgsel_inv(iommu->cap) ||
1602 		    mask > cap_max_amask_val(iommu->cap))
1603 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1604 							DMA_TLB_DSI_FLUSH);
1605 		else
1606 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1607 							DMA_TLB_PSI_FLUSH);
1608 	}
1609 
1610 	/*
1611 	 * In caching mode, changes of pages from non-present to present require
1612 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1613 	 */
1614 	if (!cap_caching_mode(iommu->cap) || !map)
1615 		iommu_flush_dev_iotlb(domain, addr, mask);
1616 }
1617 
1618 /* Notification for newly created mappings */
1619 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1620 					struct dmar_domain *domain,
1621 					unsigned long pfn, unsigned int pages)
1622 {
1623 	/*
1624 	 * It's a non-present to present mapping. Only flush if caching mode
1625 	 * and second level.
1626 	 */
1627 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1628 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1629 	else
1630 		iommu_flush_write_buffer(iommu);
1631 }
1632 
1633 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1634 {
1635 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1636 	int idx;
1637 
1638 	for_each_domain_iommu(idx, dmar_domain) {
1639 		struct intel_iommu *iommu = g_iommus[idx];
1640 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1641 
1642 		if (domain_use_first_level(dmar_domain))
1643 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1644 		else
1645 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1646 						 DMA_TLB_DSI_FLUSH);
1647 
1648 		if (!cap_caching_mode(iommu->cap))
1649 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1650 	}
1651 }
1652 
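/*
 * Clear the Enable Protected Memory bit so that the protected low/high
 * memory regions (PLMR/PHMR) that firmware such as tboot/TXT may have armed
 * stop blocking DMA, then wait for the protected-region status to clear.
 */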
1653 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1654 {
1655 	u32 pmen;
1656 	unsigned long flags;
1657 
1658 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1659 		return;
1660 
1661 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1662 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1663 	pmen &= ~DMA_PMEN_EPM;
1664 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1665 
1666 	/* wait for the protected region status bit to clear */
1667 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1668 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1669 
1670 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1671 }
1672 
1673 static void iommu_enable_translation(struct intel_iommu *iommu)
1674 {
1675 	u32 sts;
1676 	unsigned long flags;
1677 
1678 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1679 	iommu->gcmd |= DMA_GCMD_TE;
1680 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1681 
1682 	/* Make sure hardware complete it */
1683 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1684 		      readl, (sts & DMA_GSTS_TES), sts);
1685 
1686 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1687 }
1688 
1689 static void iommu_disable_translation(struct intel_iommu *iommu)
1690 {
1691 	u32 sts;
1692 	unsigned long flag;
1693 
1694 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1695 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1696 		return;
1697 
1698 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1699 	iommu->gcmd &= ~DMA_GCMD_TE;
1700 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1701 
1702 	/* Make sure hardware complete it */
1703 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1704 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1705 
1706 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1707 }
1708 
1709 static int iommu_init_domains(struct intel_iommu *iommu)
1710 {
1711 	u32 ndomains;
1712 
1713 	ndomains = cap_ndoms(iommu->cap);
1714 	pr_debug("%s: Number of Domains supported <%d>\n",
1715 		 iommu->name, ndomains);
1716 
1717 	spin_lock_init(&iommu->lock);
1718 
1719 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1720 	if (!iommu->domain_ids)
1721 		return -ENOMEM;
1722 
1723 	/*
1724 	 * If Caching mode is set, then invalid translations are tagged
1725 	 * with domain-id 0, hence we need to pre-allocate it. We also
1726 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1727 	 * make sure it is not used for a real domain.
1728 	 */
1729 	set_bit(0, iommu->domain_ids);
1730 
1731 	/*
1732 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1733 	 * entry for first-level or pass-through translation modes should
1734 	 * be programmed with a domain id different from those used for
1735 	 * second-level or nested translation. We reserve a domain id for
1736 	 * this purpose.
1737 	 */
1738 	if (sm_supported(iommu))
1739 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1740 
1741 	return 0;
1742 }
1743 
1744 static void disable_dmar_iommu(struct intel_iommu *iommu)
1745 {
1746 	struct device_domain_info *info, *tmp;
1747 	unsigned long flags;
1748 
1749 	if (!iommu->domain_ids)
1750 		return;
1751 
1752 	spin_lock_irqsave(&device_domain_lock, flags);
1753 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1754 		if (info->iommu != iommu)
1755 			continue;
1756 
1757 		if (!info->dev || !info->domain)
1758 			continue;
1759 
1760 		__dmar_remove_one_dev_info(info);
1761 	}
1762 	spin_unlock_irqrestore(&device_domain_lock, flags);
1763 
1764 	if (iommu->gcmd & DMA_GCMD_TE)
1765 		iommu_disable_translation(iommu);
1766 }
1767 
1768 static void free_dmar_iommu(struct intel_iommu *iommu)
1769 {
1770 	if (iommu->domain_ids) {
1771 		bitmap_free(iommu->domain_ids);
1772 		iommu->domain_ids = NULL;
1773 	}
1774 
1775 	g_iommus[iommu->seq_id] = NULL;
1776 
1777 	/* free context mapping */
1778 	free_context_table(iommu);
1779 
1780 #ifdef CONFIG_INTEL_IOMMU_SVM
1781 	if (pasid_supported(iommu)) {
1782 		if (ecap_prs(iommu->ecap))
1783 			intel_svm_finish_prq(iommu);
1784 	}
1785 	if (vccap_pasid(iommu->vccap))
1786 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1787 
1788 #endif
1789 }
1790 
1791 /*
1792  * Check and return whether first level is used by default for
1793  * DMA translation.
1794  */
1795 static bool first_level_by_default(unsigned int type)
1796 {
1797 	/* Only SL is available in legacy mode */
1798 	if (!scalable_mode_support())
1799 		return false;
1800 
1801 	/* Only one level (either FL or SL) is available, just use it */
1802 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1803 		return intel_cap_flts_sanity();
1804 
1805 	/* Both levels are available, decide it based on domain type */
1806 	return type != IOMMU_DOMAIN_UNMANAGED;
1807 }
1808 
1809 static struct dmar_domain *alloc_domain(unsigned int type)
1810 {
1811 	struct dmar_domain *domain;
1812 
1813 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1814 	if (!domain)
1815 		return NULL;
1816 
1817 	domain->nid = NUMA_NO_NODE;
1818 	if (first_level_by_default(type))
1819 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1820 	domain->has_iotlb_device = false;
1821 	INIT_LIST_HEAD(&domain->devices);
1822 
1823 	return domain;
1824 }
1825 
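/*
 * The first attachment of a domain to a given IOMMU allocates a domain id
 * from that IOMMU's domain_ids bitmap; further attachments only bump the
 * per-IOMMU refcount. domain_detach_iommu() undoes this.
 */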
1826 /* Must be called with iommu->lock */
1827 static int domain_attach_iommu(struct dmar_domain *domain,
1828 			       struct intel_iommu *iommu)
1829 {
1830 	unsigned long ndomains;
1831 	int num;
1832 
1833 	assert_spin_locked(&device_domain_lock);
1834 	assert_spin_locked(&iommu->lock);
1835 
1836 	domain->iommu_refcnt[iommu->seq_id] += 1;
1837 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1838 		ndomains = cap_ndoms(iommu->cap);
1839 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1840 
1841 		if (num >= ndomains) {
1842 			pr_err("%s: No free domain ids\n", iommu->name);
1843 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1844 			return -ENOSPC;
1845 		}
1846 
1847 		set_bit(num, iommu->domain_ids);
1848 		domain->iommu_did[iommu->seq_id] = num;
1849 		domain->nid			 = iommu->node;
1850 		domain_update_iommu_cap(domain);
1851 	}
1852 
1853 	return 0;
1854 }
1855 
1856 static void domain_detach_iommu(struct dmar_domain *domain,
1857 				struct intel_iommu *iommu)
1858 {
1859 	int num;
1860 
1861 	assert_spin_locked(&device_domain_lock);
1862 	assert_spin_locked(&iommu->lock);
1863 
1864 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1865 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1866 		num = domain->iommu_did[iommu->seq_id];
1867 		clear_bit(num, iommu->domain_ids);
1868 		domain_update_iommu_cap(domain);
1869 		domain->iommu_did[iommu->seq_id] = 0;
1870 	}
1871 }
1872 
1873 static inline int guestwidth_to_adjustwidth(int gaw)
1874 {
1875 	int agaw;
1876 	int r = (gaw - 12) % 9;
1877 
1878 	if (r == 0)
1879 		agaw = gaw;
1880 	else
1881 		agaw = gaw + 9 - r;
1882 	if (agaw > 64)
1883 		agaw = 64;
1884 	return agaw;
1885 }
1886 
1887 static void domain_exit(struct dmar_domain *domain)
1888 {
1889 
1890 	/* Remove associated devices and clear attached or cached domains */
1891 	domain_remove_dev_info(domain);
1892 
1893 	if (domain->pgd) {
1894 		LIST_HEAD(freelist);
1895 
1896 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1897 		put_pages_list(&freelist);
1898 	}
1899 
1900 	kfree(domain);
1901 }
1902 
1903 /*
1904  * Get the PASID directory size for scalable mode context entry.
1905  * Value of X in the PDTS field of a scalable mode context entry
1906  * indicates PASID directory with 2^(X + 7) entries.
1907  */
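/*
 * Worked example (a sketch, assuming PASID_PDE_SHIFT is 6): a 2^20-entry
 * PASID space gives max_pde = 2^14, whose first set bit is 14, so
 * pds = 14 - 7 = 7, i.e. a directory of 2^14 entries.
 */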
1908 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1909 {
1910 	unsigned long pds, max_pde;
1911 
1912 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1913 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1914 	if (pds < 7)
1915 		return 0;
1916 
1917 	return pds - 7;
1918 }
1919 
1920 /*
1921  * Set the RID_PASID field of a scalable mode context entry. The
1922  * IOMMU hardware will use the PASID value set in this field for
1923  * DMA translations of DMA requests without PASID.
1924  */
1925 static inline void
1926 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1927 {
1928 	context->hi |= pasid & ((1 << 20) - 1);
1929 }
1930 
1931 /*
1932  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1933  * entry.
1934  */
1935 static inline void context_set_sm_dte(struct context_entry *context)
1936 {
1937 	context->lo |= (1 << 2);
1938 }
1939 
1940 /*
1941  * Set the PRE(Page Request Enable) field of a scalable mode context
1942  * entry.
1943  */
1944 static inline void context_set_sm_pre(struct context_entry *context)
1945 {
1946 	context->lo |= (1 << 4);
1947 }
1948 
1949 /* Convert value to context PASID directory size field coding. */
1950 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1951 
1952 static int domain_context_mapping_one(struct dmar_domain *domain,
1953 				      struct intel_iommu *iommu,
1954 				      struct pasid_table *table,
1955 				      u8 bus, u8 devfn)
1956 {
1957 	u16 did = domain->iommu_did[iommu->seq_id];
1958 	int translation = CONTEXT_TT_MULTI_LEVEL;
1959 	struct device_domain_info *info = NULL;
1960 	struct context_entry *context;
1961 	unsigned long flags;
1962 	int ret;
1963 
1964 	WARN_ON(did == 0);
1965 
1966 	if (hw_pass_through && domain_type_is_si(domain))
1967 		translation = CONTEXT_TT_PASS_THROUGH;
1968 
1969 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1970 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1971 
1972 	BUG_ON(!domain->pgd);
1973 
1974 	spin_lock_irqsave(&device_domain_lock, flags);
1975 	spin_lock(&iommu->lock);
1976 
1977 	ret = -ENOMEM;
1978 	context = iommu_context_addr(iommu, bus, devfn, 1);
1979 	if (!context)
1980 		goto out_unlock;
1981 
1982 	ret = 0;
1983 	if (context_present(context))
1984 		goto out_unlock;
1985 
1986 	/*
1987 	 * For kdump cases, old valid entries may be cached due to the
1988 	 * in-flight DMA and copied pgtable, but there is no unmapping
1989 	 * behaviour for them, thus we need an explicit cache flush for
1990 	 * the newly-mapped device. For kdump, at this point, the device
1991 	 * is supposed to finish reset at its driver probe stage, so no
1992 	 * in-flight DMA will exist, and we don't need to worry anymore
1993 	 * hereafter.
1994 	 */
1995 	if (context_copied(context)) {
1996 		u16 did_old = context_domain_id(context);
1997 
1998 		if (did_old < cap_ndoms(iommu->cap)) {
1999 			iommu->flush.flush_context(iommu, did_old,
2000 						   (((u16)bus) << 8) | devfn,
2001 						   DMA_CCMD_MASK_NOBIT,
2002 						   DMA_CCMD_DEVICE_INVL);
2003 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2004 						 DMA_TLB_DSI_FLUSH);
2005 		}
2006 	}
2007 
2008 	context_clear_entry(context);
2009 
2010 	if (sm_supported(iommu)) {
2011 		unsigned long pds;
2012 
2013 		WARN_ON(!table);
2014 
2015 		/* Setup the PASID DIR pointer: */
2016 		pds = context_get_sm_pds(table);
2017 		context->lo = (u64)virt_to_phys(table->table) |
2018 				context_pdts(pds);
2019 
2020 		/* Setup the RID_PASID field: */
2021 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2022 
2023 		/*
2024 		 * Setup the Device-TLB enable bit and Page request
2025 		 * Enable bit:
2026 		 */
2027 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2028 		if (info && info->ats_supported)
2029 			context_set_sm_dte(context);
2030 		if (info && info->pri_supported)
2031 			context_set_sm_pre(context);
2032 	} else {
2033 		struct dma_pte *pgd = domain->pgd;
2034 		int agaw;
2035 
2036 		context_set_domain_id(context, did);
2037 
2038 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2039 			/*
2040 			 * Skip top levels of page tables for an IOMMU that has
2041 			 * a smaller agaw than the default. Unnecessary for PT mode.
2042 			 */
2043 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2044 				ret = -ENOMEM;
2045 				pgd = phys_to_virt(dma_pte_addr(pgd));
2046 				if (!dma_pte_present(pgd))
2047 					goto out_unlock;
2048 			}
2049 
2050 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2051 			if (info && info->ats_supported)
2052 				translation = CONTEXT_TT_DEV_IOTLB;
2053 			else
2054 				translation = CONTEXT_TT_MULTI_LEVEL;
2055 
2056 			context_set_address_root(context, virt_to_phys(pgd));
2057 			context_set_address_width(context, agaw);
2058 		} else {
2059 			/*
2060 			 * In pass through mode, AW must be programmed to
2061 			 * indicate the largest AGAW value supported by
2062 			 * hardware. And ASR is ignored by hardware.
2063 			 */
2064 			context_set_address_width(context, iommu->msagaw);
2065 		}
2066 
2067 		context_set_translation_type(context, translation);
2068 	}
2069 
2070 	context_set_fault_enable(context);
2071 	context_set_present(context);
2072 	if (!ecap_coherent(iommu->ecap))
2073 		clflush_cache_range(context, sizeof(*context));
2074 
2075 	/*
2076 	 * It's a non-present to present mapping. If hardware doesn't cache
2077 	 * non-present entries, we only need to flush the write-buffer. If it
2078 	 * _does_ cache non-present entries, then it does so in the special
2079 	 * domain #0, which we have to flush:
2080 	 */
2081 	if (cap_caching_mode(iommu->cap)) {
2082 		iommu->flush.flush_context(iommu, 0,
2083 					   (((u16)bus) << 8) | devfn,
2084 					   DMA_CCMD_MASK_NOBIT,
2085 					   DMA_CCMD_DEVICE_INVL);
2086 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2087 	} else {
2088 		iommu_flush_write_buffer(iommu);
2089 	}
2090 	iommu_enable_dev_iotlb(info);
2091 
2092 	ret = 0;
2093 
2094 out_unlock:
2095 	spin_unlock(&iommu->lock);
2096 	spin_unlock_irqrestore(&device_domain_lock, flags);
2097 
2098 	return ret;
2099 }
2100 
2101 struct domain_context_mapping_data {
2102 	struct dmar_domain *domain;
2103 	struct intel_iommu *iommu;
2104 	struct pasid_table *table;
2105 };
2106 
2107 static int domain_context_mapping_cb(struct pci_dev *pdev,
2108 				     u16 alias, void *opaque)
2109 {
2110 	struct domain_context_mapping_data *data = opaque;
2111 
2112 	return domain_context_mapping_one(data->domain, data->iommu,
2113 					  data->table, PCI_BUS_NUM(alias),
2114 					  alias & 0xff);
2115 }
2116 
2117 static int
2118 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2119 {
2120 	struct domain_context_mapping_data data;
2121 	struct pasid_table *table;
2122 	struct intel_iommu *iommu;
2123 	u8 bus, devfn;
2124 
2125 	iommu = device_to_iommu(dev, &bus, &devfn);
2126 	if (!iommu)
2127 		return -ENODEV;
2128 
2129 	table = intel_pasid_get_table(dev);
2130 
2131 	if (!dev_is_pci(dev))
2132 		return domain_context_mapping_one(domain, iommu, table,
2133 						  bus, devfn);
2134 
2135 	data.domain = domain;
2136 	data.iommu = iommu;
2137 	data.table = table;
2138 
2139 	return pci_for_each_dma_alias(to_pci_dev(dev),
2140 				      &domain_context_mapping_cb, &data);
2141 }
2142 
2143 static int domain_context_mapped_cb(struct pci_dev *pdev,
2144 				    u16 alias, void *opaque)
2145 {
2146 	struct intel_iommu *iommu = opaque;
2147 
2148 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2149 }
2150 
2151 static int domain_context_mapped(struct device *dev)
2152 {
2153 	struct intel_iommu *iommu;
2154 	u8 bus, devfn;
2155 
2156 	iommu = device_to_iommu(dev, &bus, &devfn);
2157 	if (!iommu)
2158 		return -ENODEV;
2159 
2160 	if (!dev_is_pci(dev))
2161 		return device_context_mapped(iommu, bus, devfn);
2162 
2163 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2164 				       domain_context_mapped_cb, iommu);
2165 }
2166 
2167 /* Return the number of VT-d pages covering the buffer, rounded up to the MM page size. */
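/*
 * For example, assuming 4KiB MM and VT-d pages, mapping 8KiB starting
 * 1KiB into a page spans 1KiB + 8KiB = 9KiB, which rounds up to three
 * 4KiB VT-d pages.
 */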
2168 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2169 					    size_t size)
2170 {
2171 	host_addr &= ~PAGE_MASK;
2172 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2173 }
2174 
2175 /* Return largest possible superpage level for a given mapping */
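/*
 * For example, with the usual 9-bit stride (VTD_STRIDE_SHIFT == 9), a
 * mapping whose IOVA and physical PFNs are both aligned to 512 pages
 * and which covers at least 512 pages can use level 2, i.e. a 2MiB
 * superpage, provided the hardware reports support for it.
 */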
2176 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2177 					  unsigned long iov_pfn,
2178 					  unsigned long phy_pfn,
2179 					  unsigned long pages)
2180 {
2181 	int support, level = 1;
2182 	unsigned long pfnmerge;
2183 
2184 	support = domain->iommu_superpage;
2185 
2186 	/* To use a large page, the virtual *and* physical addresses
2187 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2188 	   of them will mean we have to use smaller pages. So just
2189 	   merge them and check both at once. */
2190 	pfnmerge = iov_pfn | phy_pfn;
2191 
2192 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2193 		pages >>= VTD_STRIDE_SHIFT;
2194 		if (!pages)
2195 			break;
2196 		pfnmerge >>= VTD_STRIDE_SHIFT;
2197 		level++;
2198 		support--;
2199 	}
2200 	return level;
2201 }
2202 
2203 /*
2204  * Ensure that old small page tables are removed to make room for superpage(s).
2205  * We're going to add new large pages, so make sure we don't remove their parent
2206  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2207  */
2208 static void switch_to_super_page(struct dmar_domain *domain,
2209 				 unsigned long start_pfn,
2210 				 unsigned long end_pfn, int level)
2211 {
2212 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2213 	struct dma_pte *pte = NULL;
2214 	int i;
2215 
2216 	while (start_pfn <= end_pfn) {
2217 		if (!pte)
2218 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2219 
2220 		if (dma_pte_present(pte)) {
2221 			dma_pte_free_pagetable(domain, start_pfn,
2222 					       start_pfn + lvl_pages - 1,
2223 					       level + 1);
2224 
2225 			for_each_domain_iommu(i, domain)
2226 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2227 						      start_pfn, lvl_pages,
2228 						      0, 0);
2229 		}
2230 
2231 		pte++;
2232 		start_pfn += lvl_pages;
2233 		if (first_pte_in_page(pte))
2234 			pte = NULL;
2235 	}
2236 }
2237 
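/*
 * Map nr_pages contiguous VT-d pages starting at phys_pfn into the
 * domain's page table at iov_pfn, using superpages where alignment and
 * the remaining size allow. Callers guarantee that nobody else touches
 * the IOVA range concurrently.
 */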
2238 static int
2239 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2240 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2241 {
2242 	struct dma_pte *first_pte = NULL, *pte = NULL;
2243 	unsigned int largepage_lvl = 0;
2244 	unsigned long lvl_pages = 0;
2245 	phys_addr_t pteval;
2246 	u64 attr;
2247 
2248 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2249 
2250 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2251 		return -EINVAL;
2252 
2253 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2254 	attr |= DMA_FL_PTE_PRESENT;
2255 	if (domain_use_first_level(domain)) {
2256 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2257 		if (prot & DMA_PTE_WRITE)
2258 			attr |= DMA_FL_PTE_DIRTY;
2259 	}
2260 
2261 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2262 
2263 	while (nr_pages > 0) {
2264 		uint64_t tmp;
2265 
2266 		if (!pte) {
2267 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2268 					phys_pfn, nr_pages);
2269 
2270 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2271 			if (!pte)
2272 				return -ENOMEM;
2273 			first_pte = pte;
2274 
2275 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2276 
2277 			/* It is a large page */
2278 			if (largepage_lvl > 1) {
2279 				unsigned long end_pfn;
2280 				unsigned long pages_to_remove;
2281 
2282 				pteval |= DMA_PTE_LARGE_PAGE;
2283 				pages_to_remove = min_t(unsigned long, nr_pages,
2284 							nr_pte_to_next_page(pte) * lvl_pages);
2285 				end_pfn = iov_pfn + pages_to_remove - 1;
2286 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2287 			} else {
2288 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289 			}
2290 
2291 		}
2292 		/* We don't need a lock here; nobody else
2293 		 * touches this IOVA range.
2294 		 */
2295 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296 		if (tmp) {
2297 			static int dumps = 5;
2298 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299 				iov_pfn, tmp, (unsigned long long)pteval);
2300 			if (dumps) {
2301 				dumps--;
2302 				debug_dma_dump_mappings(NULL);
2303 			}
2304 			WARN_ON(1);
2305 		}
2306 
2307 		nr_pages -= lvl_pages;
2308 		iov_pfn += lvl_pages;
2309 		phys_pfn += lvl_pages;
2310 		pteval += lvl_pages * VTD_PAGE_SIZE;
2311 
2312 		/* If the next PTE would be the first in a new page, then we
2313 		 * need to flush the cache on the entries we've just written.
2314 		 * And then we'll need to recalculate 'pte', so clear it and
2315 		 * let it get set again in the if (!pte) block above.
2316 		 *
2317 		 * If we're done (!nr_pages) we need to flush the cache too.
2318 		 *
2319 		 * Also if we've been setting superpages, we may need to
2320 		 * recalculate 'pte' and switch back to smaller pages for the
2321 		 * end of the mapping, if the trailing size is not enough to
2322 		 * use another superpage (i.e. nr_pages < lvl_pages).
2323 		 */
2324 		pte++;
2325 		if (!nr_pages || first_pte_in_page(pte) ||
2326 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2327 			domain_flush_cache(domain, first_pte,
2328 					   (void *)pte - (void *)first_pte);
2329 			pte = NULL;
2330 		}
2331 	}
2332 
2333 	return 0;
2334 }
2335 
2336 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2337 {
2338 	struct intel_iommu *iommu = info->iommu;
2339 	struct context_entry *context;
2340 	unsigned long flags;
2341 	u16 did_old;
2342 
2343 	if (!iommu)
2344 		return;
2345 
2346 	spin_lock_irqsave(&iommu->lock, flags);
2347 	context = iommu_context_addr(iommu, bus, devfn, 0);
2348 	if (!context) {
2349 		spin_unlock_irqrestore(&iommu->lock, flags);
2350 		return;
2351 	}
2352 
2353 	if (sm_supported(iommu)) {
2354 		if (hw_pass_through && domain_type_is_si(info->domain))
2355 			did_old = FLPT_DEFAULT_DID;
2356 		else
2357 			did_old = info->domain->iommu_did[iommu->seq_id];
2358 	} else {
2359 		did_old = context_domain_id(context);
2360 	}
2361 
2362 	context_clear_entry(context);
2363 	__iommu_flush_cache(iommu, context, sizeof(*context));
2364 	spin_unlock_irqrestore(&iommu->lock, flags);
2365 	iommu->flush.flush_context(iommu,
2366 				   did_old,
2367 				   (((u16)bus) << 8) | devfn,
2368 				   DMA_CCMD_MASK_NOBIT,
2369 				   DMA_CCMD_DEVICE_INVL);
2370 
2371 	if (sm_supported(iommu))
2372 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2373 
2374 	iommu->flush.flush_iotlb(iommu,
2375 				 did_old,
2376 				 0,
2377 				 0,
2378 				 DMA_TLB_DSI_FLUSH);
2379 
2380 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2381 }
2382 
2383 static void domain_remove_dev_info(struct dmar_domain *domain)
2384 {
2385 	struct device_domain_info *info, *tmp;
2386 	unsigned long flags;
2387 
2388 	spin_lock_irqsave(&device_domain_lock, flags);
2389 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2390 		__dmar_remove_one_dev_info(info);
2391 	spin_unlock_irqrestore(&device_domain_lock, flags);
2392 }
2393 
2394 static inline struct device_domain_info *
2395 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2396 {
2397 	struct device_domain_info *info;
2398 
2399 	list_for_each_entry(info, &device_domain_list, global)
2400 		if (info->segment == segment && info->bus == bus &&
2401 		    info->devfn == devfn)
2402 			return info;
2403 
2404 	return NULL;
2405 }
2406 
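/*
 * Program a scalable-mode PASID entry for first-level translation,
 * skipping unused upper page-table levels when the IOMMU supports a
 * smaller agaw than the domain and selecting 4- or 5-level paging
 * accordingly.
 */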
2407 static int domain_setup_first_level(struct intel_iommu *iommu,
2408 				    struct dmar_domain *domain,
2409 				    struct device *dev,
2410 				    u32 pasid)
2411 {
2412 	struct dma_pte *pgd = domain->pgd;
2413 	int agaw, level;
2414 	int flags = 0;
2415 
2416 	/*
2417 	 * Skip top levels of page tables for an IOMMU that has
2418 	 * a smaller agaw than the default. Unnecessary for PT mode.
2419 	 */
2420 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2421 		pgd = phys_to_virt(dma_pte_addr(pgd));
2422 		if (!dma_pte_present(pgd))
2423 			return -ENOMEM;
2424 	}
2425 
2426 	level = agaw_to_level(agaw);
2427 	if (level != 4 && level != 5)
2428 		return -EINVAL;
2429 
2430 	if (pasid != PASID_RID2PASID)
2431 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2432 	if (level == 5)
2433 		flags |= PASID_FLAG_FL5LP;
2434 
2435 	if (domain->force_snooping)
2436 		flags |= PASID_FLAG_PAGE_SNOOP;
2437 
2438 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2439 					     domain->iommu_did[iommu->seq_id],
2440 					     flags);
2441 }
2442 
2443 static bool dev_is_real_dma_subdevice(struct device *dev)
2444 {
2445 	return dev && dev_is_pci(dev) &&
2446 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2447 }
2448 
2449 static int iommu_domain_identity_map(struct dmar_domain *domain,
2450 				     unsigned long first_vpfn,
2451 				     unsigned long last_vpfn)
2452 {
2453 	/*
2454 	 * RMRR range might have overlap with physical memory range,
2455 	 * clear it first
2456 	 */
2457 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2458 
2459 	return __domain_mapping(domain, first_vpfn,
2460 				first_vpfn, last_vpfn - first_vpfn + 1,
2461 				DMA_PTE_READ|DMA_PTE_WRITE);
2462 }
2463 
2464 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2465 
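/*
 * Build the static identity (si) domain: unless hardware pass-through
 * is in use, identity-map all usable system memory and every RMRR
 * region so that devices attached to it keep working.
 */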
2466 static int __init si_domain_init(int hw)
2467 {
2468 	struct dmar_rmrr_unit *rmrr;
2469 	struct device *dev;
2470 	int i, nid, ret;
2471 
2472 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2473 	if (!si_domain)
2474 		return -EFAULT;
2475 
2476 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2477 		domain_exit(si_domain);
2478 		return -EFAULT;
2479 	}
2480 
2481 	if (hw)
2482 		return 0;
2483 
2484 	for_each_online_node(nid) {
2485 		unsigned long start_pfn, end_pfn;
2486 		int i;
2487 
2488 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2489 			ret = iommu_domain_identity_map(si_domain,
2490 					mm_to_dma_pfn(start_pfn),
2491 					mm_to_dma_pfn(end_pfn));
2492 			if (ret)
2493 				return ret;
2494 		}
2495 	}
2496 
2497 	/*
2498 	 * Identity map the RMRRs so that devices with RMRRs can also use
2499 	 * the si_domain.
2500 	 */
2501 	for_each_rmrr_units(rmrr) {
2502 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2503 					  i, dev) {
2504 			unsigned long long start = rmrr->base_address;
2505 			unsigned long long end = rmrr->end_address;
2506 
2507 			if (WARN_ON(end < start ||
2508 				    end >> agaw_to_width(si_domain->agaw)))
2509 				continue;
2510 
2511 			ret = iommu_domain_identity_map(si_domain,
2512 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2513 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2514 			if (ret)
2515 				return ret;
2516 		}
2517 	}
2518 
2519 	return 0;
2520 }
2521 
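/*
 * Attach @dev to @domain: take a reference on the device's IOMMU,
 * allocate the PASID table and set up the RID2PASID entry in scalable
 * mode, then program the context entry.
 */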
2522 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2523 {
2524 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2525 	struct intel_iommu *iommu;
2526 	unsigned long flags;
2527 	u8 bus, devfn;
2528 	int ret;
2529 
2530 	iommu = device_to_iommu(dev, &bus, &devfn);
2531 	if (!iommu)
2532 		return -ENODEV;
2533 
2534 	spin_lock_irqsave(&device_domain_lock, flags);
2535 	info->domain = domain;
2536 	spin_lock(&iommu->lock);
2537 	ret = domain_attach_iommu(domain, iommu);
2538 	spin_unlock(&iommu->lock);
2539 	if (ret) {
2540 		spin_unlock_irqrestore(&device_domain_lock, flags);
2541 		return ret;
2542 	}
2543 	list_add(&info->link, &domain->devices);
2544 	spin_unlock_irqrestore(&device_domain_lock, flags);
2545 
2546 	/* PASID table is mandatory for a PCI device in scalable mode. */
2547 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2548 		ret = intel_pasid_alloc_table(dev);
2549 		if (ret) {
2550 			dev_err(dev, "PASID table allocation failed\n");
2551 			dmar_remove_one_dev_info(dev);
2552 			return ret;
2553 		}
2554 
2555 		/* Setup the PASID entry for requests without PASID: */
2556 		spin_lock_irqsave(&iommu->lock, flags);
2557 		if (hw_pass_through && domain_type_is_si(domain))
2558 			ret = intel_pasid_setup_pass_through(iommu, domain,
2559 					dev, PASID_RID2PASID);
2560 		else if (domain_use_first_level(domain))
2561 			ret = domain_setup_first_level(iommu, domain, dev,
2562 					PASID_RID2PASID);
2563 		else
2564 			ret = intel_pasid_setup_second_level(iommu, domain,
2565 					dev, PASID_RID2PASID);
2566 		spin_unlock_irqrestore(&iommu->lock, flags);
2567 		if (ret) {
2568 			dev_err(dev, "Setup RID2PASID failed\n");
2569 			dmar_remove_one_dev_info(dev);
2570 			return ret;
2571 		}
2572 	}
2573 
2574 	ret = domain_context_mapping(domain, dev);
2575 	if (ret) {
2576 		dev_err(dev, "Domain context map failed\n");
2577 		dmar_remove_one_dev_info(dev);
2578 		return ret;
2579 	}
2580 
2581 	return 0;
2582 }
2583 
2584 static bool device_has_rmrr(struct device *dev)
2585 {
2586 	struct dmar_rmrr_unit *rmrr;
2587 	struct device *tmp;
2588 	int i;
2589 
2590 	rcu_read_lock();
2591 	for_each_rmrr_units(rmrr) {
2592 		/*
2593 		 * Return TRUE if this RMRR contains the device that
2594 		 * is passed in.
2595 		 */
2596 		for_each_active_dev_scope(rmrr->devices,
2597 					  rmrr->devices_cnt, i, tmp)
2598 			if (tmp == dev ||
2599 			    is_downstream_to_pci_bridge(dev, tmp)) {
2600 				rcu_read_unlock();
2601 				return true;
2602 			}
2603 	}
2604 	rcu_read_unlock();
2605 	return false;
2606 }
2607 
2608 /**
2609  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2610  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2611  * @dev: device handle
2612  *
2613  * We assume that PCI USB devices with RMRRs have them largely
2614  * for historical reasons and that the RMRR space is not actively used post
2615  * boot.  This exclusion may change if vendors begin to abuse it.
2616  *
2617  * The same exception is made for graphics devices, with the requirement that
2618  * any use of the RMRR regions will be torn down before assigning the device
2619  * to a guest.
2620  *
2621  * Return: true if the RMRR is relaxable, false otherwise
2622  */
2623 static bool device_rmrr_is_relaxable(struct device *dev)
2624 {
2625 	struct pci_dev *pdev;
2626 
2627 	if (!dev_is_pci(dev))
2628 		return false;
2629 
2630 	pdev = to_pci_dev(dev);
2631 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2632 		return true;
2633 	else
2634 		return false;
2635 }
2636 
2637 /*
2638  * There are a couple of cases where we need to restrict the functionality of
2639  * devices associated with RMRRs.  The first is when evaluating a device for
2640  * identity mapping because problems exist when devices are moved in and out
2641  * of domains and their respective RMRR information is lost.  This means that
2642  * a device with associated RMRRs will never be in a "passthrough" domain.
2643  * The second is use of the device through the IOMMU API.  This interface
2644  * expects to have full control of the IOVA space for the device.  We cannot
2645  * satisfy both the requirement that RMRR access is maintained and have an
2646  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2647  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2648  * We therefore prevent devices associated with an RMRR from participating in
2649  * the IOMMU API, which eliminates them from device assignment.
2650  *
2651  * In both cases, devices which have relaxable RMRRs are not concerned by this
2652  * restriction. See device_rmrr_is_relaxable comment.
2653  */
2654 static bool device_is_rmrr_locked(struct device *dev)
2655 {
2656 	if (!device_has_rmrr(dev))
2657 		return false;
2658 
2659 	if (device_rmrr_is_relaxable(dev))
2660 		return false;
2661 
2662 	return true;
2663 }
2664 
2665 /*
2666  * Return the required default domain type for a specific device.
2667  *
2668  * @dev: the device in question
2670  *
2671  * Returns:
2672  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2673  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2674  *  - 0: both identity and dynamic domains work for this device
2675  */
2676 static int device_def_domain_type(struct device *dev)
2677 {
2678 	if (dev_is_pci(dev)) {
2679 		struct pci_dev *pdev = to_pci_dev(dev);
2680 
2681 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2682 			return IOMMU_DOMAIN_IDENTITY;
2683 
2684 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2685 			return IOMMU_DOMAIN_IDENTITY;
2686 	}
2687 
2688 	return 0;
2689 }
2690 
2691 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2692 {
2693 	/*
2694 	 * Start from a sane IOMMU hardware state.
2695 	 * If queued invalidation was already initialized by us
2696 	 * (for example, while enabling interrupt remapping), then
2697 	 * things are already rolling from a sane state.
2698 	 */
2699 	if (!iommu->qi) {
2700 		/*
2701 		 * Clear any previous faults.
2702 		 */
2703 		dmar_fault(-1, iommu);
2704 		/*
2705 		 * Disable queued invalidation if supported and already enabled
2706 		 * before OS handover.
2707 		 */
2708 		dmar_disable_qi(iommu);
2709 	}
2710 
2711 	if (dmar_enable_qi(iommu)) {
2712 		/*
2713 		 * Queued invalidation is not enabled; use register-based invalidation.
2714 		 */
2715 		iommu->flush.flush_context = __iommu_flush_context;
2716 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2717 		pr_info("%s: Using Register based invalidation\n",
2718 			iommu->name);
2719 	} else {
2720 		iommu->flush.flush_context = qi_flush_context;
2721 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2722 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2723 	}
2724 }
2725 
2726 static int copy_context_table(struct intel_iommu *iommu,
2727 			      struct root_entry *old_re,
2728 			      struct context_entry **tbl,
2729 			      int bus, bool ext)
2730 {
2731 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2732 	struct context_entry *new_ce = NULL, ce;
2733 	struct context_entry *old_ce = NULL;
2734 	struct root_entry re;
2735 	phys_addr_t old_ce_phys;
2736 
2737 	tbl_idx = ext ? bus * 2 : bus;
2738 	memcpy(&re, old_re, sizeof(re));
2739 
2740 	for (devfn = 0; devfn < 256; devfn++) {
2741 		/* First calculate the correct index */
2742 		idx = (ext ? devfn * 2 : devfn) % 256;
2743 
2744 		if (idx == 0) {
2745 			/* First save what we may have and clean up */
2746 			if (new_ce) {
2747 				tbl[tbl_idx] = new_ce;
2748 				__iommu_flush_cache(iommu, new_ce,
2749 						    VTD_PAGE_SIZE);
2750 				pos = 1;
2751 			}
2752 
2753 			if (old_ce)
2754 				memunmap(old_ce);
2755 
2756 			ret = 0;
2757 			if (devfn < 0x80)
2758 				old_ce_phys = root_entry_lctp(&re);
2759 			else
2760 				old_ce_phys = root_entry_uctp(&re);
2761 
2762 			if (!old_ce_phys) {
2763 				if (ext && devfn == 0) {
2764 					/* No LCTP, try UCTP */
2765 					devfn = 0x7f;
2766 					continue;
2767 				} else {
2768 					goto out;
2769 				}
2770 			}
2771 
2772 			ret = -ENOMEM;
2773 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2774 					MEMREMAP_WB);
2775 			if (!old_ce)
2776 				goto out;
2777 
2778 			new_ce = alloc_pgtable_page(iommu->node);
2779 			if (!new_ce)
2780 				goto out_unmap;
2781 
2782 			ret = 0;
2783 		}
2784 
2785 		/* Now copy the context entry */
2786 		memcpy(&ce, old_ce + idx, sizeof(ce));
2787 
2788 		if (!__context_present(&ce))
2789 			continue;
2790 
2791 		did = context_domain_id(&ce);
2792 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2793 			set_bit(did, iommu->domain_ids);
2794 
2795 		/*
2796 		 * We need a marker for copied context entries. This
2797 		 * marker needs to work for the old format as well as
2798 		 * for extended context entries.
2799 		 *
2800 		 * Bit 67 of the context entry is used. In the old
2801 		 * format this bit is available to software, in the
2802 		 * extended format it is the PGE bit, but PGE is ignored
2803 		 * by HW if PASIDs are disabled (and thus still
2804 		 * available).
2805 		 *
2806 		 * So disable PASIDs first and then mark the entry
2807 		 * copied. This means that we don't copy PASID
2808 		 * translations from the old kernel, but this is fine as
2809 		 * faults there are not fatal.
2810 		 */
2811 		context_clear_pasid_enable(&ce);
2812 		context_set_copied(&ce);
2813 
2814 		new_ce[idx] = ce;
2815 	}
2816 
2817 	tbl[tbl_idx + pos] = new_ce;
2818 
2819 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2820 
2821 out_unmap:
2822 	memunmap(old_ce);
2823 
2824 out:
2825 	return ret;
2826 }
2827 
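/*
 * Copy the root/context tables programmed by the previous kernel (the
 * kdump case) so translation can remain enabled while this kernel
 * takes over, avoiding a window for DMA corruption.
 */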
2828 static int copy_translation_tables(struct intel_iommu *iommu)
2829 {
2830 	struct context_entry **ctxt_tbls;
2831 	struct root_entry *old_rt;
2832 	phys_addr_t old_rt_phys;
2833 	int ctxt_table_entries;
2834 	unsigned long flags;
2835 	u64 rtaddr_reg;
2836 	int bus, ret;
2837 	bool new_ext, ext;
2838 
2839 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2840 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2841 	new_ext    = !!ecap_ecs(iommu->ecap);
2842 
2843 	/*
2844 	 * The RTT bit can only be changed when translation is disabled,
2845 	 * but disabling translation would open a window for data
2846 	 * corruption. So bail out and don't copy anything if we would
2847 	 * have to change the bit.
2848 	 */
2849 	if (new_ext != ext)
2850 		return -EINVAL;
2851 
2852 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2853 	if (!old_rt_phys)
2854 		return -EINVAL;
2855 
2856 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2857 	if (!old_rt)
2858 		return -ENOMEM;
2859 
2860 	/* This is too big for the stack - allocate it from slab */
2861 	ctxt_table_entries = ext ? 512 : 256;
2862 	ret = -ENOMEM;
2863 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2864 	if (!ctxt_tbls)
2865 		goto out_unmap;
2866 
2867 	for (bus = 0; bus < 256; bus++) {
2868 		ret = copy_context_table(iommu, &old_rt[bus],
2869 					 ctxt_tbls, bus, ext);
2870 		if (ret) {
2871 			pr_err("%s: Failed to copy context table for bus %d\n",
2872 				iommu->name, bus);
2873 			continue;
2874 		}
2875 	}
2876 
2877 	spin_lock_irqsave(&iommu->lock, flags);
2878 
2879 	/* Context tables are copied, now write them to the root_entry table */
2880 	for (bus = 0; bus < 256; bus++) {
2881 		int idx = ext ? bus * 2 : bus;
2882 		u64 val;
2883 
2884 		if (ctxt_tbls[idx]) {
2885 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2886 			iommu->root_entry[bus].lo = val;
2887 		}
2888 
2889 		if (!ext || !ctxt_tbls[idx + 1])
2890 			continue;
2891 
2892 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2893 		iommu->root_entry[bus].hi = val;
2894 	}
2895 
2896 	spin_unlock_irqrestore(&iommu->lock, flags);
2897 
2898 	kfree(ctxt_tbls);
2899 
2900 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2901 
2902 	ret = 0;
2903 
2904 out_unmap:
2905 	memunmap(old_rt);
2906 
2907 	return ret;
2908 }
2909 
2910 #ifdef CONFIG_INTEL_IOMMU_SVM
2911 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2912 {
2913 	struct intel_iommu *iommu = data;
2914 	ioasid_t ioasid;
2915 
2916 	if (!iommu)
2917 		return INVALID_IOASID;
2918 	/*
2919 	 * The VT-d virtual command interface always uses the full 20-bit
2920 	 * PASID range. The host can partition the guest PASID range based
2921 	 * on policies, but that is out of the guest's control.
2922 	 */
2923 	if (min < PASID_MIN || max > intel_pasid_max_id)
2924 		return INVALID_IOASID;
2925 
2926 	if (vcmd_alloc_pasid(iommu, &ioasid))
2927 		return INVALID_IOASID;
2928 
2929 	return ioasid;
2930 }
2931 
2932 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2933 {
2934 	struct intel_iommu *iommu = data;
2935 
2936 	if (!iommu)
2937 		return;
2938 	/*
2939 	 * The sanity check of the ioasid owner is done at the upper layer, e.g. VFIO.
2940 	 * We can only free the PASID when all the devices are unbound.
2941 	 */
2942 	if (ioasid_find(NULL, ioasid, NULL)) {
2943 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2944 		return;
2945 	}
2946 	vcmd_free_pasid(iommu, ioasid);
2947 }
2948 
2949 static void register_pasid_allocator(struct intel_iommu *iommu)
2950 {
2951 	/*
2952 	 * If we are running in the host, there is no need for a custom
2953 	 * allocator, as PASIDs are allocated from the host system-wide.
2954 	 */
2955 	if (!cap_caching_mode(iommu->cap))
2956 		return;
2957 
2958 	if (!sm_supported(iommu)) {
2959 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2960 		return;
2961 	}
2962 
2963 	/*
2964 	 * Register a custom PASID allocator if we are running in a guest,
2965 	 * where guest PASIDs must be obtained via the virtual command interface.
2966 	 * There can be multiple vIOMMUs in each guest but only one allocator
2967 	 * is active. All vIOMMU allocators will eventually be calling the same
2968 	 * host allocator.
2969 	 */
2970 	if (!vccap_pasid(iommu->vccap))
2971 		return;
2972 
2973 	pr_info("Register custom PASID allocator\n");
2974 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2975 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2976 	iommu->pasid_allocator.pdata = (void *)iommu;
2977 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2978 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2979 		/*
2980 		 * Disable scalable mode on this IOMMU if there
2981 		 * is no custom allocator. Mixing SM-capable and
2982 		 * non-SM vIOMMUs is not supported.
2983 		 */
2984 		intel_iommu_sm = 0;
2985 	}
2986 }
2987 #endif
2988 
2989 static int __init init_dmars(void)
2990 {
2991 	struct dmar_drhd_unit *drhd;
2992 	struct intel_iommu *iommu;
2993 	int ret;
2994 
2995 	/*
2996 	 * for each drhd
2997 	 *    allocate root
2998 	 *    initialize and program root entry to not present
2999 	 * endfor
3000 	 */
3001 	for_each_drhd_unit(drhd) {
3002 		/*
3003 		 * No lock is needed as this is only incremented in the
3004 		 * single-threaded kernel __init code path; all other
3005 		 * accesses are read-only.
3006 		 */
3007 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3008 			g_num_of_iommus++;
3009 			continue;
3010 		}
3011 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3012 	}
3013 
3014 	/* Preallocate enough resources for IOMMU hot-addition */
3015 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3016 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3017 
3018 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3019 			GFP_KERNEL);
3020 	if (!g_iommus) {
3021 		ret = -ENOMEM;
3022 		goto error;
3023 	}
3024 
3025 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3026 	if (ret)
3027 		goto free_iommu;
3028 
3029 	for_each_iommu(iommu, drhd) {
3030 		if (drhd->ignored) {
3031 			iommu_disable_translation(iommu);
3032 			continue;
3033 		}
3034 
3035 		/*
3036 		 * Find the max PASID size of all IOMMUs in the system.
3037 		 * We need to ensure the system PASID table is no bigger
3038 		 * than the smallest supported size.
3039 		 */
3040 		if (pasid_supported(iommu)) {
3041 			u32 temp = 2 << ecap_pss(iommu->ecap);
3042 
3043 			intel_pasid_max_id = min_t(u32, temp,
3044 						   intel_pasid_max_id);
3045 		}
3046 
3047 		g_iommus[iommu->seq_id] = iommu;
3048 
3049 		intel_iommu_init_qi(iommu);
3050 
3051 		ret = iommu_init_domains(iommu);
3052 		if (ret)
3053 			goto free_iommu;
3054 
3055 		init_translation_status(iommu);
3056 
3057 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3058 			iommu_disable_translation(iommu);
3059 			clear_translation_pre_enabled(iommu);
3060 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3061 				iommu->name);
3062 		}
3063 
3064 		/*
3065 		 * TBD:
3066 		 * we could share the same root & context tables
3067 		 * among all IOMMUs. Need to split this later.
3068 		 */
3069 		ret = iommu_alloc_root_entry(iommu);
3070 		if (ret)
3071 			goto free_iommu;
3072 
3073 		if (translation_pre_enabled(iommu)) {
3074 			pr_info("Translation already enabled - trying to copy translation structures\n");
3075 
3076 			ret = copy_translation_tables(iommu);
3077 			if (ret) {
3078 				/*
3079 				 * We found the IOMMU with translation
3080 				 * enabled - but failed to copy over the
3081 				 * old root-entry table. Try to proceed
3082 				 * by disabling translation now and
3083 				 * allocating a clean root-entry table.
3084 				 * This might cause DMAR faults, but
3085 				 * probably the dump will still succeed.
3086 				 */
3087 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3088 				       iommu->name);
3089 				iommu_disable_translation(iommu);
3090 				clear_translation_pre_enabled(iommu);
3091 			} else {
3092 				pr_info("Copied translation tables from previous kernel for %s\n",
3093 					iommu->name);
3094 			}
3095 		}
3096 
3097 		if (!ecap_pass_through(iommu->ecap))
3098 			hw_pass_through = 0;
3099 		intel_svm_check(iommu);
3100 	}
3101 
3102 	/*
3103 	 * Now that qi is enabled on all iommus, set the root entry and flush
3104 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3105 	 * flush_context function will loop forever and the boot hangs.
3106 	 */
3107 	for_each_active_iommu(iommu, drhd) {
3108 		iommu_flush_write_buffer(iommu);
3109 #ifdef CONFIG_INTEL_IOMMU_SVM
3110 		register_pasid_allocator(iommu);
3111 #endif
3112 		iommu_set_root_entry(iommu);
3113 	}
3114 
3115 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3116 	dmar_map_gfx = 0;
3117 #endif
3118 
3119 	if (!dmar_map_gfx)
3120 		iommu_identity_mapping |= IDENTMAP_GFX;
3121 
3122 	check_tylersburg_isoch();
3123 
3124 	ret = si_domain_init(hw_pass_through);
3125 	if (ret)
3126 		goto free_iommu;
3127 
3128 	/*
3129 	 * for each drhd
3130 	 *   enable fault log
3131 	 *   global invalidate context cache
3132 	 *   global invalidate iotlb
3133 	 *   enable translation
3134 	 */
3135 	for_each_iommu(iommu, drhd) {
3136 		if (drhd->ignored) {
3137 			/*
3138 			 * we always have to disable PMRs or DMA may fail on
3139 			 * this device
3140 			 */
3141 			if (force_on)
3142 				iommu_disable_protect_mem_regions(iommu);
3143 			continue;
3144 		}
3145 
3146 		iommu_flush_write_buffer(iommu);
3147 
3148 #ifdef CONFIG_INTEL_IOMMU_SVM
3149 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3150 			/*
3151 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3152 			 * could cause a lock race condition.
3153 			 */
3154 			up_write(&dmar_global_lock);
3155 			ret = intel_svm_enable_prq(iommu);
3156 			down_write(&dmar_global_lock);
3157 			if (ret)
3158 				goto free_iommu;
3159 		}
3160 #endif
3161 		ret = dmar_set_interrupt(iommu);
3162 		if (ret)
3163 			goto free_iommu;
3164 	}
3165 
3166 	return 0;
3167 
3168 free_iommu:
3169 	for_each_active_iommu(iommu, drhd) {
3170 		disable_dmar_iommu(iommu);
3171 		free_dmar_iommu(iommu);
3172 	}
3173 
3174 	kfree(g_iommus);
3175 
3176 error:
3177 	return ret;
3178 }
3179 
3180 static void __init init_no_remapping_devices(void)
3181 {
3182 	struct dmar_drhd_unit *drhd;
3183 	struct device *dev;
3184 	int i;
3185 
3186 	for_each_drhd_unit(drhd) {
3187 		if (!drhd->include_all) {
3188 			for_each_active_dev_scope(drhd->devices,
3189 						  drhd->devices_cnt, i, dev)
3190 				break;
3191 			/* ignore DMAR unit if no devices exist */
3192 			if (i == drhd->devices_cnt)
3193 				drhd->ignored = 1;
3194 		}
3195 	}
3196 
3197 	for_each_active_drhd_unit(drhd) {
3198 		if (drhd->include_all)
3199 			continue;
3200 
3201 		for_each_active_dev_scope(drhd->devices,
3202 					  drhd->devices_cnt, i, dev)
3203 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3204 				break;
3205 		if (i < drhd->devices_cnt)
3206 			continue;
3207 
3208 		/* This IOMMU has *only* gfx devices. Either bypass it or
3209 		   mark it as gfx-dedicated, as appropriate. */
3210 		drhd->gfx_dedicated = 1;
3211 		if (!dmar_map_gfx)
3212 			drhd->ignored = 1;
3213 	}
3214 }
3215 
3216 #ifdef CONFIG_SUSPEND
3217 static int init_iommu_hw(void)
3218 {
3219 	struct dmar_drhd_unit *drhd;
3220 	struct intel_iommu *iommu = NULL;
3221 
3222 	for_each_active_iommu(iommu, drhd)
3223 		if (iommu->qi)
3224 			dmar_reenable_qi(iommu);
3225 
3226 	for_each_iommu(iommu, drhd) {
3227 		if (drhd->ignored) {
3228 			/*
3229 			 * we always have to disable PMRs or DMA may fail on
3230 			 * this device
3231 			 */
3232 			if (force_on)
3233 				iommu_disable_protect_mem_regions(iommu);
3234 			continue;
3235 		}
3236 
3237 		iommu_flush_write_buffer(iommu);
3238 		iommu_set_root_entry(iommu);
3239 		iommu_enable_translation(iommu);
3240 		iommu_disable_protect_mem_regions(iommu);
3241 	}
3242 
3243 	return 0;
3244 }
3245 
3246 static void iommu_flush_all(void)
3247 {
3248 	struct dmar_drhd_unit *drhd;
3249 	struct intel_iommu *iommu;
3250 
3251 	for_each_active_iommu(iommu, drhd) {
3252 		iommu->flush.flush_context(iommu, 0, 0, 0,
3253 					   DMA_CCMD_GLOBAL_INVL);
3254 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3255 					 DMA_TLB_GLOBAL_FLUSH);
3256 	}
3257 }
3258 
3259 static int iommu_suspend(void)
3260 {
3261 	struct dmar_drhd_unit *drhd;
3262 	struct intel_iommu *iommu = NULL;
3263 	unsigned long flag;
3264 
3265 	for_each_active_iommu(iommu, drhd) {
3266 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3267 					     GFP_KERNEL);
3268 		if (!iommu->iommu_state)
3269 			goto nomem;
3270 	}
3271 
3272 	iommu_flush_all();
3273 
3274 	for_each_active_iommu(iommu, drhd) {
3275 		iommu_disable_translation(iommu);
3276 
3277 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3278 
3279 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3280 			readl(iommu->reg + DMAR_FECTL_REG);
3281 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3282 			readl(iommu->reg + DMAR_FEDATA_REG);
3283 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3284 			readl(iommu->reg + DMAR_FEADDR_REG);
3285 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3286 			readl(iommu->reg + DMAR_FEUADDR_REG);
3287 
3288 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3289 	}
3290 	return 0;
3291 
3292 nomem:
3293 	for_each_active_iommu(iommu, drhd)
3294 		kfree(iommu->iommu_state);
3295 
3296 	return -ENOMEM;
3297 }
3298 
3299 static void iommu_resume(void)
3300 {
3301 	struct dmar_drhd_unit *drhd;
3302 	struct intel_iommu *iommu = NULL;
3303 	unsigned long flag;
3304 
3305 	if (init_iommu_hw()) {
3306 		if (force_on)
3307 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3308 		else
3309 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3310 		return;
3311 	}
3312 
3313 	for_each_active_iommu(iommu, drhd) {
3314 
3315 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3316 
3317 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3318 			iommu->reg + DMAR_FECTL_REG);
3319 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3320 			iommu->reg + DMAR_FEDATA_REG);
3321 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3322 			iommu->reg + DMAR_FEADDR_REG);
3323 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3324 			iommu->reg + DMAR_FEUADDR_REG);
3325 
3326 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3327 	}
3328 
3329 	for_each_active_iommu(iommu, drhd)
3330 		kfree(iommu->iommu_state);
3331 }
3332 
3333 static struct syscore_ops iommu_syscore_ops = {
3334 	.resume		= iommu_resume,
3335 	.suspend	= iommu_suspend,
3336 };
3337 
3338 static void __init init_iommu_pm_ops(void)
3339 {
3340 	register_syscore_ops(&iommu_syscore_ops);
3341 }
3342 
3343 #else
3344 static inline void init_iommu_pm_ops(void) {}
3345 #endif	/* CONFIG_SUSPEND */
3346 
3347 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3348 {
3349 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3350 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3351 	    rmrr->end_address <= rmrr->base_address ||
3352 	    arch_rmrr_sanity_check(rmrr))
3353 		return -EINVAL;
3354 
3355 	return 0;
3356 }
3357 
3358 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3359 {
3360 	struct acpi_dmar_reserved_memory *rmrr;
3361 	struct dmar_rmrr_unit *rmrru;
3362 
3363 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3364 	if (rmrr_sanity_check(rmrr)) {
3365 		pr_warn(FW_BUG
3366 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3367 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3368 			   rmrr->base_address, rmrr->end_address,
3369 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3370 			   dmi_get_system_info(DMI_BIOS_VERSION),
3371 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3372 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3373 	}
3374 
3375 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3376 	if (!rmrru)
3377 		goto out;
3378 
3379 	rmrru->hdr = header;
3380 
3381 	rmrru->base_address = rmrr->base_address;
3382 	rmrru->end_address = rmrr->end_address;
3383 
3384 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3385 				((void *)rmrr) + rmrr->header.length,
3386 				&rmrru->devices_cnt);
3387 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3388 		goto free_rmrru;
3389 
3390 	list_add(&rmrru->list, &dmar_rmrr_units);
3391 
3392 	return 0;
3393 free_rmrru:
3394 	kfree(rmrru);
3395 out:
3396 	return -ENOMEM;
3397 }
3398 
3399 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3400 {
3401 	struct dmar_atsr_unit *atsru;
3402 	struct acpi_dmar_atsr *tmp;
3403 
3404 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3405 				dmar_rcu_check()) {
3406 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3407 		if (atsr->segment != tmp->segment)
3408 			continue;
3409 		if (atsr->header.length != tmp->header.length)
3410 			continue;
3411 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3412 			return atsru;
3413 	}
3414 
3415 	return NULL;
3416 }
3417 
3418 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3419 {
3420 	struct acpi_dmar_atsr *atsr;
3421 	struct dmar_atsr_unit *atsru;
3422 
3423 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3424 		return 0;
3425 
3426 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3427 	atsru = dmar_find_atsr(atsr);
3428 	if (atsru)
3429 		return 0;
3430 
3431 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3432 	if (!atsru)
3433 		return -ENOMEM;
3434 
3435 	/*
3436 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3437 	 * copy the memory content because the memory buffer will be freed
3438 	 * on return.
3439 	 */
3440 	atsru->hdr = (void *)(atsru + 1);
3441 	memcpy(atsru->hdr, hdr, hdr->length);
3442 	atsru->include_all = atsr->flags & 0x1;
3443 	if (!atsru->include_all) {
3444 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3445 				(void *)atsr + atsr->header.length,
3446 				&atsru->devices_cnt);
3447 		if (atsru->devices_cnt && atsru->devices == NULL) {
3448 			kfree(atsru);
3449 			return -ENOMEM;
3450 		}
3451 	}
3452 
3453 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3454 
3455 	return 0;
3456 }
3457 
3458 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3459 {
3460 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3461 	kfree(atsru);
3462 }
3463 
3464 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3465 {
3466 	struct acpi_dmar_atsr *atsr;
3467 	struct dmar_atsr_unit *atsru;
3468 
3469 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3470 	atsru = dmar_find_atsr(atsr);
3471 	if (atsru) {
3472 		list_del_rcu(&atsru->list);
3473 		synchronize_rcu();
3474 		intel_iommu_free_atsr(atsru);
3475 	}
3476 
3477 	return 0;
3478 }
3479 
3480 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3481 {
3482 	int i;
3483 	struct device *dev;
3484 	struct acpi_dmar_atsr *atsr;
3485 	struct dmar_atsr_unit *atsru;
3486 
3487 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3488 	atsru = dmar_find_atsr(atsr);
3489 	if (!atsru)
3490 		return 0;
3491 
3492 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3493 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3494 					  i, dev)
3495 			return -EBUSY;
3496 	}
3497 
3498 	return 0;
3499 }
3500 
3501 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3502 {
3503 	struct dmar_satc_unit *satcu;
3504 	struct acpi_dmar_satc *tmp;
3505 
3506 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3507 				dmar_rcu_check()) {
3508 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3509 		if (satc->segment != tmp->segment)
3510 			continue;
3511 		if (satc->header.length != tmp->header.length)
3512 			continue;
3513 		if (memcmp(satc, tmp, satc->header.length) == 0)
3514 			return satcu;
3515 	}
3516 
3517 	return NULL;
3518 }
3519 
3520 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3521 {
3522 	struct acpi_dmar_satc *satc;
3523 	struct dmar_satc_unit *satcu;
3524 
3525 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3526 		return 0;
3527 
3528 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3529 	satcu = dmar_find_satc(satc);
3530 	if (satcu)
3531 		return 0;
3532 
3533 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3534 	if (!satcu)
3535 		return -ENOMEM;
3536 
3537 	satcu->hdr = (void *)(satcu + 1);
3538 	memcpy(satcu->hdr, hdr, hdr->length);
3539 	satcu->atc_required = satc->flags & 0x1;
3540 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3541 					      (void *)satc + satc->header.length,
3542 					      &satcu->devices_cnt);
3543 	if (satcu->devices_cnt && !satcu->devices) {
3544 		kfree(satcu);
3545 		return -ENOMEM;
3546 	}
3547 	list_add_rcu(&satcu->list, &dmar_satc_units);
3548 
3549 	return 0;
3550 }
3551 
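/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate
 * domain IDs and a root entry, then enable queued invalidation,
 * interrupts and translation unless the unit is ignored.
 */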
3552 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3553 {
3554 	int sp, ret;
3555 	struct intel_iommu *iommu = dmaru->iommu;
3556 
3557 	if (g_iommus[iommu->seq_id])
3558 		return 0;
3559 
3560 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3561 	if (ret)
3562 		goto out;
3563 
3564 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3565 		pr_warn("%s: Doesn't support hardware pass through.\n",
3566 			iommu->name);
3567 		return -ENXIO;
3568 	}
3569 
3570 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3571 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3572 		pr_warn("%s: Doesn't support large page.\n",
3573 			iommu->name);
3574 		return -ENXIO;
3575 	}
3576 
3577 	/*
3578 	 * Disable translation if already enabled prior to OS handover.
3579 	 */
3580 	if (iommu->gcmd & DMA_GCMD_TE)
3581 		iommu_disable_translation(iommu);
3582 
3583 	g_iommus[iommu->seq_id] = iommu;
3584 	ret = iommu_init_domains(iommu);
3585 	if (ret == 0)
3586 		ret = iommu_alloc_root_entry(iommu);
3587 	if (ret)
3588 		goto out;
3589 
3590 	intel_svm_check(iommu);
3591 
3592 	if (dmaru->ignored) {
3593 		/*
3594 		 * we always have to disable PMRs or DMA may fail on this device
3595 		 */
3596 		if (force_on)
3597 			iommu_disable_protect_mem_regions(iommu);
3598 		return 0;
3599 	}
3600 
3601 	intel_iommu_init_qi(iommu);
3602 	iommu_flush_write_buffer(iommu);
3603 
3604 #ifdef CONFIG_INTEL_IOMMU_SVM
3605 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3606 		ret = intel_svm_enable_prq(iommu);
3607 		if (ret)
3608 			goto disable_iommu;
3609 	}
3610 #endif
3611 	ret = dmar_set_interrupt(iommu);
3612 	if (ret)
3613 		goto disable_iommu;
3614 
3615 	iommu_set_root_entry(iommu);
3616 	iommu_enable_translation(iommu);
3617 
3618 	iommu_disable_protect_mem_regions(iommu);
3619 	return 0;
3620 
3621 disable_iommu:
3622 	disable_dmar_iommu(iommu);
3623 out:
3624 	free_dmar_iommu(iommu);
3625 	return ret;
3626 }
3627 
3628 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3629 {
3630 	int ret = 0;
3631 	struct intel_iommu *iommu = dmaru->iommu;
3632 
3633 	if (!intel_iommu_enabled)
3634 		return 0;
3635 	if (iommu == NULL)
3636 		return -EINVAL;
3637 
3638 	if (insert) {
3639 		ret = intel_iommu_add(dmaru);
3640 	} else {
3641 		disable_dmar_iommu(iommu);
3642 		free_dmar_iommu(iommu);
3643 	}
3644 
3645 	return ret;
3646 }
3647 
3648 static void intel_iommu_free_dmars(void)
3649 {
3650 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3651 	struct dmar_atsr_unit *atsru, *atsr_n;
3652 	struct dmar_satc_unit *satcu, *satc_n;
3653 
3654 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3655 		list_del(&rmrru->list);
3656 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3657 		kfree(rmrru);
3658 	}
3659 
3660 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3661 		list_del(&atsru->list);
3662 		intel_iommu_free_atsr(atsru);
3663 	}
3664 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3665 		list_del(&satcu->list);
3666 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3667 		kfree(satcu);
3668 	}
3669 }
3670 
3671 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3672 {
3673 	struct dmar_satc_unit *satcu;
3674 	struct acpi_dmar_satc *satc;
3675 	struct device *tmp;
3676 	int i;
3677 
3678 	dev = pci_physfn(dev);
3679 	rcu_read_lock();
3680 
3681 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3682 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3683 		if (satc->segment != pci_domain_nr(dev->bus))
3684 			continue;
3685 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3686 			if (to_pci_dev(tmp) == dev)
3687 				goto out;
3688 	}
3689 	satcu = NULL;
3690 out:
3691 	rcu_read_unlock();
3692 	return satcu;
3693 }
3694 
3695 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3696 {
3697 	int i, ret = 1;
3698 	struct pci_bus *bus;
3699 	struct pci_dev *bridge = NULL;
3700 	struct device *tmp;
3701 	struct acpi_dmar_atsr *atsr;
3702 	struct dmar_atsr_unit *atsru;
3703 	struct dmar_satc_unit *satcu;
3704 
3705 	dev = pci_physfn(dev);
3706 	satcu = dmar_find_matched_satc_unit(dev);
3707 	if (satcu)
3708 		/*
3709 		 * This device supports ATS as it is listed in the SATC table.
3710 		 * When the IOMMU is in legacy mode, ATS is enabled
3711 		 * automatically by HW for devices that require it, hence
3712 		 * the OS should not enable ATS on this device, to avoid
3713 		 * duplicated TLB invalidations.
3714 		 */
3715 		return !(satcu->atc_required && !sm_supported(iommu));
3716 
3717 	for (bus = dev->bus; bus; bus = bus->parent) {
3718 		bridge = bus->self;
3719 		/* If it's an integrated device, allow ATS */
3720 		if (!bridge)
3721 			return 1;
3722 		/* Connected via non-PCIe: no ATS */
3723 		if (!pci_is_pcie(bridge) ||
3724 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3725 			return 0;
3726 		/* If we found the root port, look it up in the ATSR */
3727 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3728 			break;
3729 	}
3730 
3731 	rcu_read_lock();
3732 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3733 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3734 		if (atsr->segment != pci_domain_nr(dev->bus))
3735 			continue;
3736 
3737 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3738 			if (tmp == &bridge->dev)
3739 				goto out;
3740 
3741 		if (atsru->include_all)
3742 			goto out;
3743 	}
3744 	ret = 0;
3745 out:
3746 	rcu_read_unlock();
3747 
3748 	return ret;
3749 }
3750 
3751 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3752 {
3753 	int ret;
3754 	struct dmar_rmrr_unit *rmrru;
3755 	struct dmar_atsr_unit *atsru;
3756 	struct dmar_satc_unit *satcu;
3757 	struct acpi_dmar_atsr *atsr;
3758 	struct acpi_dmar_reserved_memory *rmrr;
3759 	struct acpi_dmar_satc *satc;
3760 
3761 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3762 		return 0;
3763 
3764 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3765 		rmrr = container_of(rmrru->hdr,
3766 				    struct acpi_dmar_reserved_memory, header);
3767 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3768 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3769 				((void *)rmrr) + rmrr->header.length,
3770 				rmrr->segment, rmrru->devices,
3771 				rmrru->devices_cnt);
3772 			if (ret < 0)
3773 				return ret;
3774 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3775 			dmar_remove_dev_scope(info, rmrr->segment,
3776 				rmrru->devices, rmrru->devices_cnt);
3777 		}
3778 	}
3779 
3780 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3781 		if (atsru->include_all)
3782 			continue;
3783 
3784 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3785 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3786 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3787 					(void *)atsr + atsr->header.length,
3788 					atsr->segment, atsru->devices,
3789 					atsru->devices_cnt);
3790 			if (ret > 0)
3791 				break;
3792 			else if (ret < 0)
3793 				return ret;
3794 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3795 			if (dmar_remove_dev_scope(info, atsr->segment,
3796 					atsru->devices, atsru->devices_cnt))
3797 				break;
3798 		}
3799 	}
3800 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3801 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3802 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3803 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3804 					(void *)satc + satc->header.length,
3805 					satc->segment, satcu->devices,
3806 					satcu->devices_cnt);
3807 			if (ret > 0)
3808 				break;
3809 			else if (ret < 0)
3810 				return ret;
3811 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3812 			if (dmar_remove_dev_scope(info, satc->segment,
3813 					satcu->devices, satcu->devices_cnt))
3814 				break;
3815 		}
3816 	}
3817 
3818 	return 0;
3819 }
3820 
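/*
 * Memory hotplug notifier for the static identity domain: build the
 * identity mapping for memory that is coming online, and unmap it
 * (flushing the IOTLB on all active IOMMUs) when it goes offline or
 * the online operation is cancelled.
 */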
3821 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3822 				       unsigned long val, void *v)
3823 {
3824 	struct memory_notify *mhp = v;
3825 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3826 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3827 			mhp->nr_pages - 1);
3828 
3829 	switch (val) {
3830 	case MEM_GOING_ONLINE:
3831 		if (iommu_domain_identity_map(si_domain,
3832 					      start_vpfn, last_vpfn)) {
3833 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3834 				start_vpfn, last_vpfn);
3835 			return NOTIFY_BAD;
3836 		}
3837 		break;
3838 
3839 	case MEM_OFFLINE:
3840 	case MEM_CANCEL_ONLINE:
3841 		{
3842 			struct dmar_drhd_unit *drhd;
3843 			struct intel_iommu *iommu;
3844 			LIST_HEAD(freelist);
3845 
3846 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3847 
3848 			rcu_read_lock();
3849 			for_each_active_iommu(iommu, drhd)
3850 				iommu_flush_iotlb_psi(iommu, si_domain,
3851 					start_vpfn, mhp->nr_pages,
3852 					list_empty(&freelist), 0);
3853 			rcu_read_unlock();
3854 			put_pages_list(&freelist);
3855 		}
3856 		break;
3857 	}
3858 
3859 	return NOTIFY_OK;
3860 }
3861 
3862 static struct notifier_block intel_iommu_memory_nb = {
3863 	.notifier_call = intel_iommu_memory_notifier,
3864 	.priority = 0
3865 };
3866 
3867 static void intel_disable_iommus(void)
3868 {
3869 	struct intel_iommu *iommu = NULL;
3870 	struct dmar_drhd_unit *drhd;
3871 
3872 	for_each_iommu(iommu, drhd)
3873 		iommu_disable_translation(iommu);
3874 }
3875 
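/*
 * Called during kernel shutdown: disable the protected memory regions
 * and then turn off translation on every IOMMU, unless the IOMMU was
 * never enabled in the first place.
 */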
3876 void intel_iommu_shutdown(void)
3877 {
3878 	struct dmar_drhd_unit *drhd;
3879 	struct intel_iommu *iommu = NULL;
3880 
3881 	if (no_iommu || dmar_disabled)
3882 		return;
3883 
3884 	down_write(&dmar_global_lock);
3885 
3886 	/* Disable PMRs explicitly here. */
3887 	for_each_iommu(iommu, drhd)
3888 		iommu_disable_protect_mem_regions(iommu);
3889 
3890 	/* Make sure the IOMMUs are switched off */
3891 	intel_disable_iommus();
3892 
3893 	up_write(&dmar_global_lock);
3894 }
3895 
3896 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3897 {
3898 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3899 
3900 	return container_of(iommu_dev, struct intel_iommu, iommu);
3901 }
3902 
3903 static ssize_t version_show(struct device *dev,
3904 			    struct device_attribute *attr, char *buf)
3905 {
3906 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3907 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3908 	return sprintf(buf, "%d:%d\n",
3909 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3910 }
3911 static DEVICE_ATTR_RO(version);
3912 
3913 static ssize_t address_show(struct device *dev,
3914 			    struct device_attribute *attr, char *buf)
3915 {
3916 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3917 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3918 }
3919 static DEVICE_ATTR_RO(address);
3920 
3921 static ssize_t cap_show(struct device *dev,
3922 			struct device_attribute *attr, char *buf)
3923 {
3924 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3925 	return sprintf(buf, "%llx\n", iommu->cap);
3926 }
3927 static DEVICE_ATTR_RO(cap);
3928 
3929 static ssize_t ecap_show(struct device *dev,
3930 			 struct device_attribute *attr, char *buf)
3931 {
3932 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3933 	return sprintf(buf, "%llx\n", iommu->ecap);
3934 }
3935 static DEVICE_ATTR_RO(ecap);
3936 
3937 static ssize_t domains_supported_show(struct device *dev,
3938 				      struct device_attribute *attr, char *buf)
3939 {
3940 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3941 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3942 }
3943 static DEVICE_ATTR_RO(domains_supported);
3944 
3945 static ssize_t domains_used_show(struct device *dev,
3946 				 struct device_attribute *attr, char *buf)
3947 {
3948 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3949 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3950 						  cap_ndoms(iommu->cap)));
3951 }
3952 static DEVICE_ATTR_RO(domains_used);
3953 
3954 static struct attribute *intel_iommu_attrs[] = {
3955 	&dev_attr_version.attr,
3956 	&dev_attr_address.attr,
3957 	&dev_attr_cap.attr,
3958 	&dev_attr_ecap.attr,
3959 	&dev_attr_domains_supported.attr,
3960 	&dev_attr_domains_used.attr,
3961 	NULL,
3962 };
3963 
3964 static struct attribute_group intel_iommu_group = {
3965 	.name = "intel-iommu",
3966 	.attrs = intel_iommu_attrs,
3967 };
3968 
3969 const struct attribute_group *intel_iommu_groups[] = {
3970 	&intel_iommu_group,
3971 	NULL,
3972 };
3973 
3974 static inline bool has_external_pci(void)
3975 {
3976 	struct pci_dev *pdev = NULL;
3977 
3978 	for_each_pci_dev(pdev)
3979 		if (pdev->external_facing)
3980 			return true;
3981 
3982 	return false;
3983 }
3984 
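/*
 * If the platform opted in to DMA protection (DMAR opt-in) and an
 * external-facing PCI port exists, force the IOMMU on even when it was
 * disabled on the command line, defaulting previously-disabled setups
 * to passthrough mode.  Returns 1 when enablement was forced.
 */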
3985 static int __init platform_optin_force_iommu(void)
3986 {
3987 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3988 		return 0;
3989 
3990 	if (no_iommu || dmar_disabled)
3991 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3992 
3993 	/*
3994 	 * If Intel-IOMMU is disabled by default, we will apply the identity
3995 	 * map to all devices except those marked as being untrusted.
3996 	 */
3997 	if (dmar_disabled)
3998 		iommu_set_default_passthrough(false);
3999 
4000 	dmar_disabled = 0;
4001 	no_iommu = 0;
4002 
4003 	return 1;
4004 }
4005 
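/*
 * Probe ACPI namespace devices that appear in the DRHD device scopes
 * and are not yet part of an IOMMU group, so that their physical
 * companion devices get attached to the Intel IOMMU ops.
 */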
4006 static int __init probe_acpi_namespace_devices(void)
4007 {
4008 	struct dmar_drhd_unit *drhd;
4009 	/* To avoid a -Wunused-but-set-variable warning. */
4010 	struct intel_iommu *iommu __maybe_unused;
4011 	struct device *dev;
4012 	int i, ret = 0;
4013 
4014 	for_each_active_iommu(iommu, drhd) {
4015 		for_each_active_dev_scope(drhd->devices,
4016 					  drhd->devices_cnt, i, dev) {
4017 			struct acpi_device_physical_node *pn;
4018 			struct iommu_group *group;
4019 			struct acpi_device *adev;
4020 
4021 			if (dev->bus != &acpi_bus_type)
4022 				continue;
4023 
4024 			adev = to_acpi_device(dev);
4025 			mutex_lock(&adev->physical_node_lock);
4026 			list_for_each_entry(pn,
4027 					    &adev->physical_node_list, node) {
4028 				group = iommu_group_get(pn->dev);
4029 				if (group) {
4030 					iommu_group_put(group);
4031 					continue;
4032 				}
4033 
4034 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4035 				ret = iommu_probe_device(pn->dev);
4036 				if (ret)
4037 					break;
4038 			}
4039 			mutex_unlock(&adev->physical_node_lock);
4040 
4041 			if (ret)
4042 				return ret;
4043 		}
4044 	}
4045 
4046 	return 0;
4047 }
4048 
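/*
 * Main entry point for VT-d initialization: parse the DMAR table and
 * device scopes, bail out with translation disabled when the IOMMU is
 * not to be used, otherwise set up the DMAR units via init_dmars(),
 * register the IOMMUs with sysfs and the IOMMU core, hook up memory
 * hotplug and ACPI namespace devices, and finally enable translation
 * on units that firmware had not already enabled.
 */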
4049 int __init intel_iommu_init(void)
4050 {
4051 	int ret = -ENODEV;
4052 	struct dmar_drhd_unit *drhd;
4053 	struct intel_iommu *iommu;
4054 
4055 	/*
4056 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4057 	 * opt in, so enforce that.
4058 	 */
4059 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4060 		    platform_optin_force_iommu();
4061 
4062 	down_write(&dmar_global_lock);
4063 	if (dmar_table_init()) {
4064 		if (force_on)
4065 			panic("tboot: Failed to initialize DMAR table\n");
4066 		goto out_free_dmar;
4067 	}
4068 
4069 	if (dmar_dev_scope_init() < 0) {
4070 		if (force_on)
4071 			panic("tboot: Failed to initialize DMAR device scope\n");
4072 		goto out_free_dmar;
4073 	}
4074 
4075 	up_write(&dmar_global_lock);
4076 
4077 	/*
4078 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4079 	 * complain later when we register it under the lock.
4080 	 */
4081 	dmar_register_bus_notifier();
4082 
4083 	down_write(&dmar_global_lock);
4084 
4085 	if (!no_iommu)
4086 		intel_iommu_debugfs_init();
4087 
4088 	if (no_iommu || dmar_disabled) {
4089 		/*
4090 		 * We exit the function here to ensure the IOMMU's remapping
4091 		 * and mempool aren't set up, which means the IOMMU's PMRs
4092 		 * won't be disabled via the call to init_dmars().  So disable
4093 		 * them explicitly here.  The PMRs were set up by tboot prior
4094 		 * to calling SENTER, but the kernel is expected to reset/tear
4095 		 * them down.
4096 		 */
4097 		if (intel_iommu_tboot_noforce) {
4098 			for_each_iommu(iommu, drhd)
4099 				iommu_disable_protect_mem_regions(iommu);
4100 		}
4101 
4102 		/*
4103 		 * Make sure the IOMMUs are switched off, even when we
4104 		 * boot into a kexec kernel and the previous kernel left
4105 		 * them enabled
4106 		 */
4107 		intel_disable_iommus();
4108 		goto out_free_dmar;
4109 	}
4110 
4111 	if (list_empty(&dmar_rmrr_units))
4112 		pr_info("No RMRR found\n");
4113 
4114 	if (list_empty(&dmar_atsr_units))
4115 		pr_info("No ATSR found\n");
4116 
4117 	if (list_empty(&dmar_satc_units))
4118 		pr_info("No SATC found\n");
4119 
4120 	if (dmar_map_gfx)
4121 		intel_iommu_gfx_mapped = 1;
4122 
4123 	init_no_remapping_devices();
4124 
4125 	ret = init_dmars();
4126 	if (ret) {
4127 		if (force_on)
4128 			panic("tboot: Failed to initialize DMARs\n");
4129 		pr_err("Initialization failed\n");
4130 		goto out_free_dmar;
4131 	}
4132 	up_write(&dmar_global_lock);
4133 
4134 	init_iommu_pm_ops();
4135 
4136 	down_read(&dmar_global_lock);
4137 	for_each_active_iommu(iommu, drhd) {
4138 		/*
4139 		 * The flush queue implementation does not perform
4140 		 * page-selective invalidations that are required for efficient
4141 		 * TLB flushes in virtual environments.  The benefit of batching
4142 		 * is likely to be much lower than the overhead of synchronizing
4143 		 * the virtual and physical IOMMU page-tables.
4144 		 */
4145 		if (cap_caching_mode(iommu->cap)) {
4146 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4147 			iommu_set_dma_strict();
4148 		}
4149 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4150 				       intel_iommu_groups,
4151 				       "%s", iommu->name);
4152 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4153 	}
4154 	up_read(&dmar_global_lock);
4155 
4156 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4157 	if (si_domain && !hw_pass_through)
4158 		register_memory_notifier(&intel_iommu_memory_nb);
4159 
4160 	down_read(&dmar_global_lock);
4161 	if (probe_acpi_namespace_devices())
4162 		pr_warn("ACPI name space devices didn't probe correctly\n");
4163 
4164 	/* Finally, we enable the DMA remapping hardware. */
4165 	for_each_iommu(iommu, drhd) {
4166 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4167 			iommu_enable_translation(iommu);
4168 
4169 		iommu_disable_protect_mem_regions(iommu);
4170 	}
4171 	up_read(&dmar_global_lock);
4172 
4173 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4174 
4175 	intel_iommu_enabled = 1;
4176 
4177 	return 0;
4178 
4179 out_free_dmar:
4180 	intel_iommu_free_dmars();
4181 	up_write(&dmar_global_lock);
4182 	return ret;
4183 }
4184 
4185 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4186 {
4187 	struct device_domain_info *info = opaque;
4188 
4189 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4190 	return 0;
4191 }
4192 
4193 /*
4194  * NB - intel-iommu lacks any sort of reference counting for the users of
4195  * dependent devices.  If multiple endpoints have intersecting dependent
4196  * devices, unbinding the driver from any one of them will possibly leave
4197  * the others unable to operate.
4198  */
4199 static void domain_context_clear(struct device_domain_info *info)
4200 {
4201 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4202 		return;
4203 
4204 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4205 			       &domain_context_clear_one_cb, info);
4206 }
4207 
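/*
 * Tear down all per-device DMAR state with device_domain_lock held:
 * clear the PASID entry and context mapping, disable the device IOTLB,
 * free the PASID table, and detach the domain from the IOMMU.
 */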
4208 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4209 {
4210 	struct dmar_domain *domain;
4211 	struct intel_iommu *iommu;
4212 	unsigned long flags;
4213 
4214 	assert_spin_locked(&device_domain_lock);
4215 
4216 	if (WARN_ON(!info))
4217 		return;
4218 
4219 	iommu = info->iommu;
4220 	domain = info->domain;
4221 
4222 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4223 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4224 			intel_pasid_tear_down_entry(iommu, info->dev,
4225 					PASID_RID2PASID, false);
4226 
4227 		iommu_disable_dev_iotlb(info);
4228 		domain_context_clear(info);
4229 		intel_pasid_free_table(info->dev);
4230 	}
4231 
4232 	list_del(&info->link);
4233 
4234 	spin_lock_irqsave(&iommu->lock, flags);
4235 	domain_detach_iommu(domain, iommu);
4236 	spin_unlock_irqrestore(&iommu->lock, flags);
4237 }
4238 
4239 static void dmar_remove_one_dev_info(struct device *dev)
4240 {
4241 	struct device_domain_info *info;
4242 	unsigned long flags;
4243 
4244 	spin_lock_irqsave(&device_domain_lock, flags);
4245 	info = dev_iommu_priv_get(dev);
4246 	if (info)
4247 		__dmar_remove_one_dev_info(info);
4248 	spin_unlock_irqrestore(&device_domain_lock, flags);
4249 }
4250 
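/*
 * Initialize a domain allocated through the IOMMU API for the given
 * guest address width: derive the adjusted AGAW and allocate the
 * top-level page directory.
 */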
4251 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4252 {
4253 	int adjust_width;
4254 
4255 	/* calculate AGAW */
4256 	domain->gaw = guest_width;
4257 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4258 	domain->agaw = width_to_agaw(adjust_width);
4259 
4260 	domain->iommu_coherency = false;
4261 	domain->iommu_superpage = 0;
4262 	domain->max_addr = 0;
4263 
4264 	/* always allocate the top pgd */
4265 	domain->pgd = alloc_pgtable_page(domain->nid);
4266 	if (!domain->pgd)
4267 		return -ENOMEM;
4268 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4269 	return 0;
4270 }
4271 
4272 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4273 {
4274 	struct dmar_domain *dmar_domain;
4275 	struct iommu_domain *domain;
4276 
4277 	switch (type) {
4278 	case IOMMU_DOMAIN_DMA:
4279 	case IOMMU_DOMAIN_DMA_FQ:
4280 	case IOMMU_DOMAIN_UNMANAGED:
4281 		dmar_domain = alloc_domain(type);
4282 		if (!dmar_domain) {
4283 			pr_err("Can't allocate dmar_domain\n");
4284 			return NULL;
4285 		}
4286 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4287 			pr_err("Domain initialization failed\n");
4288 			domain_exit(dmar_domain);
4289 			return NULL;
4290 		}
4291 
4292 		domain = &dmar_domain->domain;
4293 		domain->geometry.aperture_start = 0;
4294 		domain->geometry.aperture_end   =
4295 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4296 		domain->geometry.force_aperture = true;
4297 
4298 		return domain;
4299 	case IOMMU_DOMAIN_IDENTITY:
4300 		return &si_domain->domain;
4301 	default:
4302 		return NULL;
4303 	}
4304 
4305 	return NULL;
4306 }
4307 
4308 static void intel_iommu_domain_free(struct iommu_domain *domain)
4309 {
4310 	if (domain != &si_domain->domain)
4311 		domain_exit(to_dmar_domain(domain));
4312 }
4313 
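/*
 * Check that @dev's IOMMU can back the domain before attaching: verify
 * snoop-control support when force_snooping is set, clamp the domain's
 * address width to what the IOMMU supports, and drop extra page-table
 * levels if the IOMMU's AGAW is smaller than the domain's.
 */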
4314 static int prepare_domain_attach_device(struct iommu_domain *domain,
4315 					struct device *dev)
4316 {
4317 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4318 	struct intel_iommu *iommu;
4319 	int addr_width;
4320 
4321 	iommu = device_to_iommu(dev, NULL, NULL);
4322 	if (!iommu)
4323 		return -ENODEV;
4324 
4325 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4326 		return -EOPNOTSUPP;
4327 
4328 	/* check if this iommu agaw is sufficient for max mapped address */
4329 	addr_width = agaw_to_width(iommu->agaw);
4330 	if (addr_width > cap_mgaw(iommu->cap))
4331 		addr_width = cap_mgaw(iommu->cap);
4332 
4333 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4334 		dev_err(dev, "%s: iommu width (%d) is not "
4335 		        "sufficient for the mapped address (%llx)\n",
4336 		        __func__, addr_width, dmar_domain->max_addr);
4337 		return -EFAULT;
4338 	}
4339 	dmar_domain->gaw = addr_width;
4340 
4341 	/*
4342 	 * Knock out extra levels of page tables if necessary
4343 	 */
4344 	while (iommu->agaw < dmar_domain->agaw) {
4345 		struct dma_pte *pte;
4346 
4347 		pte = dmar_domain->pgd;
4348 		if (dma_pte_present(pte)) {
4349 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4350 			free_pgtable_page(pte);
4351 		}
4352 		dmar_domain->agaw--;
4353 	}
4354 
4355 	return 0;
4356 }
4357 
4358 static int intel_iommu_attach_device(struct iommu_domain *domain,
4359 				     struct device *dev)
4360 {
4361 	int ret;
4362 
4363 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4364 	    device_is_rmrr_locked(dev)) {
4365 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4366 		return -EPERM;
4367 	}
4368 
4369 	/* normally dev is not mapped */
4370 	if (unlikely(domain_context_mapped(dev))) {
4371 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4372 
4373 		if (info->domain)
4374 			dmar_remove_one_dev_info(dev);
4375 	}
4376 
4377 	ret = prepare_domain_attach_device(domain, dev);
4378 	if (ret)
4379 		return ret;
4380 
4381 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4382 }
4383 
4384 static void intel_iommu_detach_device(struct iommu_domain *domain,
4385 				      struct device *dev)
4386 {
4387 	dmar_remove_one_dev_info(dev);
4388 }
4389 
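/*
 * Map [iova, iova + size) to hpa for the IOMMU API: translate the
 * IOMMU_READ/IOMMU_WRITE flags into DMA PTE bits (adding the SNP bit
 * when set_pte_snp is set), grow the domain's max_addr after checking
 * that it still fits within the guest address width, and install the
 * mappings via __domain_mapping().
 */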
4390 static int intel_iommu_map(struct iommu_domain *domain,
4391 			   unsigned long iova, phys_addr_t hpa,
4392 			   size_t size, int iommu_prot, gfp_t gfp)
4393 {
4394 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4395 	u64 max_addr;
4396 	int prot = 0;
4397 
4398 	if (iommu_prot & IOMMU_READ)
4399 		prot |= DMA_PTE_READ;
4400 	if (iommu_prot & IOMMU_WRITE)
4401 		prot |= DMA_PTE_WRITE;
4402 	if (dmar_domain->set_pte_snp)
4403 		prot |= DMA_PTE_SNP;
4404 
4405 	max_addr = iova + size;
4406 	if (dmar_domain->max_addr < max_addr) {
4407 		u64 end;
4408 
4409 		/* check if minimum agaw is sufficient for mapped address */
4410 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4411 		if (end < max_addr) {
4412 			pr_err("%s: iommu width (%d) is not "
4413 			       "sufficient for the mapped address (%llx)\n",
4414 			       __func__, dmar_domain->gaw, max_addr);
4415 			return -EFAULT;
4416 		}
4417 		dmar_domain->max_addr = max_addr;
4418 	}
4419 	/* Round up size to next multiple of PAGE_SIZE, if it and
4420 	   the low bits of hpa would take us onto the next page */
4421 	size = aligned_nrpages(hpa, size);
4422 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4423 				hpa >> VTD_PAGE_SHIFT, size, prot);
4424 }
4425 
4426 static int intel_iommu_map_pages(struct iommu_domain *domain,
4427 				 unsigned long iova, phys_addr_t paddr,
4428 				 size_t pgsize, size_t pgcount,
4429 				 int prot, gfp_t gfp, size_t *mapped)
4430 {
4431 	unsigned long pgshift = __ffs(pgsize);
4432 	size_t size = pgcount << pgshift;
4433 	int ret;
4434 
4435 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4436 		return -EINVAL;
4437 
4438 	if (!IS_ALIGNED(iova | paddr, pgsize))
4439 		return -EINVAL;
4440 
4441 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4442 	if (!ret && mapped)
4443 		*mapped = size;
4444 
4445 	return ret;
4446 }
4447 
4448 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4449 				unsigned long iova, size_t size,
4450 				struct iommu_iotlb_gather *gather)
4451 {
4452 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4453 	unsigned long start_pfn, last_pfn;
4454 	int level = 0;
4455 
4456 	/* Cope with horrid API which requires us to unmap more than the
4457 	   size argument if it happens to be a large-page mapping. */
4458 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4459 
4460 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4461 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4462 
4463 	start_pfn = iova >> VTD_PAGE_SHIFT;
4464 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4465 
4466 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4467 
4468 	if (dmar_domain->max_addr == iova + size)
4469 		dmar_domain->max_addr = iova;
4470 
4471 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4472 
4473 	return size;
4474 }
4475 
4476 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4477 				      unsigned long iova,
4478 				      size_t pgsize, size_t pgcount,
4479 				      struct iommu_iotlb_gather *gather)
4480 {
4481 	unsigned long pgshift = __ffs(pgsize);
4482 	size_t size = pgcount << pgshift;
4483 
4484 	return intel_iommu_unmap(domain, iova, size, gather);
4485 }
4486 
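/*
 * Flush the IOTLB on every IOMMU in the domain for the range collected
 * in @gather, then free the page-table pages that were queued up by the
 * preceding unmap operations.
 */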
4487 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4488 				 struct iommu_iotlb_gather *gather)
4489 {
4490 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4491 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4492 	size_t size = gather->end - gather->start;
4493 	unsigned long start_pfn;
4494 	unsigned long nrpages;
4495 	int iommu_id;
4496 
4497 	nrpages = aligned_nrpages(gather->start, size);
4498 	start_pfn = mm_to_dma_pfn(iova_pfn);
4499 
4500 	for_each_domain_iommu(iommu_id, dmar_domain)
4501 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4502 				      start_pfn, nrpages,
4503 				      list_empty(&gather->freelist), 0);
4504 
4505 	put_pages_list(&gather->freelist);
4506 }
4507 
4508 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4509 					    dma_addr_t iova)
4510 {
4511 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4512 	struct dma_pte *pte;
4513 	int level = 0;
4514 	u64 phys = 0;
4515 
4516 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4517 	if (pte && dma_pte_present(pte))
4518 		phys = dma_pte_addr(pte) +
4519 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4520 						VTD_PAGE_SHIFT) - 1));
4521 
4522 	return phys;
4523 }
4524 
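/*
 * Return true only if every IOMMU that has a device attached to this
 * domain supports snoop control, i.e. force-snooping can be honoured.
 */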
4525 static bool domain_support_force_snooping(struct dmar_domain *domain)
4526 {
4527 	struct device_domain_info *info;
4528 	bool support = true;
4529 
4530 	assert_spin_locked(&device_domain_lock);
4531 	list_for_each_entry(info, &domain->devices, link) {
4532 		if (!ecap_sc_support(info->iommu->ecap)) {
4533 			support = false;
4534 			break;
4535 		}
4536 	}
4537 
4538 	return support;
4539 }
4540 
4541 static void domain_set_force_snooping(struct dmar_domain *domain)
4542 {
4543 	struct device_domain_info *info;
4544 
4545 	assert_spin_locked(&device_domain_lock);
4546 
4547 	/*
4548 	 * The second-level page table supports per-PTE snoop control. The
4549 	 * iommu_map() interface will handle this by setting the SNP bit.
4550 	 */
4551 	if (!domain_use_first_level(domain)) {
4552 		domain->set_pte_snp = true;
4553 		return;
4554 	}
4555 
4556 	list_for_each_entry(info, &domain->devices, link)
4557 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4558 						     PASID_RID2PASID);
4559 }
4560 
4561 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4562 {
4563 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4564 	unsigned long flags;
4565 
4566 	if (dmar_domain->force_snooping)
4567 		return true;
4568 
4569 	spin_lock_irqsave(&device_domain_lock, flags);
4570 	if (!domain_support_force_snooping(dmar_domain)) {
4571 		spin_unlock_irqrestore(&device_domain_lock, flags);
4572 		return false;
4573 	}
4574 
4575 	domain_set_force_snooping(dmar_domain);
4576 	dmar_domain->force_snooping = true;
4577 	spin_unlock_irqrestore(&device_domain_lock, flags);
4578 
4579 	return true;
4580 }
4581 
4582 static bool intel_iommu_capable(enum iommu_cap cap)
4583 {
4584 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4585 		return true;
4586 	if (cap == IOMMU_CAP_INTR_REMAP)
4587 		return irq_remapping_enabled == 1;
4588 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4589 		return dmar_platform_optin();
4590 
4591 	return false;
4592 }
4593 
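/*
 * IOMMU core probe hook: allocate the per-device device_domain_info,
 * record the device's bus/devfn/segment, detect ATS, PASID and PRI
 * support for PCI devices, and link the info into the global device
 * list before handing the iommu instance back to the core.
 */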
4594 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4595 {
4596 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4597 	struct device_domain_info *info;
4598 	struct intel_iommu *iommu;
4599 	unsigned long flags;
4600 	u8 bus, devfn;
4601 
4602 	iommu = device_to_iommu(dev, &bus, &devfn);
4603 	if (!iommu)
4604 		return ERR_PTR(-ENODEV);
4605 
4606 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4607 	if (!info)
4608 		return ERR_PTR(-ENOMEM);
4609 
4610 	if (dev_is_real_dma_subdevice(dev)) {
4611 		info->bus = pdev->bus->number;
4612 		info->devfn = pdev->devfn;
4613 		info->segment = pci_domain_nr(pdev->bus);
4614 	} else {
4615 		info->bus = bus;
4616 		info->devfn = devfn;
4617 		info->segment = iommu->segment;
4618 	}
4619 
4620 	info->dev = dev;
4621 	info->iommu = iommu;
4622 	if (dev_is_pci(dev)) {
4623 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4624 		    pci_ats_supported(pdev) &&
4625 		    dmar_ats_supported(pdev, iommu))
4626 			info->ats_supported = 1;
4627 
4628 		if (sm_supported(iommu)) {
4629 			if (pasid_supported(iommu)) {
4630 				int features = pci_pasid_features(pdev);
4631 
4632 				if (features >= 0)
4633 					info->pasid_supported = features | 1;
4634 			}
4635 
4636 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4637 			    pci_pri_supported(pdev))
4638 				info->pri_supported = 1;
4639 		}
4640 	}
4641 
4642 	spin_lock_irqsave(&device_domain_lock, flags);
4643 	list_add(&info->global, &device_domain_list);
4644 	dev_iommu_priv_set(dev, info);
4645 	spin_unlock_irqrestore(&device_domain_lock, flags);
4646 
4647 	return &iommu->iommu;
4648 }
4649 
4650 static void intel_iommu_release_device(struct device *dev)
4651 {
4652 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4653 	unsigned long flags;
4654 
4655 	dmar_remove_one_dev_info(dev);
4656 
4657 	spin_lock_irqsave(&device_domain_lock, flags);
4658 	dev_iommu_priv_set(dev, NULL);
4659 	list_del(&info->global);
4660 	spin_unlock_irqrestore(&device_domain_lock, flags);
4661 
4662 	kfree(info);
4663 	set_dma_ops(dev, NULL);
4664 }
4665 
4666 static void intel_iommu_probe_finalize(struct device *dev)
4667 {
4668 	set_dma_ops(dev, NULL);
4669 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4670 }
4671 
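/*
 * Report reserved regions for @device: every RMRR that covers it (as a
 * direct mapping, relaxable where permitted), the legacy ISA range used
 * by the floppy workaround, and the IOAPIC range as an MSI region.
 */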
4672 static void intel_iommu_get_resv_regions(struct device *device,
4673 					 struct list_head *head)
4674 {
4675 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4676 	struct iommu_resv_region *reg;
4677 	struct dmar_rmrr_unit *rmrr;
4678 	struct device *i_dev;
4679 	int i;
4680 
4681 	down_read(&dmar_global_lock);
4682 	for_each_rmrr_units(rmrr) {
4683 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4684 					  i, i_dev) {
4685 			struct iommu_resv_region *resv;
4686 			enum iommu_resv_type type;
4687 			size_t length;
4688 
4689 			if (i_dev != device &&
4690 			    !is_downstream_to_pci_bridge(device, i_dev))
4691 				continue;
4692 
4693 			length = rmrr->end_address - rmrr->base_address + 1;
4694 
4695 			type = device_rmrr_is_relaxable(device) ?
4696 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4697 
4698 			resv = iommu_alloc_resv_region(rmrr->base_address,
4699 						       length, prot, type);
4700 			if (!resv)
4701 				break;
4702 
4703 			list_add_tail(&resv->list, head);
4704 		}
4705 	}
4706 	up_read(&dmar_global_lock);
4707 
4708 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4709 	if (dev_is_pci(device)) {
4710 		struct pci_dev *pdev = to_pci_dev(device);
4711 
4712 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4713 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4714 						   IOMMU_RESV_DIRECT_RELAXABLE);
4715 			if (reg)
4716 				list_add_tail(&reg->list, head);
4717 		}
4718 	}
4719 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4720 
4721 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4722 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4723 				      0, IOMMU_RESV_MSI);
4724 	if (!reg)
4725 		return;
4726 	list_add_tail(&reg->list, head);
4727 }
4728 
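/*
 * Enable PASID support for @dev behind @iommu: set the PASID-enable bit
 * in the device's context entry (flushing the context cache if it was
 * clear) and enable the device's PASID/ATS features if they were not
 * already enabled.
 */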
4729 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4730 {
4731 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4732 	struct context_entry *context;
4733 	struct dmar_domain *domain;
4734 	unsigned long flags;
4735 	u64 ctx_lo;
4736 	int ret;
4737 
4738 	domain = info->domain;
4739 	if (!domain)
4740 		return -EINVAL;
4741 
4742 	spin_lock_irqsave(&device_domain_lock, flags);
4743 	spin_lock(&iommu->lock);
4744 
4745 	ret = -EINVAL;
4746 	if (!info->pasid_supported)
4747 		goto out;
4748 
4749 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4750 	if (WARN_ON(!context))
4751 		goto out;
4752 
4753 	ctx_lo = context[0].lo;
4754 
4755 	if (!(ctx_lo & CONTEXT_PASIDE)) {
4756 		ctx_lo |= CONTEXT_PASIDE;
4757 		context[0].lo = ctx_lo;
4758 		wmb();
4759 		iommu->flush.flush_context(iommu,
4760 					   domain->iommu_did[iommu->seq_id],
4761 					   PCI_DEVID(info->bus, info->devfn),
4762 					   DMA_CCMD_MASK_NOBIT,
4763 					   DMA_CCMD_DEVICE_INVL);
4764 	}
4765 
4766 	/* Enable PASID support in the device, if it wasn't already */
4767 	if (!info->pasid_enabled)
4768 		iommu_enable_dev_iotlb(info);
4769 
4770 	ret = 0;
4771 
4772  out:
4773 	spin_unlock(&iommu->lock);
4774 	spin_unlock_irqrestore(&device_domain_lock, flags);
4775 
4776 	return ret;
4777 }
4778 
4779 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4780 {
4781 	if (dev_is_pci(dev))
4782 		return pci_device_group(dev);
4783 	return generic_device_group(dev);
4784 }
4785 
4786 static int intel_iommu_enable_sva(struct device *dev)
4787 {
4788 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4789 	struct intel_iommu *iommu;
4790 	int ret;
4791 
4792 	if (!info || dmar_disabled)
4793 		return -EINVAL;
4794 
4795 	iommu = info->iommu;
4796 	if (!iommu)
4797 		return -EINVAL;
4798 
4799 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4800 		return -ENODEV;
4801 
4802 	if (intel_iommu_enable_pasid(iommu, dev))
4803 		return -ENODEV;
4804 
4805 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4806 		return -EINVAL;
4807 
4808 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4809 	if (!ret)
4810 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4811 
4812 	return ret;
4813 }
4814 
4815 static int intel_iommu_disable_sva(struct device *dev)
4816 {
4817 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4818 	struct intel_iommu *iommu = info->iommu;
4819 	int ret;
4820 
4821 	ret = iommu_unregister_device_fault_handler(dev);
4822 	if (!ret)
4823 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4824 
4825 	return ret;
4826 }
4827 
4828 static int intel_iommu_enable_iopf(struct device *dev)
4829 {
4830 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4831 
4832 	if (info && info->pri_supported)
4833 		return 0;
4834 
4835 	return -ENODEV;
4836 }
4837 
4838 static int
4839 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4840 {
4841 	switch (feat) {
4842 	case IOMMU_DEV_FEAT_IOPF:
4843 		return intel_iommu_enable_iopf(dev);
4844 
4845 	case IOMMU_DEV_FEAT_SVA:
4846 		return intel_iommu_enable_sva(dev);
4847 
4848 	default:
4849 		return -ENODEV;
4850 	}
4851 }
4852 
4853 static int
4854 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4855 {
4856 	switch (feat) {
4857 	case IOMMU_DEV_FEAT_IOPF:
4858 		return 0;
4859 
4860 	case IOMMU_DEV_FEAT_SVA:
4861 		return intel_iommu_disable_sva(dev);
4862 
4863 	default:
4864 		return -ENODEV;
4865 	}
4866 }
4867 
4868 static bool intel_iommu_is_attach_deferred(struct device *dev)
4869 {
4870 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4871 
4872 	return translation_pre_enabled(info->iommu) && !info->domain;
4873 }
4874 
4875 /*
4876  * Check that the device does not live on an external-facing PCI port that is
4877  * marked as untrusted.  Such devices must not be allowed to apply quirks and
4878  * thus bypass the IOMMU restrictions.
4879  */
4880 static bool risky_device(struct pci_dev *pdev)
4881 {
4882 	if (pdev->untrusted) {
4883 		pci_info(pdev,
4884 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4885 			 pdev->vendor, pdev->device);
4886 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4887 		return true;
4888 	}
4889 	return false;
4890 }
4891 
4892 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4893 				       unsigned long iova, size_t size)
4894 {
4895 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4896 	unsigned long pages = aligned_nrpages(iova, size);
4897 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4898 	struct intel_iommu *iommu;
4899 	int iommu_id;
4900 
4901 	for_each_domain_iommu(iommu_id, dmar_domain) {
4902 		iommu = g_iommus[iommu_id];
4903 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
4904 	}
4905 }
4906 
4907 const struct iommu_ops intel_iommu_ops = {
4908 	.capable		= intel_iommu_capable,
4909 	.domain_alloc		= intel_iommu_domain_alloc,
4910 	.probe_device		= intel_iommu_probe_device,
4911 	.probe_finalize		= intel_iommu_probe_finalize,
4912 	.release_device		= intel_iommu_release_device,
4913 	.get_resv_regions	= intel_iommu_get_resv_regions,
4914 	.put_resv_regions	= generic_iommu_put_resv_regions,
4915 	.device_group		= intel_iommu_device_group,
4916 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4917 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4918 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4919 	.def_domain_type	= device_def_domain_type,
4920 	.pgsize_bitmap		= SZ_4K,
4921 #ifdef CONFIG_INTEL_IOMMU_SVM
4922 	.sva_bind		= intel_svm_bind,
4923 	.sva_unbind		= intel_svm_unbind,
4924 	.sva_get_pasid		= intel_svm_get_pasid,
4925 	.page_response		= intel_svm_page_response,
4926 #endif
4927 	.default_domain_ops = &(const struct iommu_domain_ops) {
4928 		.attach_dev		= intel_iommu_attach_device,
4929 		.detach_dev		= intel_iommu_detach_device,
4930 		.map_pages		= intel_iommu_map_pages,
4931 		.unmap_pages		= intel_iommu_unmap_pages,
4932 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4933 		.flush_iotlb_all        = intel_flush_iotlb_all,
4934 		.iotlb_sync		= intel_iommu_tlb_sync,
4935 		.iova_to_phys		= intel_iommu_iova_to_phys,
4936 		.free			= intel_iommu_domain_free,
4937 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4938 	}
4939 };
4940 
4941 static void quirk_iommu_igfx(struct pci_dev *dev)
4942 {
4943 	if (risky_device(dev))
4944 		return;
4945 
4946 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4947 	dmar_map_gfx = 0;
4948 }
4949 
4950 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4958 
4959 /* Broadwell igfx malfunctions with dmar */
4960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4968 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4969 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4970 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4971 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4972 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4973 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4974 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4975 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4976 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4977 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4984 
4985 static void quirk_iommu_rwbf(struct pci_dev *dev)
4986 {
4987 	if (risky_device(dev))
4988 		return;
4989 
4990 	/*
4991 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4992 	 * but needs it. Same seems to hold for the desktop versions.
4993 	 */
4994 	pci_info(dev, "Forcing write-buffer flush capability\n");
4995 	rwbf_quirk = 1;
4996 }
4997 
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5005 
5006 #define GGC 0x52
5007 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5008 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5009 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5010 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5011 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5012 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5013 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5014 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5015 
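/*
 * Ironlake (Calpella) graphics quirk: if the BIOS did not allocate
 * stolen memory for the shadow GTT (per the GGC register), graphics DMA
 * remapping cannot work, so disable it; otherwise force strict IOTLB
 * flushing, since the graphics device has to be idle before a flush.
 */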
5016 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5017 {
5018 	unsigned short ggc;
5019 
5020 	if (risky_device(dev))
5021 		return;
5022 
5023 	if (pci_read_config_word(dev, GGC, &ggc))
5024 		return;
5025 
5026 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5027 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5028 		dmar_map_gfx = 0;
5029 	} else if (dmar_map_gfx) {
5030 		/* we have to ensure the gfx device is idle before we flush */
5031 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5032 		iommu_set_dma_strict();
5033 	}
5034 }
5035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5039 
5040 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5041 {
5042 	unsigned short ver;
5043 
5044 	if (!IS_GFX_DEVICE(dev))
5045 		return;
5046 
5047 	ver = (dev->device >> 8) & 0xff;
5048 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5049 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5050 	    ver != 0x9a && ver != 0xa7)
5051 		return;
5052 
5053 	if (risky_device(dev))
5054 		return;
5055 
5056 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5057 	iommu_skip_te_disable = 1;
5058 }
5059 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5060 
5061 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5062    ISOCH DMAR unit for the Azalia sound device, but not give it any
5063    TLB entries, which causes it to deadlock. Check for that.  We do
5064    this in a function called from init_dmars(), instead of in a PCI
5065    quirk, because we don't want to print the obnoxious "BIOS broken"
5066    message if VT-d is actually disabled.
5067 */
5068 static void __init check_tylersburg_isoch(void)
5069 {
5070 	struct pci_dev *pdev;
5071 	uint32_t vtisochctrl;
5072 
5073 	/* If there's no Azalia in the system anyway, forget it. */
5074 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5075 	if (!pdev)
5076 		return;
5077 
5078 	if (risky_device(pdev)) {
5079 		pci_dev_put(pdev);
5080 		return;
5081 	}
5082 
5083 	pci_dev_put(pdev);
5084 
5085 	/* System Management Registers. Might be hidden, in which case
5086 	   we can't do the sanity check. But that's OK, because the
5087 	   known-broken BIOSes _don't_ actually hide it, so far. */
5088 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5089 	if (!pdev)
5090 		return;
5091 
5092 	if (risky_device(pdev)) {
5093 		pci_dev_put(pdev);
5094 		return;
5095 	}
5096 
5097 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5098 		pci_dev_put(pdev);
5099 		return;
5100 	}
5101 
5102 	pci_dev_put(pdev);
5103 
5104 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5105 	if (vtisochctrl & 1)
5106 		return;
5107 
5108 	/* Drop all bits other than the number of TLB entries */
5109 	vtisochctrl &= 0x1c;
5110 
5111 	/* If we have the recommended number of TLB entries (16), fine. */
5112 	if (vtisochctrl == 0x10)
5113 		return;
5114 
5115 	/* Zero TLB entries? The ISOCH unit is unusable; identity-map Azalia instead. */
5116 	if (!vtisochctrl) {
5117 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5118 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5119 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5120 		     dmi_get_system_info(DMI_BIOS_VERSION),
5121 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5122 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5123 		return;
5124 	}
5125 
5126 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5127 	       vtisochctrl);
5128 }
5129