xref: /openbmc/linux/arch/arm64/kvm/mmu.c (revision 547840bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6 
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_ras.h>
18 #include <asm/kvm_asm.h>
19 #include <asm/kvm_emulate.h>
20 #include <asm/virt.h>
21 
22 #include "trace.h"
23 
24 static pgd_t *boot_hyp_pgd;
25 static pgd_t *hyp_pgd;
26 static pgd_t *merged_hyp_pgd;
27 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
28 
29 static unsigned long hyp_idmap_start;
30 static unsigned long hyp_idmap_end;
31 static phys_addr_t hyp_idmap_vector;
32 
33 static unsigned long io_map_base;
34 
35 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
36 
37 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
38 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
39 
40 static bool is_iomap(unsigned long flags)
41 {
42 	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
43 }
44 
45 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
46 {
47 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
48 }
49 
50 /**
51  * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
52  * @kvm:	pointer to kvm structure.
53  *
54  * Interface to HYP function to flush all VM TLB entries
55  */
56 void kvm_flush_remote_tlbs(struct kvm *kvm)
57 {
58 	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
59 }
60 
61 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
62 {
63 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
64 }
65 
66 /*
67  * D-Cache management functions. They take the page table entries by
68  * value, as they are flushing the cache using the kernel mapping (or
69  * kmap on 32bit).
70  */
71 static void kvm_flush_dcache_pte(pte_t pte)
72 {
73 	__kvm_flush_dcache_pte(pte);
74 }
75 
76 static void kvm_flush_dcache_pmd(pmd_t pmd)
77 {
78 	__kvm_flush_dcache_pmd(pmd);
79 }
80 
81 static void kvm_flush_dcache_pud(pud_t pud)
82 {
83 	__kvm_flush_dcache_pud(pud);
84 }
85 
86 static bool kvm_is_device_pfn(unsigned long pfn)
87 {
88 	return !pfn_valid(pfn);
89 }
90 
91 /**
92  * stage2_dissolve_pmd() - clear and flush huge PMD entry
93  * @kvm:	pointer to kvm structure.
94  * @addr:	IPA
95  * @pmd:	pmd pointer for IPA
96  *
97  * Clears a PMD entry and flushes the stage-1 and stage-2 TLB entries for @addr.
98  */
99 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
100 {
101 	if (!pmd_thp_or_huge(*pmd))
102 		return;
103 
104 	pmd_clear(pmd);
105 	kvm_tlb_flush_vmid_ipa(kvm, addr);
106 	put_page(virt_to_page(pmd));
107 }
108 
109 /**
110  * stage2_dissolve_pud() - clear and flush huge PUD entry
111  * @kvm:	pointer to kvm structure.
112  * @addr:	IPA
113  * @pud:	pud pointer for IPA
114  *
115  * Clears a PUD entry and flushes the stage-1 and stage-2 TLB entries for @addr.
116  */
117 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
118 {
119 	if (!stage2_pud_huge(kvm, *pudp))
120 		return;
121 
122 	stage2_pud_clear(kvm, pudp);
123 	kvm_tlb_flush_vmid_ipa(kvm, addr);
124 	put_page(virt_to_page(pudp));
125 }
126 
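/*
 * Pre-fill the page table object cache with up to @max pages so that
 * later stage-2 table allocations, performed while holding the
 * mmu_lock, cannot fail. Objects are handed out again by
 * mmu_memory_cache_alloc().
 */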
127 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
128 				  int min, int max)
129 {
130 	void *page;
131 
132 	BUG_ON(max > KVM_NR_MEM_OBJS);
133 	if (cache->nobjs >= min)
134 		return 0;
135 	while (cache->nobjs < max) {
136 		page = (void *)__get_free_page(GFP_PGTABLE_USER);
137 		if (!page)
138 			return -ENOMEM;
139 		cache->objects[cache->nobjs++] = page;
140 	}
141 	return 0;
142 }
143 
144 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
145 {
146 	while (mc->nobjs)
147 		free_page((unsigned long)mc->objects[--mc->nobjs]);
148 }
149 
150 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
151 {
152 	void *p;
153 
154 	BUG_ON(!mc || !mc->nobjs);
155 	p = mc->objects[--mc->nobjs];
156 	return p;
157 }
158 
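/*
 * The clear_stage2_*_entry() helpers below tear down an intermediate
 * stage-2 table entry: the entry is cleared, the TLB is invalidated
 * for the IPA, the now-unreferenced lower-level table is freed and the
 * reference on the page holding the entry is dropped.
 */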
159 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
160 {
161 	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
162 	stage2_pgd_clear(kvm, pgd);
163 	kvm_tlb_flush_vmid_ipa(kvm, addr);
164 	stage2_pud_free(kvm, pud_table);
165 	put_page(virt_to_page(pgd));
166 }
167 
168 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
169 {
170 	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
171 	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
172 	stage2_pud_clear(kvm, pud);
173 	kvm_tlb_flush_vmid_ipa(kvm, addr);
174 	stage2_pmd_free(kvm, pmd_table);
175 	put_page(virt_to_page(pud));
176 }
177 
178 static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
179 {
180 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
181 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
182 	pmd_clear(pmd);
183 	kvm_tlb_flush_vmid_ipa(kvm, addr);
184 	free_page((unsigned long)pte_table);
185 	put_page(virt_to_page(pmd));
186 }
187 
188 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
189 {
190 	WRITE_ONCE(*ptep, new_pte);
191 	dsb(ishst);
192 }
193 
194 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
195 {
196 	WRITE_ONCE(*pmdp, new_pmd);
197 	dsb(ishst);
198 }
199 
200 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
201 {
202 	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
203 }
204 
205 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
206 {
207 	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
208 	dsb(ishst);
209 }
210 
211 static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
212 {
213 	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
214 	dsb(ishst);
215 }
216 
217 /*
218  * Unmapping vs dcache management:
219  *
220  * If a guest maps certain memory pages as uncached, all writes will
221  * bypass the data cache and go directly to RAM.  However, the CPUs
222  * can still speculate reads (not writes) and fill cache lines with
223  * data.
224  *
225  * Those cache lines will be *clean* cache lines though, so a
226  * clean+invalidate operation is equivalent to an invalidate
227  * operation, because no cache lines are marked dirty.
228  *
229  * Those clean cache lines could be filled prior to an uncached write
230  * by the guest, and the cache coherent IO subsystem would therefore
231  * end up writing old data to disk.
232  *
233  * This is why right after unmapping a page/section and invalidating
234  * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
235  * the IO subsystem will never hit in the cache.
236  *
237  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
238  * we then fully enforce cacheability of RAM, no matter what the guest
239  * does.
240  */
241 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
242 		       phys_addr_t addr, phys_addr_t end)
243 {
244 	phys_addr_t start_addr = addr;
245 	pte_t *pte, *start_pte;
246 
247 	start_pte = pte = pte_offset_kernel(pmd, addr);
248 	do {
249 		if (!pte_none(*pte)) {
250 			pte_t old_pte = *pte;
251 
252 			kvm_set_pte(pte, __pte(0));
253 			kvm_tlb_flush_vmid_ipa(kvm, addr);
254 
255 			/* No need to invalidate the cache for device mappings */
256 			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
257 				kvm_flush_dcache_pte(old_pte);
258 
259 			put_page(virt_to_page(pte));
260 		}
261 	} while (pte++, addr += PAGE_SIZE, addr != end);
262 
263 	if (stage2_pte_table_empty(kvm, start_pte))
264 		clear_stage2_pmd_entry(kvm, pmd, start_addr);
265 }
266 
267 static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
268 		       phys_addr_t addr, phys_addr_t end)
269 {
270 	phys_addr_t next, start_addr = addr;
271 	pmd_t *pmd, *start_pmd;
272 
273 	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
274 	do {
275 		next = stage2_pmd_addr_end(kvm, addr, end);
276 		if (!pmd_none(*pmd)) {
277 			if (pmd_thp_or_huge(*pmd)) {
278 				pmd_t old_pmd = *pmd;
279 
280 				pmd_clear(pmd);
281 				kvm_tlb_flush_vmid_ipa(kvm, addr);
282 
283 				kvm_flush_dcache_pmd(old_pmd);
284 
285 				put_page(virt_to_page(pmd));
286 			} else {
287 				unmap_stage2_ptes(kvm, pmd, addr, next);
288 			}
289 		}
290 	} while (pmd++, addr = next, addr != end);
291 
292 	if (stage2_pmd_table_empty(kvm, start_pmd))
293 		clear_stage2_pud_entry(kvm, pud, start_addr);
294 }
295 
296 static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
297 		       phys_addr_t addr, phys_addr_t end)
298 {
299 	phys_addr_t next, start_addr = addr;
300 	pud_t *pud, *start_pud;
301 
302 	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
303 	do {
304 		next = stage2_pud_addr_end(kvm, addr, end);
305 		if (!stage2_pud_none(kvm, *pud)) {
306 			if (stage2_pud_huge(kvm, *pud)) {
307 				pud_t old_pud = *pud;
308 
309 				stage2_pud_clear(kvm, pud);
310 				kvm_tlb_flush_vmid_ipa(kvm, addr);
311 				kvm_flush_dcache_pud(old_pud);
312 				put_page(virt_to_page(pud));
313 			} else {
314 				unmap_stage2_pmds(kvm, pud, addr, next);
315 			}
316 		}
317 	} while (pud++, addr = next, addr != end);
318 
319 	if (stage2_pud_table_empty(kvm, start_pud))
320 		clear_stage2_pgd_entry(kvm, pgd, start_addr);
321 }
322 
323 /**
324  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
325  * @kvm:   The VM pointer
326  * @start: The intermediate physical base address of the range to unmap
327  * @size:  The size of the area to unmap
328  *
329  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
330  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
331  * destroying the VM), otherwise another faulting VCPU may come in and mess
332  * with things behind our backs.
333  */
334 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
335 {
336 	pgd_t *pgd;
337 	phys_addr_t addr = start, end = start + size;
338 	phys_addr_t next;
339 
340 	assert_spin_locked(&kvm->mmu_lock);
341 	WARN_ON(size & ~PAGE_MASK);
342 
343 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
344 	do {
345 		/*
346 		 * Make sure the page table is still active, as another thread
347 		 * could have freed the page table while we released
348 		 * the lock.
349 		 */
350 		if (!READ_ONCE(kvm->arch.pgd))
351 			break;
352 		next = stage2_pgd_addr_end(kvm, addr, end);
353 		if (!stage2_pgd_none(kvm, *pgd))
354 			unmap_stage2_puds(kvm, pgd, addr, next);
355 		/*
356 		 * If the range is too large, release the kvm->mmu_lock
357 		 * to prevent starvation and lockup detector warnings.
358 		 */
359 		if (next != end)
360 			cond_resched_lock(&kvm->mmu_lock);
361 	} while (pgd++, addr = next, addr != end);
362 }
363 
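/*
 * The stage2_flush_*() walkers below flush the data cache for every
 * valid mapping of RAM they find; stage2_flush_vm() uses them to
 * invalidate cache lines backing memory already mapped to the VM.
 */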
364 static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
365 			      phys_addr_t addr, phys_addr_t end)
366 {
367 	pte_t *pte;
368 
369 	pte = pte_offset_kernel(pmd, addr);
370 	do {
371 		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
372 			kvm_flush_dcache_pte(*pte);
373 	} while (pte++, addr += PAGE_SIZE, addr != end);
374 }
375 
376 static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
377 			      phys_addr_t addr, phys_addr_t end)
378 {
379 	pmd_t *pmd;
380 	phys_addr_t next;
381 
382 	pmd = stage2_pmd_offset(kvm, pud, addr);
383 	do {
384 		next = stage2_pmd_addr_end(kvm, addr, end);
385 		if (!pmd_none(*pmd)) {
386 			if (pmd_thp_or_huge(*pmd))
387 				kvm_flush_dcache_pmd(*pmd);
388 			else
389 				stage2_flush_ptes(kvm, pmd, addr, next);
390 		}
391 	} while (pmd++, addr = next, addr != end);
392 }
393 
394 static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
395 			      phys_addr_t addr, phys_addr_t end)
396 {
397 	pud_t *pud;
398 	phys_addr_t next;
399 
400 	pud = stage2_pud_offset(kvm, pgd, addr);
401 	do {
402 		next = stage2_pud_addr_end(kvm, addr, end);
403 		if (!stage2_pud_none(kvm, *pud)) {
404 			if (stage2_pud_huge(kvm, *pud))
405 				kvm_flush_dcache_pud(*pud);
406 			else
407 				stage2_flush_pmds(kvm, pud, addr, next);
408 		}
409 	} while (pud++, addr = next, addr != end);
410 }
411 
412 static void stage2_flush_memslot(struct kvm *kvm,
413 				 struct kvm_memory_slot *memslot)
414 {
415 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
416 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
417 	phys_addr_t next;
418 	pgd_t *pgd;
419 
420 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
421 	do {
422 		next = stage2_pgd_addr_end(kvm, addr, end);
423 		if (!stage2_pgd_none(kvm, *pgd))
424 			stage2_flush_puds(kvm, pgd, addr, next);
425 
426 		if (next != end)
427 			cond_resched_lock(&kvm->mmu_lock);
428 	} while (pgd++, addr = next, addr != end);
429 }
430 
431 /**
432  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
433  * @kvm: The struct kvm pointer
434  *
435  * Go through the stage 2 page tables and invalidate any cache lines
436  * backing memory already mapped to the VM.
437  */
438 static void stage2_flush_vm(struct kvm *kvm)
439 {
440 	struct kvm_memslots *slots;
441 	struct kvm_memory_slot *memslot;
442 	int idx;
443 
444 	idx = srcu_read_lock(&kvm->srcu);
445 	spin_lock(&kvm->mmu_lock);
446 
447 	slots = kvm_memslots(kvm);
448 	kvm_for_each_memslot(memslot, slots)
449 		stage2_flush_memslot(kvm, memslot);
450 
451 	spin_unlock(&kvm->mmu_lock);
452 	srcu_read_unlock(&kvm->srcu, idx);
453 }
454 
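/*
 * The clear_hyp_*_entry() helpers mirror their stage-2 counterparts
 * for the hyp page tables. No TLB invalidation is needed here: hyp
 * mappings are only torn down at hyp teardown time (see the comment
 * in __unmap_hyp_range()).
 */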
455 static void clear_hyp_pgd_entry(pgd_t *pgd)
456 {
457 	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
458 	pgd_clear(pgd);
459 	pud_free(NULL, pud_table);
460 	put_page(virt_to_page(pgd));
461 }
462 
463 static void clear_hyp_pud_entry(pud_t *pud)
464 {
465 	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
466 	VM_BUG_ON(pud_huge(*pud));
467 	pud_clear(pud);
468 	pmd_free(NULL, pmd_table);
469 	put_page(virt_to_page(pud));
470 }
471 
472 static void clear_hyp_pmd_entry(pmd_t *pmd)
473 {
474 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
475 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
476 	pmd_clear(pmd);
477 	pte_free_kernel(NULL, pte_table);
478 	put_page(virt_to_page(pmd));
479 }
480 
481 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
482 {
483 	pte_t *pte, *start_pte;
484 
485 	start_pte = pte = pte_offset_kernel(pmd, addr);
486 	do {
487 		if (!pte_none(*pte)) {
488 			kvm_set_pte(pte, __pte(0));
489 			put_page(virt_to_page(pte));
490 		}
491 	} while (pte++, addr += PAGE_SIZE, addr != end);
492 
493 	if (hyp_pte_table_empty(start_pte))
494 		clear_hyp_pmd_entry(pmd);
495 }
496 
497 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
498 {
499 	phys_addr_t next;
500 	pmd_t *pmd, *start_pmd;
501 
502 	start_pmd = pmd = pmd_offset(pud, addr);
503 	do {
504 		next = pmd_addr_end(addr, end);
505 		/* Hyp doesn't use huge pmds */
506 		if (!pmd_none(*pmd))
507 			unmap_hyp_ptes(pmd, addr, next);
508 	} while (pmd++, addr = next, addr != end);
509 
510 	if (hyp_pmd_table_empty(start_pmd))
511 		clear_hyp_pud_entry(pud);
512 }
513 
514 static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
515 {
516 	phys_addr_t next;
517 	pud_t *pud, *start_pud;
518 
519 	start_pud = pud = pud_offset(pgd, addr);
520 	do {
521 		next = pud_addr_end(addr, end);
522 		/* Hyp doesn't use huge puds */
523 		if (!pud_none(*pud))
524 			unmap_hyp_pmds(pud, addr, next);
525 	} while (pud++, addr = next, addr != end);
526 
527 	if (hyp_pud_table_empty(start_pud))
528 		clear_hyp_pgd_entry(pgd);
529 }
530 
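/*
 * Compute the PGD index for a hyp VA. The number of PGD pointers is
 * passed in explicitly, as the idmap pgd may use a different
 * (extended) size from the regular hyp pgd.
 */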
531 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
532 {
533 	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
534 }
535 
536 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
537 			      phys_addr_t start, u64 size)
538 {
539 	pgd_t *pgd;
540 	phys_addr_t addr = start, end = start + size;
541 	phys_addr_t next;
542 
543 	/*
544 	 * We don't unmap anything from HYP, except at the hyp tear down.
545 	 * Hence, we don't have to invalidate the TLBs here.
546 	 */
547 	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
548 	do {
549 		next = pgd_addr_end(addr, end);
550 		if (!pgd_none(*pgd))
551 			unmap_hyp_puds(pgd, addr, next);
552 	} while (pgd++, addr = next, addr != end);
553 }
554 
555 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
556 {
557 	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
558 }
559 
560 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
561 {
562 	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
563 }
564 
565 /**
566  * free_hyp_pgds - free Hyp-mode page tables
567  *
568  * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
569  * therefore contains either mappings in the kernel memory area (above
570  * PAGE_OFFSET), or device mappings in the idmap range.
571  *
572  * boot_hyp_pgd should only map the idmap range, and is only used in
573  * the extended idmap case.
574  */
575 void free_hyp_pgds(void)
576 {
577 	pgd_t *id_pgd;
578 
579 	mutex_lock(&kvm_hyp_pgd_mutex);
580 
581 	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
582 
583 	if (id_pgd) {
584 		/* In case we never called hyp_mmu_init() */
585 		if (!io_map_base)
586 			io_map_base = hyp_idmap_start;
587 		unmap_hyp_idmap_range(id_pgd, io_map_base,
588 				      hyp_idmap_start + PAGE_SIZE - io_map_base);
589 	}
590 
591 	if (boot_hyp_pgd) {
592 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
593 		boot_hyp_pgd = NULL;
594 	}
595 
596 	if (hyp_pgd) {
597 		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
598 				(uintptr_t)high_memory - PAGE_OFFSET);
599 
600 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
601 		hyp_pgd = NULL;
602 	}
603 	if (merged_hyp_pgd) {
604 		clear_page(merged_hyp_pgd);
605 		free_page((unsigned long)merged_hyp_pgd);
606 		merged_hyp_pgd = NULL;
607 	}
608 
609 	mutex_unlock(&kvm_hyp_pgd_mutex);
610 }
611 
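/*
 * The create_hyp_*_mappings() helpers below walk (and, where needed,
 * populate) the hyp page tables one level at a time, mapping
 * [start, end) to the given pfn range with the requested protection.
 */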
612 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
613 				    unsigned long end, unsigned long pfn,
614 				    pgprot_t prot)
615 {
616 	pte_t *pte;
617 	unsigned long addr;
618 
619 	addr = start;
620 	do {
621 		pte = pte_offset_kernel(pmd, addr);
622 		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
623 		get_page(virt_to_page(pte));
624 		pfn++;
625 	} while (addr += PAGE_SIZE, addr != end);
626 }
627 
628 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
629 				   unsigned long end, unsigned long pfn,
630 				   pgprot_t prot)
631 {
632 	pmd_t *pmd;
633 	pte_t *pte;
634 	unsigned long addr, next;
635 
636 	addr = start;
637 	do {
638 		pmd = pmd_offset(pud, addr);
639 
640 		BUG_ON(pmd_sect(*pmd));
641 
642 		if (pmd_none(*pmd)) {
643 			pte = pte_alloc_one_kernel(NULL);
644 			if (!pte) {
645 				kvm_err("Cannot allocate Hyp pte\n");
646 				return -ENOMEM;
647 			}
648 			kvm_pmd_populate(pmd, pte);
649 			get_page(virt_to_page(pmd));
650 		}
651 
652 		next = pmd_addr_end(addr, end);
653 
654 		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
655 		pfn += (next - addr) >> PAGE_SHIFT;
656 	} while (addr = next, addr != end);
657 
658 	return 0;
659 }
660 
661 static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
662 				   unsigned long end, unsigned long pfn,
663 				   pgprot_t prot)
664 {
665 	pud_t *pud;
666 	pmd_t *pmd;
667 	unsigned long addr, next;
668 	int ret;
669 
670 	addr = start;
671 	do {
672 		pud = pud_offset(pgd, addr);
673 
674 		if (pud_none_or_clear_bad(pud)) {
675 			pmd = pmd_alloc_one(NULL, addr);
676 			if (!pmd) {
677 				kvm_err("Cannot allocate Hyp pmd\n");
678 				return -ENOMEM;
679 			}
680 			kvm_pud_populate(pud, pmd);
681 			get_page(virt_to_page(pud));
682 		}
683 
684 		next = pud_addr_end(addr, end);
685 		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
686 		if (ret)
687 			return ret;
688 		pfn += (next - addr) >> PAGE_SHIFT;
689 	} while (addr = next, addr != end);
690 
691 	return 0;
692 }
693 
694 static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
695 				 unsigned long start, unsigned long end,
696 				 unsigned long pfn, pgprot_t prot)
697 {
698 	pgd_t *pgd;
699 	pud_t *pud;
700 	unsigned long addr, next;
701 	int err = 0;
702 
703 	mutex_lock(&kvm_hyp_pgd_mutex);
704 	addr = start & PAGE_MASK;
705 	end = PAGE_ALIGN(end);
706 	do {
707 		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
708 
709 		if (pgd_none(*pgd)) {
710 			pud = pud_alloc_one(NULL, addr);
711 			if (!pud) {
712 				kvm_err("Cannot allocate Hyp pud\n");
713 				err = -ENOMEM;
714 				goto out;
715 			}
716 			kvm_pgd_populate(pgd, pud);
717 			get_page(virt_to_page(pgd));
718 		}
719 
720 		next = pgd_addr_end(addr, end);
721 		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
722 		if (err)
723 			goto out;
724 		pfn += (next - addr) >> PAGE_SHIFT;
725 	} while (addr = next, addr != end);
726 out:
727 	mutex_unlock(&kvm_hyp_pgd_mutex);
728 	return err;
729 }
730 
731 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
732 {
733 	if (!is_vmalloc_addr(kaddr)) {
734 		BUG_ON(!virt_addr_valid(kaddr));
735 		return __pa(kaddr);
736 	} else {
737 		return page_to_phys(vmalloc_to_page(kaddr)) +
738 		       offset_in_page(kaddr);
739 	}
740 }
741 
742 /**
743  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
744  * @from:	The virtual kernel start address of the range
745  * @to:		The virtual kernel end address of the range (exclusive)
746  * @prot:	The protection to be applied to this range
747  *
748  * The same virtual address as the kernel virtual address is also used
749  * in the Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to map the same
750  * underlying physical pages.
751  */
752 int create_hyp_mappings(void *from, void *to, pgprot_t prot)
753 {
754 	phys_addr_t phys_addr;
755 	unsigned long virt_addr;
756 	unsigned long start = kern_hyp_va((unsigned long)from);
757 	unsigned long end = kern_hyp_va((unsigned long)to);
758 
759 	if (is_kernel_in_hyp_mode())
760 		return 0;
761 
762 	start = start & PAGE_MASK;
763 	end = PAGE_ALIGN(end);
764 
765 	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
766 		int err;
767 
768 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
769 		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
770 					    virt_addr, virt_addr + PAGE_SIZE,
771 					    __phys_to_pfn(phys_addr),
772 					    prot);
773 		if (err)
774 			return err;
775 	}
776 
777 	return 0;
778 }
779 
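/*
 * Allocate a private hyp VA range just below io_map_base and map
 * @size bytes starting at @phys_addr into it with @prot. On success,
 * the hyp VA of the mapping (including the page offset of @phys_addr)
 * is returned in *haddr.
 */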
780 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
781 					unsigned long *haddr, pgprot_t prot)
782 {
783 	pgd_t *pgd = hyp_pgd;
784 	unsigned long base;
785 	int ret = 0;
786 
787 	mutex_lock(&kvm_hyp_pgd_mutex);
788 
789 	/*
790 	 * This assumes that we have enough space below the idmap
791 	 * page to allocate our VAs. If not, the check below will
792 	 * kick in. A potential alternative would be to detect that
793 	 * overflow and switch to an allocation above the idmap.
794 	 *
795 	 * The allocated size is always a multiple of PAGE_SIZE.
796 	 */
797 	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
798 	base = io_map_base - size;
799 
800 	/*
801 	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
802 	 * allocating the new area, as it would indicate we've
803 	 * overflowed the idmap/IO address range.
804 	 */
805 	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
806 		ret = -ENOMEM;
807 	else
808 		io_map_base = base;
809 
810 	mutex_unlock(&kvm_hyp_pgd_mutex);
811 
812 	if (ret)
813 		goto out;
814 
815 	if (__kvm_cpu_uses_extended_idmap())
816 		pgd = boot_hyp_pgd;
817 
818 	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
819 				    base, base + size,
820 				    __phys_to_pfn(phys_addr), prot);
821 	if (ret)
822 		goto out;
823 
824 	*haddr = base + offset_in_page(phys_addr);
825 
826 out:
827 	return ret;
828 }
829 
830 /**
831  * create_hyp_io_mappings - Map IO into both kernel and HYP
832  * @phys_addr:	The physical start address which gets mapped
833  * @size:	Size of the region being mapped
834  * @kaddr:	Kernel VA for this mapping
835  * @haddr:	HYP VA for this mapping
836  */
837 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
838 			   void __iomem **kaddr,
839 			   void __iomem **haddr)
840 {
841 	unsigned long addr;
842 	int ret;
843 
844 	*kaddr = ioremap(phys_addr, size);
845 	if (!*kaddr)
846 		return -ENOMEM;
847 
848 	if (is_kernel_in_hyp_mode()) {
849 		*haddr = *kaddr;
850 		return 0;
851 	}
852 
853 	ret = __create_hyp_private_mapping(phys_addr, size,
854 					   &addr, PAGE_HYP_DEVICE);
855 	if (ret) {
856 		iounmap(*kaddr);
857 		*kaddr = NULL;
858 		*haddr = NULL;
859 		return ret;
860 	}
861 
862 	*haddr = (void __iomem *)addr;
863 	return 0;
864 }
865 
866 /**
867  * create_hyp_exec_mappings - Map an executable range into HYP
868  * @phys_addr:	The physical start address which gets mapped
869  * @size:	Size of the region being mapped
870  * @haddr:	HYP VA for this mapping
871  */
872 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
873 			     void **haddr)
874 {
875 	unsigned long addr;
876 	int ret;
877 
878 	BUG_ON(is_kernel_in_hyp_mode());
879 
880 	ret = __create_hyp_private_mapping(phys_addr, size,
881 					   &addr, PAGE_HYP_EXEC);
882 	if (ret) {
883 		*haddr = NULL;
884 		return ret;
885 	}
886 
887 	*haddr = (void *)addr;
888 	return 0;
889 }
890 
891 /**
892  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
893  * @kvm:	The KVM struct pointer for the VM.
894  *
895  * Allocates only the stage-2 HW PGD level table(s) of size defined by
896  * stage2_pgd_size(kvm).
897  *
898  * Note we don't need locking here as this is only called when the VM is
899  * created, which can only be done once.
900  */
901 int kvm_alloc_stage2_pgd(struct kvm *kvm)
902 {
903 	phys_addr_t pgd_phys;
904 	pgd_t *pgd;
905 
906 	if (kvm->arch.pgd != NULL) {
907 		kvm_err("kvm_arch already initialized?\n");
908 		return -EINVAL;
909 	}
910 
911 	/* Allocate the HW PGD, making sure that each page gets its own refcount */
912 	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
913 	if (!pgd)
914 		return -ENOMEM;
915 
916 	pgd_phys = virt_to_phys(pgd);
917 	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
918 		return -EINVAL;
919 
920 	kvm->arch.pgd = pgd;
921 	kvm->arch.pgd_phys = pgd_phys;
922 	return 0;
923 }
924 
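/*
 * Unmap the stage-2 mappings backing a single memslot, skipping any
 * VM_PFNMAP VMAs, which do not map ordinary RAM.
 */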
925 static void stage2_unmap_memslot(struct kvm *kvm,
926 				 struct kvm_memory_slot *memslot)
927 {
928 	hva_t hva = memslot->userspace_addr;
929 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
930 	phys_addr_t size = PAGE_SIZE * memslot->npages;
931 	hva_t reg_end = hva + size;
932 
933 	/*
934 	 * A memory region could potentially cover multiple VMAs, and any holes
935 	 * between them, so iterate over all of them to find out if we should
936 	 * unmap any of them.
937 	 *
938 	 *     +--------------------------------------------+
939 	 * +---------------+----------------+   +----------------+
940 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
941 	 * +---------------+----------------+   +----------------+
942 	 *     |               memory region                |
943 	 *     +--------------------------------------------+
944 	 */
945 	do {
946 		struct vm_area_struct *vma = find_vma(current->mm, hva);
947 		hva_t vm_start, vm_end;
948 
949 		if (!vma || vma->vm_start >= reg_end)
950 			break;
951 
952 		/*
953 		 * Take the intersection of this VMA with the memory region
954 		 */
955 		vm_start = max(hva, vma->vm_start);
956 		vm_end = min(reg_end, vma->vm_end);
957 
958 		if (!(vma->vm_flags & VM_PFNMAP)) {
959 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
960 			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
961 		}
962 		hva = vm_end;
963 	} while (hva < reg_end);
964 }
965 
966 /**
967  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
968  * @kvm: The struct kvm pointer
969  *
970  * Go through the memregions and unmap any regular RAM
971  * backing memory already mapped to the VM.
972  */
973 void stage2_unmap_vm(struct kvm *kvm)
974 {
975 	struct kvm_memslots *slots;
976 	struct kvm_memory_slot *memslot;
977 	int idx;
978 
979 	idx = srcu_read_lock(&kvm->srcu);
980 	down_read(&current->mm->mmap_sem);
981 	spin_lock(&kvm->mmu_lock);
982 
983 	slots = kvm_memslots(kvm);
984 	kvm_for_each_memslot(memslot, slots)
985 		stage2_unmap_memslot(kvm, memslot);
986 
987 	spin_unlock(&kvm->mmu_lock);
988 	up_read(&current->mm->mmap_sem);
989 	srcu_read_unlock(&kvm->srcu, idx);
990 }
991 
992 /**
993  * kvm_free_stage2_pgd - free all stage-2 tables
994  * @kvm:	The KVM struct pointer for the VM.
995  *
996  * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
997  * underlying level-2 and level-3 tables before freeing the actual level-1 table
998  * and setting the struct pointer to NULL.
999  */
1000 void kvm_free_stage2_pgd(struct kvm *kvm)
1001 {
1002 	void *pgd = NULL;
1003 
1004 	spin_lock(&kvm->mmu_lock);
1005 	if (kvm->arch.pgd) {
1006 		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
1007 		pgd = READ_ONCE(kvm->arch.pgd);
1008 		kvm->arch.pgd = NULL;
1009 		kvm->arch.pgd_phys = 0;
1010 	}
1011 	spin_unlock(&kvm->mmu_lock);
1012 
1013 	/* Free the HW pgd, one page at a time */
1014 	if (pgd)
1015 		free_pages_exact(pgd, stage2_pgd_size(kvm));
1016 }
1017 
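/*
 * stage2_get_pud()/stage2_get_pmd() walk the stage-2 tables for @addr,
 * allocating missing intermediate tables from @cache. If no cache is
 * supplied, a missing table makes the walk fail and NULL is returned.
 */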
1018 static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1019 			     phys_addr_t addr)
1020 {
1021 	pgd_t *pgd;
1022 	pud_t *pud;
1023 
1024 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1025 	if (stage2_pgd_none(kvm, *pgd)) {
1026 		if (!cache)
1027 			return NULL;
1028 		pud = mmu_memory_cache_alloc(cache);
1029 		stage2_pgd_populate(kvm, pgd, pud);
1030 		get_page(virt_to_page(pgd));
1031 	}
1032 
1033 	return stage2_pud_offset(kvm, pgd, addr);
1034 }
1035 
1036 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1037 			     phys_addr_t addr)
1038 {
1039 	pud_t *pud;
1040 	pmd_t *pmd;
1041 
1042 	pud = stage2_get_pud(kvm, cache, addr);
1043 	if (!pud || stage2_pud_huge(kvm, *pud))
1044 		return NULL;
1045 
1046 	if (stage2_pud_none(kvm, *pud)) {
1047 		if (!cache)
1048 			return NULL;
1049 		pmd = mmu_memory_cache_alloc(cache);
1050 		stage2_pud_populate(kvm, pud, pmd);
1051 		get_page(virt_to_page(pud));
1052 	}
1053 
1054 	return stage2_pmd_offset(kvm, pud, addr);
1055 }
1056 
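/*
 * Install (or refresh) a huge PMD mapping at @addr, unmapping any
 * stale PTE-level table first and following break-before-make when a
 * block mapping is already present.
 */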
1057 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1058 			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
1059 {
1060 	pmd_t *pmd, old_pmd;
1061 
1062 retry:
1063 	pmd = stage2_get_pmd(kvm, cache, addr);
1064 	VM_BUG_ON(!pmd);
1065 
1066 	old_pmd = *pmd;
1067 	/*
1068 	 * Multiple vcpus faulting on the same PMD entry can
1069 	 * lead to them sequentially updating the PMD with the
1070 	 * same value. Following the break-before-make
1071 	 * (pmd_clear() followed by tlb_flush()) process can
1072 	 * hinder forward progress due to refaults generated
1073 	 * on missing translations.
1074 	 *
1075 	 * Skip updating the page table if the entry is
1076 	 * unchanged.
1077 	 */
1078 	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
1079 		return 0;
1080 
1081 	if (pmd_present(old_pmd)) {
1082 		/*
1083 		 * If we already have PTE level mapping for this block,
1084 		 * we must unmap it to avoid inconsistent TLB state and
1085 		 * leaking the table page. We could end up in this situation
1086 		 * if the memory slot was marked for dirty logging and was
1087 		 * reverted, leaving PTE level mappings for the pages accessed
1088 		 * during the period. So, unmap the PTE level mapping for this
1089 		 * block and retry, as we could have released the upper level
1090 		 * table in the process.
1091 		 *
1092 	 * Normal THP splits and merges follow the mmu_notifier callbacks
1093 	 * and get handled accordingly.
1094 		 */
1095 		if (!pmd_thp_or_huge(old_pmd)) {
1096 			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
1097 			goto retry;
1098 		}
1099 		/*
1100 		 * Mapping in huge pages should only happen through a
1101 		 * fault.  If a page is merged into a transparent huge
1102 		 * page, the individual subpages of that huge page
1103 		 * should be unmapped through MMU notifiers before we
1104 		 * get here.
1105 		 *
1106 	 * Merging of CompoundPages is not supported; they
1107 	 * should be split first, then unmapped, merged,
1108 	 * and mapped back in on demand.
1109 		 */
1110 		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
1111 		pmd_clear(pmd);
1112 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1113 	} else {
1114 		get_page(virt_to_page(pmd));
1115 	}
1116 
1117 	kvm_set_pmd(pmd, *new_pmd);
1118 	return 0;
1119 }
1120 
1121 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1122 			       phys_addr_t addr, const pud_t *new_pudp)
1123 {
1124 	pud_t *pudp, old_pud;
1125 
1126 retry:
1127 	pudp = stage2_get_pud(kvm, cache, addr);
1128 	VM_BUG_ON(!pudp);
1129 
1130 	old_pud = *pudp;
1131 
1132 	/*
1133 	 * A large number of vcpus faulting on the same stage 2 entry
1134 	 * can lead to refaults due to the stage2_pud_clear()/tlb_flush().
1135 	 * Skip updating the page tables if there is no change.
1136 	 */
1137 	if (pud_val(old_pud) == pud_val(*new_pudp))
1138 		return 0;
1139 
1140 	if (stage2_pud_present(kvm, old_pud)) {
1141 		/*
1142 		 * If we already have table level mapping for this block, unmap
1143 		 * the range for this block and retry.
1144 		 */
1145 		if (!stage2_pud_huge(kvm, old_pud)) {
1146 			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
1147 			goto retry;
1148 		}
1149 
1150 		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
1151 		stage2_pud_clear(kvm, pudp);
1152 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1153 	} else {
1154 		get_page(virt_to_page(pudp));
1155 	}
1156 
1157 	kvm_set_pud(pudp, *new_pudp);
1158 	return 0;
1159 }
1160 
1161 /*
1162  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1163  * true if a valid and present leaf-entry is found. A pointer to the
1164  * leaf-entry is returned in the appropriate level variable - pudpp,
1165  * pmdpp, ptepp.
1166  */
1167 static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
1168 				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1169 {
1170 	pud_t *pudp;
1171 	pmd_t *pmdp;
1172 	pte_t *ptep;
1173 
1174 	*pudpp = NULL;
1175 	*pmdpp = NULL;
1176 	*ptepp = NULL;
1177 
1178 	pudp = stage2_get_pud(kvm, NULL, addr);
1179 	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1180 		return false;
1181 
1182 	if (stage2_pud_huge(kvm, *pudp)) {
1183 		*pudpp = pudp;
1184 		return true;
1185 	}
1186 
1187 	pmdp = stage2_pmd_offset(kvm, pudp, addr);
1188 	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1189 		return false;
1190 
1191 	if (pmd_thp_or_huge(*pmdp)) {
1192 		*pmdpp = pmdp;
1193 		return true;
1194 	}
1195 
1196 	ptep = pte_offset_kernel(pmdp, addr);
1197 	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1198 		return false;
1199 
1200 	*ptepp = ptep;
1201 	return true;
1202 }
1203 
1204 static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
1205 {
1206 	pud_t *pudp;
1207 	pmd_t *pmdp;
1208 	pte_t *ptep;
1209 	bool found;
1210 
1211 	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
1212 	if (!found)
1213 		return false;
1214 
1215 	if (pudp)
1216 		return kvm_s2pud_exec(pudp);
1217 	else if (pmdp)
1218 		return kvm_s2pmd_exec(pmdp);
1219 	else
1220 		return kvm_s2pte_exec(ptep);
1221 }
1222 
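/*
 * Install a stage-2 PTE for @addr. Intermediate tables are allocated
 * from @cache as required, huge mappings covering @addr are dissolved
 * when dirty logging is active, and an existing mapping is replaced
 * using break-before-make (clear + TLB invalidate) before the new
 * entry is written.
 */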
1223 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1224 			  phys_addr_t addr, const pte_t *new_pte,
1225 			  unsigned long flags)
1226 {
1227 	pud_t *pud;
1228 	pmd_t *pmd;
1229 	pte_t *pte, old_pte;
1230 	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1231 	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1232 
1233 	VM_BUG_ON(logging_active && !cache);
1234 
1235 	/* Create stage-2 page table mapping - Levels 0 and 1 */
1236 	pud = stage2_get_pud(kvm, cache, addr);
1237 	if (!pud) {
1238 		/*
1239 		 * Ignore calls from kvm_set_spte_hva for unallocated
1240 		 * address ranges.
1241 		 */
1242 		return 0;
1243 	}
1244 
1245 	/*
1246 	 * While dirty page logging - dissolve huge PUD, then continue
1247 	 * on to allocate page.
1248 	 */
1249 	if (logging_active)
1250 		stage2_dissolve_pud(kvm, addr, pud);
1251 
1252 	if (stage2_pud_none(kvm, *pud)) {
1253 		if (!cache)
1254 			return 0; /* ignore calls from kvm_set_spte_hva */
1255 		pmd = mmu_memory_cache_alloc(cache);
1256 		stage2_pud_populate(kvm, pud, pmd);
1257 		get_page(virt_to_page(pud));
1258 	}
1259 
1260 	pmd = stage2_pmd_offset(kvm, pud, addr);
1261 	if (!pmd) {
1262 		/*
1263 		 * Ignore calls from kvm_set_spte_hva for unallocated
1264 		 * address ranges.
1265 		 */
1266 		return 0;
1267 	}
1268 
1269 	/*
1270 	 * While dirty page logging - dissolve huge PMD, then continue on to
1271 	 * allocate page.
1272 	 */
1273 	if (logging_active)
1274 		stage2_dissolve_pmd(kvm, addr, pmd);
1275 
1276 	/* Create stage-2 page mappings - Level 2 */
1277 	if (pmd_none(*pmd)) {
1278 		if (!cache)
1279 			return 0; /* ignore calls from kvm_set_spte_hva */
1280 		pte = mmu_memory_cache_alloc(cache);
1281 		kvm_pmd_populate(pmd, pte);
1282 		get_page(virt_to_page(pmd));
1283 	}
1284 
1285 	pte = pte_offset_kernel(pmd, addr);
1286 
1287 	if (iomap && pte_present(*pte))
1288 		return -EFAULT;
1289 
1290 	/* Create 2nd stage page table mapping - Level 3 */
1291 	old_pte = *pte;
1292 	if (pte_present(old_pte)) {
1293 		/* Skip page table update if there is no change */
1294 		if (pte_val(old_pte) == pte_val(*new_pte))
1295 			return 0;
1296 
1297 		kvm_set_pte(pte, __pte(0));
1298 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1299 	} else {
1300 		get_page(virt_to_page(pte));
1301 	}
1302 
1303 	kvm_set_pte(pte, *new_pte);
1304 	return 0;
1305 }
1306 
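/*
 * Clear the access flag of a stage-2 entry and report whether it was
 * set. The PMD/PUD variants below reuse the PTE helper, relying on
 * the descriptor formats being compatible at each level.
 */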
1307 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1308 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1309 {
1310 	if (pte_young(*pte)) {
1311 		*pte = pte_mkold(*pte);
1312 		return 1;
1313 	}
1314 	return 0;
1315 }
1316 #else
1317 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1318 {
1319 	return __ptep_test_and_clear_young(pte);
1320 }
1321 #endif
1322 
1323 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1324 {
1325 	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1326 }
1327 
1328 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1329 {
1330 	return stage2_ptep_test_and_clear_young((pte_t *)pud);
1331 }
1332 
1333 /**
1334  * kvm_phys_addr_ioremap - map a device range to guest IPA
1335  *
1336  * @kvm:	The KVM pointer
1337  * @guest_ipa:	The IPA at which to insert the mapping
1338  * @pa:		The physical address of the device
1339  * @size:	The size of the mapping
1340  */
1341 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1342 			  phys_addr_t pa, unsigned long size, bool writable)
1343 {
1344 	phys_addr_t addr, end;
1345 	int ret = 0;
1346 	unsigned long pfn;
1347 	struct kvm_mmu_memory_cache cache = { 0, };
1348 
1349 	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1350 	pfn = __phys_to_pfn(pa);
1351 
1352 	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1353 		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1354 
1355 		if (writable)
1356 			pte = kvm_s2pte_mkwrite(pte);
1357 
1358 		ret = mmu_topup_memory_cache(&cache,
1359 					     kvm_mmu_cache_min_pages(kvm),
1360 					     KVM_NR_MEM_OBJS);
1361 		if (ret)
1362 			goto out;
1363 		spin_lock(&kvm->mmu_lock);
1364 		ret = stage2_set_pte(kvm, &cache, addr, &pte,
1365 						KVM_S2PTE_FLAG_IS_IOMAP);
1366 		spin_unlock(&kvm->mmu_lock);
1367 		if (ret)
1368 			goto out;
1369 
1370 		pfn++;
1371 	}
1372 
1373 out:
1374 	mmu_free_memory_cache(&cache);
1375 	return ret;
1376 }
1377 
1378 /**
1379  * stage2_wp_ptes - write protect PMD range
1380  * @pmd:	pointer to pmd entry
1381  * @addr:	range start address
1382  * @end:	range end address
1383  */
1384 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1385 {
1386 	pte_t *pte;
1387 
1388 	pte = pte_offset_kernel(pmd, addr);
1389 	do {
1390 		if (!pte_none(*pte)) {
1391 			if (!kvm_s2pte_readonly(pte))
1392 				kvm_set_s2pte_readonly(pte);
1393 		}
1394 	} while (pte++, addr += PAGE_SIZE, addr != end);
1395 }
1396 
1397 /**
1398  * stage2_wp_pmds - write protect PUD range
1399  * @kvm:	kvm instance for the VM
1400  * @pud:	pointer to pud entry
1401  * @addr:	range start address
1402  * @end:	range end address
1403  */
1404 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
1405 			   phys_addr_t addr, phys_addr_t end)
1406 {
1407 	pmd_t *pmd;
1408 	phys_addr_t next;
1409 
1410 	pmd = stage2_pmd_offset(kvm, pud, addr);
1411 
1412 	do {
1413 		next = stage2_pmd_addr_end(kvm, addr, end);
1414 		if (!pmd_none(*pmd)) {
1415 			if (pmd_thp_or_huge(*pmd)) {
1416 				if (!kvm_s2pmd_readonly(pmd))
1417 					kvm_set_s2pmd_readonly(pmd);
1418 			} else {
1419 				stage2_wp_ptes(pmd, addr, next);
1420 			}
1421 		}
1422 	} while (pmd++, addr = next, addr != end);
1423 }
1424 
1425 /**
1426  * stage2_wp_puds - write protect PGD range
1427  * @pgd:	pointer to pgd entry
1428  * @addr:	range start address
1429  * @end:	range end address
1430  */
1431 static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
1432 			    phys_addr_t addr, phys_addr_t end)
1433 {
1434 	pud_t *pud;
1435 	phys_addr_t next;
1436 
1437 	pud = stage2_pud_offset(kvm, pgd, addr);
1438 	do {
1439 		next = stage2_pud_addr_end(kvm, addr, end);
1440 		if (!stage2_pud_none(kvm, *pud)) {
1441 			if (stage2_pud_huge(kvm, *pud)) {
1442 				if (!kvm_s2pud_readonly(pud))
1443 					kvm_set_s2pud_readonly(pud);
1444 			} else {
1445 				stage2_wp_pmds(kvm, pud, addr, next);
1446 			}
1447 		}
1448 	} while (pud++, addr = next, addr != end);
1449 }
1450 
1451 /**
1452  * stage2_wp_range() - write protect stage2 memory region range
1453  * @kvm:	The KVM pointer
1454  * @addr:	Start address of range
1455  * @end:	End address of range
1456  */
1457 static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1458 {
1459 	pgd_t *pgd;
1460 	phys_addr_t next;
1461 
1462 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1463 	do {
1464 		/*
1465 		 * Release kvm_mmu_lock periodically if the memory region is
1466 		 * large. Otherwise, we may see kernel panics with
1467 		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1468 		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1469 		 * will also starve other vCPUs. We also have to make sure
1470 		 * that the page tables are not freed while we release
1471 		 * the lock.
1472 		 */
1473 		cond_resched_lock(&kvm->mmu_lock);
1474 		if (!READ_ONCE(kvm->arch.pgd))
1475 			break;
1476 		next = stage2_pgd_addr_end(kvm, addr, end);
1477 		if (stage2_pgd_present(kvm, *pgd))
1478 			stage2_wp_puds(kvm, pgd, addr, next);
1479 	} while (pgd++, addr = next, addr != end);
1480 }
1481 
1482 /**
1483  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1484  * @kvm:	The KVM pointer
1485  * @slot:	The memory slot to write protect
1486  *
1487  * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
1488  * memory region operation is called. After this function returns, all
1489  * present PUDs, PMDs and PTEs in the memory region are write protected,
1490  * and the dirty page log can then be read.
1491  *
1492  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1493  * serializing operations for VM memory regions.
1494  */
1495 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1496 {
1497 	struct kvm_memslots *slots = kvm_memslots(kvm);
1498 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1499 	phys_addr_t start, end;
1500 
1501 	if (WARN_ON_ONCE(!memslot))
1502 		return;
1503 
1504 	start = memslot->base_gfn << PAGE_SHIFT;
1505 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1506 
1507 	spin_lock(&kvm->mmu_lock);
1508 	stage2_wp_range(kvm, start, end);
1509 	spin_unlock(&kvm->mmu_lock);
1510 	kvm_flush_remote_tlbs(kvm);
1511 }
1512 
1513 /**
1514  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1515  * @kvm:	The KVM pointer
1516  * @slot:	The memory slot associated with mask
1517  * @gfn_offset:	The gfn offset in memory slot
1518  * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1519  *		slot to be write protected
1520  *
1521  * Walks the bits set in mask and write protects the associated ptes. The
1522  * caller must acquire kvm_mmu_lock.
1523  */
1524 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1525 		struct kvm_memory_slot *slot,
1526 		gfn_t gfn_offset, unsigned long mask)
1527 {
1528 	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1529 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1530 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1531 
1532 	stage2_wp_range(kvm, start, end);
1533 }
1534 
1535 /*
1536  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1537  * dirty pages.
1538  *
1539  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1540  * enable dirty logging for them.
1541  */
1542 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1543 		struct kvm_memory_slot *slot,
1544 		gfn_t gfn_offset, unsigned long mask)
1545 {
1546 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1547 }
1548 
1549 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1550 {
1551 	__clean_dcache_guest_page(pfn, size);
1552 }
1553 
1554 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1555 {
1556 	__invalidate_icache_guest_page(pfn, size);
1557 }
1558 
1559 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1560 {
1561 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1562 }
1563 
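/**
 * fault_supports_stage2_huge_mapping() - check if a block mapping can be used
 * @memslot:	memslot the fault lies in
 * @hva:	userspace address of the fault
 * @map_size:	size of the candidate block mapping
 *
 * A block mapping is only usable if the HVA and IPA share the same
 * alignment within the block and the block lies entirely within the
 * memslot; see the detailed comments in the function body.
 */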
1564 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1565 					       unsigned long hva,
1566 					       unsigned long map_size)
1567 {
1568 	gpa_t gpa_start;
1569 	hva_t uaddr_start, uaddr_end;
1570 	size_t size;
1571 
1572 	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1573 	if (map_size == PAGE_SIZE)
1574 		return true;
1575 
1576 	size = memslot->npages * PAGE_SIZE;
1577 
1578 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
1579 
1580 	uaddr_start = memslot->userspace_addr;
1581 	uaddr_end = uaddr_start + size;
1582 
1583 	/*
1584 	 * Pages belonging to memslots that don't have the same alignment
1585 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1586 	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1587 	 *
1588 	 * Consider a layout like the following:
1589 	 *
1590 	 *    memslot->userspace_addr:
1591 	 *    +-----+--------------------+--------------------+---+
1592 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1593 	 *    +-----+--------------------+--------------------+---+
1594 	 *
1595 	 *    memslot->base_gfn << PAGE_SHIFT:
1596 	 *      +---+--------------------+--------------------+-----+
1597 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1598 	 *      +---+--------------------+--------------------+-----+
1599 	 *
1600 	 * If we create those stage-2 blocks, we'll end up with this incorrect
1601 	 * mapping:
1602 	 *   d -> f
1603 	 *   e -> g
1604 	 *   f -> h
1605 	 */
1606 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1607 		return false;
1608 
1609 	/*
1610 	 * Next, let's make sure we're not trying to map anything not covered
1611 	 * by the memslot. This means we have to prohibit block size mappings
1612 	 * for the beginning and end of a non-block aligned and non-block sized
1613 	 * memory slot (illustrated by the head and tail parts of the
1614 	 * userspace view above containing pages 'abcde' and 'xyz',
1615 	 * respectively).
1616 	 *
1617 	 * Note that it doesn't matter if we do the check using the
1618 	 * userspace_addr or the base_gfn, as both are equally aligned (per
1619 	 * the check above) and equally sized.
1620 	 */
1621 	return (hva & ~(map_size - 1)) >= uaddr_start &&
1622 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1623 }
1624 
1625 /*
1626  * Check if the given hva is backed by a transparent huge page (THP) and
1627  * whether it can be mapped using block mapping in stage2. If so, adjust
1628  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1629  * supported. This will need to be updated to support other THP sizes.
1630  *
1631  * Returns the size of the mapping.
1632  */
1633 static unsigned long
1634 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1635 			    unsigned long hva, kvm_pfn_t *pfnp,
1636 			    phys_addr_t *ipap)
1637 {
1638 	kvm_pfn_t pfn = *pfnp;
1639 
1640 	/*
1641 	 * Make sure the adjustment is done only for THP pages. Also make
1642 	 * sure that the HVA and IPA are sufficiently aligned and that the
1643 	 * block map is contained within the memslot.
1644 	 */
1645 	if (kvm_is_transparent_hugepage(pfn) &&
1646 	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1647 		/*
1648 		 * The address we faulted on is backed by a transparent huge
1649 		 * page.  However, because we map the compound huge page and
1650 		 * not the individual tail page, we need to transfer the
1651 		 * refcount to the head page.  We have to be careful that the
1652 		 * THP doesn't start to split while we are adjusting the
1653 		 * refcounts.
1654 		 *
1655 		 * We are sure this doesn't happen, because mmu_notifier_retry
1656 		 * was successful and we are holding the mmu_lock, so if this
1657 		 * THP is trying to split, it will be blocked in the mmu
1658 		 * notifier before touching any of the pages, specifically
1659 		 * before being able to call __split_huge_page_refcount().
1660 		 *
1661 		 * We can therefore safely transfer the refcount from PG_tail
1662 		 * to PG_head and switch the pfn from a tail page to the head
1663 		 * page accordingly.
1664 		 */
1665 		*ipap &= PMD_MASK;
1666 		kvm_release_pfn_clean(pfn);
1667 		pfn &= ~(PTRS_PER_PMD - 1);
1668 		kvm_get_pfn(pfn);
1669 		*pfnp = pfn;
1670 
1671 		return PMD_SIZE;
1672 	}
1673 
1674 	/* Use page mapping if we cannot use block mapping. */
1675 	return PAGE_SIZE;
1676 }
1677 
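/*
 * user_mem_abort() resolves a stage-2 fault on memory backed by a
 * memslot: it determines the largest usable mapping size, pins the
 * page with gfn_to_pfn_prot(), performs the required dcache/icache
 * maintenance and installs the PUD/PMD/PTE under the mmu_lock,
 * re-checking mmu_notifier_seq to avoid racing with MMU notifiers.
 */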
1678 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1679 			  struct kvm_memory_slot *memslot, unsigned long hva,
1680 			  unsigned long fault_status)
1681 {
1682 	int ret;
1683 	bool write_fault, writable, force_pte = false;
1684 	bool exec_fault, needs_exec;
1685 	unsigned long mmu_seq;
1686 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1687 	struct kvm *kvm = vcpu->kvm;
1688 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1689 	struct vm_area_struct *vma;
1690 	short vma_shift;
1691 	kvm_pfn_t pfn;
1692 	pgprot_t mem_type = PAGE_S2;
1693 	bool logging_active = memslot_is_logging(memslot);
1694 	unsigned long vma_pagesize, flags = 0;
1695 
1696 	write_fault = kvm_is_write_fault(vcpu);
1697 	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1698 	VM_BUG_ON(write_fault && exec_fault);
1699 
1700 	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1701 		kvm_err("Unexpected L2 read permission error\n");
1702 		return -EFAULT;
1703 	}
1704 
1705 	/* Let's check if we will get back a huge page backed by hugetlbfs */
1706 	down_read(&current->mm->mmap_sem);
1707 	vma = find_vma_intersection(current->mm, hva, hva + 1);
1708 	if (unlikely(!vma)) {
1709 		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1710 		up_read(&current->mm->mmap_sem);
1711 		return -EFAULT;
1712 	}
1713 
1714 	if (is_vm_hugetlb_page(vma))
1715 		vma_shift = huge_page_shift(hstate_vma(vma));
1716 	else
1717 		vma_shift = PAGE_SHIFT;
1718 
1719 	vma_pagesize = 1ULL << vma_shift;
1720 	if (logging_active ||
1721 	    (vma->vm_flags & VM_PFNMAP) ||
1722 	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1723 		force_pte = true;
1724 		vma_pagesize = PAGE_SIZE;
1725 	}
1726 
1727 	/*
1728 	 * The stage2 page tables have a minimum of 2 levels (for arm64 see
1729 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1730 	 * use PMD_SIZE huge mappings (even when the PMD is folded into the PGD).
1731 	 * As for PUD huge mappings, we must make sure that we have at least
1732 	 * 3 levels, i.e., that the PMD is not folded.
1733 	 */
1734 	if (vma_pagesize == PMD_SIZE ||
1735 	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1736 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1737 	up_read(&current->mm->mmap_sem);
1738 
1739 	/* We need minimum second+third level pages */
1740 	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
1741 				     KVM_NR_MEM_OBJS);
1742 	if (ret)
1743 		return ret;
1744 
1745 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1746 	/*
1747 	 * Ensure the read of mmu_notifier_seq happens before we call
1748 	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1749 	 * the page we just got a reference to getting unmapped before we have
1750 	 * a chance to grab the mmu_lock, which ensures that if the page gets
1751 	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1752 	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1753 	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1754 	 */
1755 	smp_rmb();
1756 
1757 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1758 	if (pfn == KVM_PFN_ERR_HWPOISON) {
1759 		kvm_send_hwpoison_signal(hva, vma_shift);
1760 		return 0;
1761 	}
1762 	if (is_error_noslot_pfn(pfn))
1763 		return -EFAULT;
1764 
1765 	if (kvm_is_device_pfn(pfn)) {
1766 		mem_type = PAGE_S2_DEVICE;
1767 		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1768 	} else if (logging_active) {
1769 		/*
1770 		 * Pages in a memslot with logging enabled should not be
1771 		 * mapped with huge pages (this introduces churn and
1772 		 * performance degradation), so force a pte mapping.
1773 		 */
1774 		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1775 
1776 		/*
1777 		 * Only actually map the page as writable if this was a write
1778 		 * fault.
1779 		 */
1780 		if (!write_fault)
1781 			writable = false;
1782 	}
1783 
1784 	if (exec_fault && is_iomap(flags))
1785 		return -ENOEXEC;
1786 
1787 	spin_lock(&kvm->mmu_lock);
1788 	if (mmu_notifier_retry(kvm, mmu_seq))
1789 		goto out_unlock;
1790 
1791 	/*
1792 	 * If we are not forced to use page mapping, check if we are
1793 	 * backed by a THP and thus use block mapping if possible.
1794 	 */
1795 	if (vma_pagesize == PAGE_SIZE && !force_pte)
1796 		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1797 							   &pfn, &fault_ipa);
1798 	if (writable)
1799 		kvm_set_pfn_dirty(pfn);
1800 
1801 	if (fault_status != FSC_PERM && !is_iomap(flags))
1802 		clean_dcache_guest_page(pfn, vma_pagesize);
1803 
1804 	if (exec_fault)
1805 		invalidate_icache_guest_page(pfn, vma_pagesize);
1806 
1807 	/*
1808 	 * If we took an execution fault we have made the
1809 	 * icache/dcache coherent above and should now let the s2
1810 	 * mapping be executable.
1811 	 *
1812 	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1813 	 * execute permissions, and we preserve whatever we have.
1814 	 */
1815 	needs_exec = exec_fault ||
1816 		(fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
1817 
1818 	if (vma_pagesize == PUD_SIZE) {
1819 		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1820 
1821 		new_pud = kvm_pud_mkhuge(new_pud);
1822 		if (writable)
1823 			new_pud = kvm_s2pud_mkwrite(new_pud);
1824 
1825 		if (needs_exec)
1826 			new_pud = kvm_s2pud_mkexec(new_pud);
1827 
1828 		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
1829 	} else if (vma_pagesize == PMD_SIZE) {
1830 		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1831 
1832 		new_pmd = kvm_pmd_mkhuge(new_pmd);
1833 
1834 		if (writable)
1835 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1836 
1837 		if (needs_exec)
1838 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
1839 
1840 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1841 	} else {
1842 		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1843 
1844 		if (writable) {
1845 			new_pte = kvm_s2pte_mkwrite(new_pte);
1846 			mark_page_dirty(kvm, gfn);
1847 		}
1848 
1849 		if (needs_exec)
1850 			new_pte = kvm_s2pte_mkexec(new_pte);
1851 
1852 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1853 	}
1854 
1855 out_unlock:
1856 	spin_unlock(&kvm->mmu_lock);
1857 	kvm_set_pfn_accessed(pfn);
1858 	kvm_release_pfn_clean(pfn);
1859 	return ret;
1860 }
1861 
1862 /*
1863  * Resolve the access fault by making the page young again.
1864  * Note that because the faulting entry is guaranteed not to be
1865  * cached in the TLB, we don't need to invalidate anything.
1866  * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1867  * so there is no need for atomic (pte|pmd)_mkyoung operations.
1868  */
1869 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1870 {
1871 	pud_t *pud;
1872 	pmd_t *pmd;
1873 	pte_t *pte;
1874 	kvm_pfn_t pfn;
1875 	bool pfn_valid = false;
1876 
1877 	trace_kvm_access_fault(fault_ipa);
1878 
1879 	spin_lock(&vcpu->kvm->mmu_lock);
1880 
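	/* Locate the stage-2 leaf entry (PUD, PMD or PTE) mapping this IPA. */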
1881 	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
1882 		goto out;
1883 
1884 	if (pud) {		/* HugeTLB */
1885 		*pud = kvm_s2pud_mkyoung(*pud);
1886 		pfn = kvm_pud_pfn(*pud);
1887 		pfn_valid = true;
1888 	} else	if (pmd) {	/* THP, HugeTLB */
1889 		*pmd = pmd_mkyoung(*pmd);
1890 		pfn = pmd_pfn(*pmd);
1891 		pfn_valid = true;
1892 	} else {
1893 		*pte = pte_mkyoung(*pte);	/* Just a page... */
1894 		pfn = pte_pfn(*pte);
1895 		pfn_valid = true;
1896 	}
1897 
1898 out:
1899 	spin_unlock(&vcpu->kvm->mmu_lock);
1900 	if (pfn_valid)
1901 		kvm_set_pfn_accessed(pfn);
1902 }
1903 
1904 /**
1905  * kvm_handle_guest_abort - handles all 2nd stage aborts
1906  * @vcpu:	the VCPU pointer
1907  * @run:	the kvm_run structure
1908  *
1909  * Any abort that gets to the host is almost guaranteed to be caused by a
1910  * missing second stage translation table entry, which can mean either that
1911  * the guest simply needs more memory and we must allocate an appropriate
1912  * page, or that the guest tried to access I/O memory, which is emulated by
1913  * user space. The distinction is based on the IPA causing the fault and
1914  * whether this memory region has been registered as standard RAM by user space.
1915  */
1916 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1917 {
1918 	unsigned long fault_status;
1919 	phys_addr_t fault_ipa;
1920 	struct kvm_memory_slot *memslot;
1921 	unsigned long hva;
1922 	bool is_iabt, write_fault, writable;
1923 	gfn_t gfn;
1924 	int ret, idx;
1925 
1926 	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1927 
1928 	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1929 	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1930 
1931 	/* Synchronous External Abort? */
1932 	if (kvm_vcpu_dabt_isextabt(vcpu)) {
1933 		/*
1934 		 * For RAS the host kernel may handle this abort.
1935 		 * There is no need to pass the error into the guest.
1936 		 */
1937 		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
1938 			return 1;
1939 
1940 		if (unlikely(!is_iabt)) {
1941 			kvm_inject_vabt(vcpu);
1942 			return 1;
1943 		}
1944 	}
1945 
1946 	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1947 			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
1948 
1949 	/* Check that the stage-2 fault is a translation, permission or access fault */
1950 	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1951 	    fault_status != FSC_ACCESS) {
1952 		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1953 			kvm_vcpu_trap_get_class(vcpu),
1954 			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1955 			(unsigned long)kvm_vcpu_get_hsr(vcpu));
1956 		return -EFAULT;
1957 	}
1958 
1959 	idx = srcu_read_lock(&vcpu->kvm->srcu);
1960 
1961 	gfn = fault_ipa >> PAGE_SHIFT;
1962 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
1963 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1964 	write_fault = kvm_is_write_fault(vcpu);
1965 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1966 		if (is_iabt) {
1967 			/* Prefetch Abort on I/O address */
1968 			ret = -ENOEXEC;
1969 			goto out;
1970 		}
1971 
1972 		/*
1973 		 * Check for a cache maintenance operation. Since we
1974 		 * ended-up here, we know it is outside of any memory
1975 		 * slot. But we can't find out if that is for a device,
1976 		 * or if the guest is just being stupid. The only thing
1977 		 * we know for sure is that this range cannot be cached.
1978 		 *
1979 		 * So let's assume that the guest is just being
1980 		 * cautious, and skip the instruction.
1981 		 */
1982 		if (kvm_vcpu_dabt_is_cm(vcpu)) {
1983 			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1984 			ret = 1;
1985 			goto out_unlock;
1986 		}
1987 
1988 		/*
1989 		 * The IPA is reported as [MAX:12], so we need to
1990 		 * complement it with the bottom 12 bits from the
1991 		 * faulting VA. This is always 12 bits, irrespective
1992 		 * of the page size.
1993 		 */
1994 		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1995 		ret = io_mem_abort(vcpu, run, fault_ipa);
1996 		goto out_unlock;
1997 	}
1998 
1999 	/* Userspace should not be able to register out-of-bounds IPAs */
2000 	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
2001 
2002 	if (fault_status == FSC_ACCESS) {
2003 		handle_access_fault(vcpu, fault_ipa);
2004 		ret = 1;
2005 		goto out_unlock;
2006 	}
2007 
2008 	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
2009 	if (ret == 0)
2010 		ret = 1;
2011 out:
2012 	if (ret == -ENOEXEC) {
2013 		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2014 		ret = 1;
2015 	}
2016 out_unlock:
2017 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2018 	return ret;
2019 }
2020 
2021 static int handle_hva_to_gpa(struct kvm *kvm,
2022 			     unsigned long start,
2023 			     unsigned long end,
2024 			     int (*handler)(struct kvm *kvm,
2025 					    gpa_t gpa, u64 size,
2026 					    void *data),
2027 			     void *data)
2028 {
2029 	struct kvm_memslots *slots;
2030 	struct kvm_memory_slot *memslot;
2031 	int ret = 0;
2032 
2033 	slots = kvm_memslots(kvm);
2034 
2035 	/* we only care about the pages that the guest sees */
2036 	kvm_for_each_memslot(memslot, slots) {
2037 		unsigned long hva_start, hva_end;
2038 		gfn_t gpa;
2039 
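		/*
		 * Clamp [start, end) to the part of the HVA range that this
		 * memslot actually covers; skip the slot if there is no overlap.
		 */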
2040 		hva_start = max(start, memslot->userspace_addr);
2041 		hva_end = min(end, memslot->userspace_addr +
2042 					(memslot->npages << PAGE_SHIFT));
2043 		if (hva_start >= hva_end)
2044 			continue;
2045 
2046 		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
2047 		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
2048 	}
2049 
2050 	return ret;
2051 }
2052 
2053 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2054 {
2055 	unmap_stage2_range(kvm, gpa, size);
2056 	return 0;
2057 }
2058 
2059 int kvm_unmap_hva_range(struct kvm *kvm,
2060 			unsigned long start, unsigned long end)
2061 {
2062 	if (!kvm->arch.pgd)
2063 		return 0;
2064 
2065 	trace_kvm_unmap_hva_range(start, end);
2066 	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
2067 	return 0;
2068 }
2069 
2070 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2071 {
2072 	pte_t *pte = (pte_t *)data;
2073 
2074 	WARN_ON(size != PAGE_SIZE);
2075 	/*
2076 	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
2077 	 * flag clear because MMU notifiers will have unmapped a huge PMD before
2078 	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2079 	 * therefore stage2_set_pte() never needs to clear out a huge PMD
2080 	 * through this calling path.
2081 	 */
2082 	stage2_set_pte(kvm, NULL, gpa, pte, 0);
2083 	return 0;
2084 }
2085 
2086 
2087 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2088 {
2089 	unsigned long end = hva + PAGE_SIZE;
2090 	kvm_pfn_t pfn = pte_pfn(pte);
2091 	pte_t stage2_pte;
2092 
2093 	if (!kvm->arch.pgd)
2094 		return 0;
2095 
2096 	trace_kvm_set_spte_hva(hva);
2097 
2098 	/*
2099 	 * We've moved a page around, probably through CoW, so let's treat it
2100 	 * just like a translation fault and clean the cache to the PoC.
2101 	 */
2102 	clean_dcache_guest_page(pfn, PAGE_SIZE);
2103 	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
2104 	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2105 
2106 	return 0;
2107 }
2108 
2109 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2110 {
2111 	pud_t *pud;
2112 	pmd_t *pmd;
2113 	pte_t *pte;
2114 
2115 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2116 	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2117 		return 0;
2118 
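	/* Test and clear the access flag on whichever leaf entry maps this GPA. */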
2119 	if (pud)
2120 		return stage2_pudp_test_and_clear_young(pud);
2121 	else if (pmd)
2122 		return stage2_pmdp_test_and_clear_young(pmd);
2123 	else
2124 		return stage2_ptep_test_and_clear_young(pte);
2125 }
2126 
2127 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2128 {
2129 	pud_t *pud;
2130 	pmd_t *pmd;
2131 	pte_t *pte;
2132 
2133 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2134 	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2135 		return 0;
2136 
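	/* Only report the access flag state; do not modify the entry. */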
2137 	if (pud)
2138 		return kvm_s2pud_young(*pud);
2139 	else if (pmd)
2140 		return pmd_young(*pmd);
2141 	else
2142 		return pte_young(*pte);
2143 }
2144 
2145 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2146 {
2147 	if (!kvm->arch.pgd)
2148 		return 0;
2149 	trace_kvm_age_hva(start, end);
2150 	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2151 }
2152 
2153 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2154 {
2155 	if (!kvm->arch.pgd)
2156 		return 0;
2157 	trace_kvm_test_age_hva(hva);
2158 	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2159 				 kvm_test_age_hva_handler, NULL);
2160 }
2161 
2162 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2163 {
2164 	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
2165 }
2166 
2167 phys_addr_t kvm_mmu_get_httbr(void)
2168 {
2169 	if (__kvm_cpu_uses_extended_idmap())
2170 		return virt_to_phys(merged_hyp_pgd);
2171 	else
2172 		return virt_to_phys(hyp_pgd);
2173 }
2174 
2175 phys_addr_t kvm_get_idmap_vector(void)
2176 {
2177 	return hyp_idmap_vector;
2178 }
2179 
2180 static int kvm_map_idmap_text(pgd_t *pgd)
2181 {
2182 	int err;
2183 
2184 	/* Create the idmap in the boot page tables */
2185 	err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
2186 				      hyp_idmap_start, hyp_idmap_end,
2187 				      __phys_to_pfn(hyp_idmap_start),
2188 				      PAGE_HYP_EXEC);
2189 	if (err)
2190 		kvm_err("Failed to idmap %lx-%lx\n",
2191 			hyp_idmap_start, hyp_idmap_end);
2192 
2193 	return err;
2194 }
2195 
2196 int kvm_mmu_init(void)
2197 {
2198 	int err;
2199 
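	/* Compute the page-aligned physical extent of the HYP idmap text. */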
2200 	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2201 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2202 	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2203 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2204 	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2205 
2206 	/*
2207 	 * We rely on the linker script to ensure at build time that the HYP
2208 	 * init code does not cross a page boundary.
2209 	 */
2210 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2211 
2212 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2213 	kvm_debug("HYP VA range: %lx:%lx\n",
2214 		  kern_hyp_va(PAGE_OFFSET),
2215 		  kern_hyp_va((unsigned long)high_memory - 1));
2216 
2217 	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2218 	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2219 	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2220 		/*
2221 		 * The idmap page intersects with the HYP VA space;
2222 		 * it is not safe to continue.
2223 		 */
2224 		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2225 		err = -EINVAL;
2226 		goto out;
2227 	}
2228 
2229 	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
2230 	if (!hyp_pgd) {
2231 		kvm_err("Hyp mode PGD not allocated\n");
2232 		err = -ENOMEM;
2233 		goto out;
2234 	}
2235 
2236 	if (__kvm_cpu_uses_extended_idmap()) {
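		/*
		 * With an extended idmap, the idmap lives in a dedicated boot
		 * PGD, which is merged with the runtime HYP PGD below.
		 */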
2237 		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2238 							 hyp_pgd_order);
2239 		if (!boot_hyp_pgd) {
2240 			kvm_err("Hyp boot PGD not allocated\n");
2241 			err = -ENOMEM;
2242 			goto out;
2243 		}
2244 
2245 		err = kvm_map_idmap_text(boot_hyp_pgd);
2246 		if (err)
2247 			goto out;
2248 
2249 		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
2250 		if (!merged_hyp_pgd) {
2251 			kvm_err("Failed to allocate extra HYP pgd\n");
			err = -ENOMEM;
2252 			goto out;
2253 		}
2254 		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
2255 				    hyp_idmap_start);
2256 	} else {
2257 		err = kvm_map_idmap_text(hyp_pgd);
2258 		if (err)
2259 			goto out;
2260 	}
2261 
2262 	io_map_base = hyp_idmap_start;
2263 	return 0;
2264 out:
2265 	free_hyp_pgds();
2266 	return err;
2267 }
2268 
2269 void kvm_arch_commit_memory_region(struct kvm *kvm,
2270 				   const struct kvm_userspace_memory_region *mem,
2271 				   struct kvm_memory_slot *old,
2272 				   const struct kvm_memory_slot *new,
2273 				   enum kvm_mr_change change)
2274 {
2275 	/*
2276 	 * At this point the memslot has been committed and there is an
2277 	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
2278 	 * memory slot is write-protected.
2279 	 */
2280 	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2281 		/*
2282 		 * With initial-all-set, we don't need to write-protect any
2283 		 * pages because they are all reported as dirty.
2284 		 * Huge pages and normal pages will be write-protected gradually.
2285 		 */
2286 		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2287 			kvm_mmu_wp_memory_region(kvm, mem->slot);
2288 		}
2289 	}
2290 }
2291 
2292 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2293 				   struct kvm_memory_slot *memslot,
2294 				   const struct kvm_userspace_memory_region *mem,
2295 				   enum kvm_mr_change change)
2296 {
2297 	hva_t hva = mem->userspace_addr;
2298 	hva_t reg_end = hva + mem->memory_size;
2299 	bool writable = !(mem->flags & KVM_MEM_READONLY);
2300 	int ret = 0;
2301 
2302 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2303 			change != KVM_MR_FLAGS_ONLY)
2304 		return 0;
2305 
2306 	/*
2307 	 * Prevent userspace from creating a memory region outside of the IPA
2308 	 * space addressable by the guest.
2309 	 */
2310 	if (memslot->base_gfn + memslot->npages >=
2311 	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
2312 		return -EFAULT;
2313 
2314 	down_read(&current->mm->mmap_sem);
2315 	/*
2316 	 * A memory region could potentially cover multiple VMAs, and any holes
2317 	 * between them, so iterate over all of them to find out if we can map
2318 	 * any of them right now.
2319 	 *
2320 	 *     +--------------------------------------------+
2321 	 * +---------------+----------------+   +----------------+
2322 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2323 	 * +---------------+----------------+   +----------------+
2324 	 *     |               memory region                |
2325 	 *     +--------------------------------------------+
2326 	 */
2327 	do {
2328 		struct vm_area_struct *vma = find_vma(current->mm, hva);
2329 		hva_t vm_start, vm_end;
2330 
2331 		if (!vma || vma->vm_start >= reg_end)
2332 			break;
2333 
2334 		/*
2335 		 * Take the intersection of this VMA with the memory region
2336 		 */
2337 		vm_start = max(hva, vma->vm_start);
2338 		vm_end = min(reg_end, vma->vm_end);
2339 
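		/*
		 * VM_PFNMAP regions are raw PFN mappings (typically device
		 * memory); map them at stage 2 right away.
		 */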
2340 		if (vma->vm_flags & VM_PFNMAP) {
2341 			gpa_t gpa = mem->guest_phys_addr +
2342 				    (vm_start - mem->userspace_addr);
2343 			phys_addr_t pa;
2344 
2345 			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2346 			pa += vm_start - vma->vm_start;
2347 
2348 			/* IO region dirty page logging not allowed */
2349 			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2350 				ret = -EINVAL;
2351 				goto out;
2352 			}
2353 
2354 			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2355 						    vm_end - vm_start,
2356 						    writable);
2357 			if (ret)
2358 				break;
2359 		}
2360 		hva = vm_end;
2361 	} while (hva < reg_end);
2362 
2363 	if (change == KVM_MR_FLAGS_ONLY)
2364 		goto out;
2365 
2366 	spin_lock(&kvm->mmu_lock);
2367 	if (ret)
2368 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
2369 	else
2370 		stage2_flush_memslot(kvm, memslot);
2371 	spin_unlock(&kvm->mmu_lock);
2372 out:
2373 	up_read(&current->mm->mmap_sem);
2374 	return ret;
2375 }
2376 
2377 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2378 {
2379 }
2380 
2381 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2382 {
2383 }
2384 
2385 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2386 {
2387 	kvm_free_stage2_pgd(kvm);
2388 }
2389 
2390 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2391 				   struct kvm_memory_slot *slot)
2392 {
2393 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2394 	phys_addr_t size = slot->npages << PAGE_SHIFT;
2395 
2396 	spin_lock(&kvm->mmu_lock);
2397 	unmap_stage2_range(kvm, gpa, size);
2398 	spin_unlock(&kvm->mmu_lock);
2399 }
2400 
2401 /*
2402  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2403  *
2404  * Main problems:
2405  * - S/W ops are local to a CPU (not broadcast)
2406  * - We have line migration behind our back (speculation)
2407  * - System caches don't support S/W at all (damn!)
2408  *
2409  * In the face of the above, the best we can do is to try and convert
2410  * S/W ops to VA ops. Because the guest is not allowed to infer the
2411  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2412  * which is a rather good thing for us.
2413  *
2414  * Also, it is only used when turning caches on/off ("The expected
2415  * usage of the cache maintenance instructions that operate by set/way
2416  * is associated with the cache maintenance instructions associated
2417  * with the powerdown and powerup of caches, if this is required by
2418  * the implementation.").
2419  *
2420  * We use the following policy:
2421  *
2422  * - If we trap a S/W operation, we enable VM trapping to detect
2423  *   caches being turned on/off, and do a full clean.
2424  *
2425  * - We flush the caches both when they are turned on and when turned off.
2426  *
2427  * - Once the caches are enabled, we stop trapping VM ops.
2428  */
2429 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2430 {
2431 	unsigned long hcr = *vcpu_hcr(vcpu);
2432 
2433 	/*
2434 	 * If this is the first time we do a S/W operation
2435 	 * (i.e. HCR_TVM not set), flush the whole of guest memory and
2436 	 * enable VM trapping.
2437 	 *
2438 	 * Otherwise, rely on the VM trapping to wait for the MMU +
2439 	 * Caches to be turned off. At that point, we'll be able to
2440 	 * clean the caches again.
2441 	 */
2442 	if (!(hcr & HCR_TVM)) {
2443 		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2444 					vcpu_has_cache_enabled(vcpu));
2445 		stage2_flush_vm(vcpu->kvm);
2446 		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2447 	}
2448 }
2449 
2450 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2451 {
2452 	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2453 
2454 	/*
2455 	 * If switching the MMU+caches on, we need to invalidate the caches.
2456 	 * If switching them off, we need to clean the caches.
2457 	 * Clean + invalidate always does the trick.
2458 	 */
2459 	if (now_enabled != was_enabled)
2460 		stage2_flush_vm(vcpu->kvm);
2461 
2462 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2463 	if (now_enabled)
2464 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2465 
2466 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2467 }
2468