xref: /openbmc/linux/arch/arm64/kvm/mmu.c (revision acddaa55)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6 
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_ras.h>
18 #include <asm/kvm_asm.h>
19 #include <asm/kvm_emulate.h>
20 #include <asm/virt.h>
21 
22 #include "trace.h"
23 
24 static pgd_t *boot_hyp_pgd;
25 static pgd_t *hyp_pgd;
26 static pgd_t *merged_hyp_pgd;
27 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
28 
29 static unsigned long hyp_idmap_start;
30 static unsigned long hyp_idmap_end;
31 static phys_addr_t hyp_idmap_vector;
32 
33 static unsigned long io_map_base;
34 
35 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
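/*
 * For illustration (values depend on the configuration): with 4K pages and
 * 48-bit VAs, PTRS_PER_PGD == 512 and sizeof(pgd_t) == 8, so one PGD level
 * occupies 512 * 8 = 4096 bytes and hyp_pgd_order == get_order(4096) == 0.
 */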
36 
37 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
38 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
39 
40 static bool is_iomap(unsigned long flags)
41 {
42 	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
43 }
44 
45 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
46 {
47 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
48 }
49 
50 /**
51  * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
52  * @kvm:	pointer to kvm structure.
53  *
54  * Interface to HYP function to flush all VM TLB entries
55  */
56 void kvm_flush_remote_tlbs(struct kvm *kvm)
57 {
58 	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
59 }
60 
61 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
62 				   int level)
63 {
64 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
65 }
66 
67 /*
68  * D-Cache management functions. They take the page table entries by
69  * value, as they are flushing the cache using the kernel mapping (or
70  * kmap on 32bit).
71  */
72 static void kvm_flush_dcache_pte(pte_t pte)
73 {
74 	__kvm_flush_dcache_pte(pte);
75 }
76 
77 static void kvm_flush_dcache_pmd(pmd_t pmd)
78 {
79 	__kvm_flush_dcache_pmd(pmd);
80 }
81 
82 static void kvm_flush_dcache_pud(pud_t pud)
83 {
84 	__kvm_flush_dcache_pud(pud);
85 }
86 
87 static bool kvm_is_device_pfn(unsigned long pfn)
88 {
89 	return !pfn_valid(pfn);
90 }
91 
92 /**
93  * stage2_dissolve_pmd() - clear and flush huge PMD entry
94  * @mmu:	pointer to mmu structure to operate on
95  * @addr:	IPA
96  * @pmd:	pmd pointer for IPA
97  *
98  * Clears a PMD entry and flushes the 1st and 2nd stage TLBs for addr.
99  */
100 static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
101 {
102 	if (!pmd_thp_or_huge(*pmd))
103 		return;
104 
105 	pmd_clear(pmd);
106 	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
107 	put_page(virt_to_page(pmd));
108 }
109 
110 /**
111  * stage2_dissolve_pud() - clear and flush huge PUD entry
112  * @mmu:	pointer to mmu structure to operate on
113  * @addr:	IPA
114  * @pudp:	pud pointer for IPA
115  *
116  * Clears a PUD entry and flushes the 1st and 2nd stage TLBs for addr.
117  */
118 static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
119 {
120 	struct kvm *kvm = mmu->kvm;
121 
122 	if (!stage2_pud_huge(kvm, *pudp))
123 		return;
124 
125 	stage2_pud_clear(kvm, pudp);
126 	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
127 	put_page(virt_to_page(pudp));
128 }
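/*
 * The dissolve helpers above are used by stage2_set_pte() when dirty page
 * logging is active: an existing huge PMD/PUD block is torn down so that
 * subsequent faults install page-granularity PTEs, which can then be
 * write-protected and tracked individually by the dirty log code.
 */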
129 
130 static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
131 {
132 	struct kvm *kvm = mmu->kvm;
133 	p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
134 	stage2_pgd_clear(kvm, pgd);
135 	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
136 	stage2_p4d_free(kvm, p4d_table);
137 	put_page(virt_to_page(pgd));
138 }
139 
140 static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
141 {
142 	struct kvm *kvm = mmu->kvm;
143 	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
144 	stage2_p4d_clear(kvm, p4d);
145 	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
146 	stage2_pud_free(kvm, pud_table);
147 	put_page(virt_to_page(p4d));
148 }
149 
150 static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
151 {
152 	struct kvm *kvm = mmu->kvm;
153 	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
154 
155 	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
156 	stage2_pud_clear(kvm, pud);
157 	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
158 	stage2_pmd_free(kvm, pmd_table);
159 	put_page(virt_to_page(pud));
160 }
161 
162 static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
163 {
164 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
165 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
166 	pmd_clear(pmd);
167 	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
168 	free_page((unsigned long)pte_table);
169 	put_page(virt_to_page(pmd));
170 }
171 
172 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
173 {
174 	WRITE_ONCE(*ptep, new_pte);
175 	dsb(ishst);
176 }
177 
178 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
179 {
180 	WRITE_ONCE(*pmdp, new_pmd);
181 	dsb(ishst);
182 }
183 
184 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
185 {
186 	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
187 }
188 
189 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
190 {
191 	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
192 	dsb(ishst);
193 }
194 
195 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
196 {
197 	WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
198 	dsb(ishst);
199 }
200 
201 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
202 {
203 #ifndef __PAGETABLE_P4D_FOLDED
204 	WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
205 	dsb(ishst);
206 #endif
207 }
208 
209 /*
210  * Unmapping vs dcache management:
211  *
212  * If a guest maps certain memory pages as uncached, all writes will
213  * bypass the data cache and go directly to RAM.  However, the CPUs
214  * can still speculate reads (not writes) and fill cache lines with
215  * data.
216  *
217  * Those cache lines will be *clean* cache lines though, so a
218  * clean+invalidate operation is equivalent to an invalidate
219  * operation, because no cache lines are marked dirty.
220  *
221  * Those clean cache lines could be filled prior to an uncached write
222  * by the guest, and the cache coherent IO subsystem would therefore
223  * end up writing old data to disk.
224  *
225  * This is why right after unmapping a page/section and invalidating
226  * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
227  * the IO subsystem will never hit in the cache.
228  *
229  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
230  * we then fully enforce cacheability of RAM, no matter what the guest
231  * does.
232  */
233 static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
234 		       phys_addr_t addr, phys_addr_t end)
235 {
236 	phys_addr_t start_addr = addr;
237 	pte_t *pte, *start_pte;
238 
239 	start_pte = pte = pte_offset_kernel(pmd, addr);
240 	do {
241 		if (!pte_none(*pte)) {
242 			pte_t old_pte = *pte;
243 
244 			kvm_set_pte(pte, __pte(0));
245 			kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
246 
247 			/* No need to invalidate the cache for device mappings */
248 			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
249 				kvm_flush_dcache_pte(old_pte);
250 
251 			put_page(virt_to_page(pte));
252 		}
253 	} while (pte++, addr += PAGE_SIZE, addr != end);
254 
255 	if (stage2_pte_table_empty(mmu->kvm, start_pte))
256 		clear_stage2_pmd_entry(mmu, pmd, start_addr);
257 }
258 
259 static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
260 		       phys_addr_t addr, phys_addr_t end)
261 {
262 	struct kvm *kvm = mmu->kvm;
263 	phys_addr_t next, start_addr = addr;
264 	pmd_t *pmd, *start_pmd;
265 
266 	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
267 	do {
268 		next = stage2_pmd_addr_end(kvm, addr, end);
269 		if (!pmd_none(*pmd)) {
270 			if (pmd_thp_or_huge(*pmd)) {
271 				pmd_t old_pmd = *pmd;
272 
273 				pmd_clear(pmd);
274 				kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
275 
276 				kvm_flush_dcache_pmd(old_pmd);
277 
278 				put_page(virt_to_page(pmd));
279 			} else {
280 				unmap_stage2_ptes(mmu, pmd, addr, next);
281 			}
282 		}
283 	} while (pmd++, addr = next, addr != end);
284 
285 	if (stage2_pmd_table_empty(kvm, start_pmd))
286 		clear_stage2_pud_entry(mmu, pud, start_addr);
287 }
288 
289 static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
290 		       phys_addr_t addr, phys_addr_t end)
291 {
292 	struct kvm *kvm = mmu->kvm;
293 	phys_addr_t next, start_addr = addr;
294 	pud_t *pud, *start_pud;
295 
296 	start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
297 	do {
298 		next = stage2_pud_addr_end(kvm, addr, end);
299 		if (!stage2_pud_none(kvm, *pud)) {
300 			if (stage2_pud_huge(kvm, *pud)) {
301 				pud_t old_pud = *pud;
302 
303 				stage2_pud_clear(kvm, pud);
304 				kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
305 				kvm_flush_dcache_pud(old_pud);
306 				put_page(virt_to_page(pud));
307 			} else {
308 				unmap_stage2_pmds(mmu, pud, addr, next);
309 			}
310 		}
311 	} while (pud++, addr = next, addr != end);
312 
313 	if (stage2_pud_table_empty(kvm, start_pud))
314 		clear_stage2_p4d_entry(mmu, p4d, start_addr);
315 }
316 
317 static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
318 		       phys_addr_t addr, phys_addr_t end)
319 {
320 	struct kvm *kvm = mmu->kvm;
321 	phys_addr_t next, start_addr = addr;
322 	p4d_t *p4d, *start_p4d;
323 
324 	start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
325 	do {
326 		next = stage2_p4d_addr_end(kvm, addr, end);
327 		if (!stage2_p4d_none(kvm, *p4d))
328 			unmap_stage2_puds(mmu, p4d, addr, next);
329 	} while (p4d++, addr = next, addr != end);
330 
331 	if (stage2_p4d_table_empty(kvm, start_p4d))
332 		clear_stage2_pgd_entry(mmu, pgd, start_addr);
333 }
334 
335 /**
336  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
337  * @mmu:   The KVM stage-2 MMU pointer
338  * @start: The intermediate physical base address of the range to unmap
339  * @size:  The size of the area to unmap
340  *
341  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
342  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
343  * destroying the VM), otherwise another faulting VCPU may come in and mess
344  * with things behind our backs.
345  */
346 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
347 				 bool may_block)
348 {
349 	struct kvm *kvm = mmu->kvm;
350 	pgd_t *pgd;
351 	phys_addr_t addr = start, end = start + size;
352 	phys_addr_t next;
353 
354 	assert_spin_locked(&kvm->mmu_lock);
355 	WARN_ON(size & ~PAGE_MASK);
356 
357 	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
358 	do {
359 		/*
360 		 * Make sure the page table is still active, as another thread
361 		 * could have possibly freed the page table, while we released
362 		 * the lock.
363 		 */
364 		if (!READ_ONCE(mmu->pgd))
365 			break;
366 		next = stage2_pgd_addr_end(kvm, addr, end);
367 		if (!stage2_pgd_none(kvm, *pgd))
368 			unmap_stage2_p4ds(mmu, pgd, addr, next);
369 		/*
370 		 * If the range is too large, release the kvm->mmu_lock
371 		 * to prevent starvation and lockup detector warnings.
372 		 */
373 		if (may_block && next != end)
374 			cond_resched_lock(&kvm->mmu_lock);
375 	} while (pgd++, addr = next, addr != end);
376 }
377 
378 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
379 {
380 	__unmap_stage2_range(mmu, start, size, true);
381 }
382 
383 static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
384 			      phys_addr_t addr, phys_addr_t end)
385 {
386 	pte_t *pte;
387 
388 	pte = pte_offset_kernel(pmd, addr);
389 	do {
390 		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
391 			kvm_flush_dcache_pte(*pte);
392 	} while (pte++, addr += PAGE_SIZE, addr != end);
393 }
394 
395 static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
396 			      phys_addr_t addr, phys_addr_t end)
397 {
398 	struct kvm *kvm = mmu->kvm;
399 	pmd_t *pmd;
400 	phys_addr_t next;
401 
402 	pmd = stage2_pmd_offset(kvm, pud, addr);
403 	do {
404 		next = stage2_pmd_addr_end(kvm, addr, end);
405 		if (!pmd_none(*pmd)) {
406 			if (pmd_thp_or_huge(*pmd))
407 				kvm_flush_dcache_pmd(*pmd);
408 			else
409 				stage2_flush_ptes(mmu, pmd, addr, next);
410 		}
411 	} while (pmd++, addr = next, addr != end);
412 }
413 
414 static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
415 			      phys_addr_t addr, phys_addr_t end)
416 {
417 	struct kvm *kvm = mmu->kvm;
418 	pud_t *pud;
419 	phys_addr_t next;
420 
421 	pud = stage2_pud_offset(kvm, p4d, addr);
422 	do {
423 		next = stage2_pud_addr_end(kvm, addr, end);
424 		if (!stage2_pud_none(kvm, *pud)) {
425 			if (stage2_pud_huge(kvm, *pud))
426 				kvm_flush_dcache_pud(*pud);
427 			else
428 				stage2_flush_pmds(mmu, pud, addr, next);
429 		}
430 	} while (pud++, addr = next, addr != end);
431 }
432 
433 static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
434 			      phys_addr_t addr, phys_addr_t end)
435 {
436 	struct kvm *kvm = mmu->kvm;
437 	p4d_t *p4d;
438 	phys_addr_t next;
439 
440 	p4d = stage2_p4d_offset(kvm, pgd, addr);
441 	do {
442 		next = stage2_p4d_addr_end(kvm, addr, end);
443 		if (!stage2_p4d_none(kvm, *p4d))
444 			stage2_flush_puds(mmu, p4d, addr, next);
445 	} while (p4d++, addr = next, addr != end);
446 }
447 
448 static void stage2_flush_memslot(struct kvm *kvm,
449 				 struct kvm_memory_slot *memslot)
450 {
451 	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
452 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
453 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
454 	phys_addr_t next;
455 	pgd_t *pgd;
456 
457 	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
458 	do {
459 		next = stage2_pgd_addr_end(kvm, addr, end);
460 		if (!stage2_pgd_none(kvm, *pgd))
461 			stage2_flush_p4ds(mmu, pgd, addr, next);
462 
463 		if (next != end)
464 			cond_resched_lock(&kvm->mmu_lock);
465 	} while (pgd++, addr = next, addr != end);
466 }
467 
468 /**
469  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
470  * @kvm: The struct kvm pointer
471  *
472  * Go through the stage 2 page tables and invalidate any cache lines
473  * backing memory already mapped to the VM.
474  */
475 static void stage2_flush_vm(struct kvm *kvm)
476 {
477 	struct kvm_memslots *slots;
478 	struct kvm_memory_slot *memslot;
479 	int idx;
480 
481 	idx = srcu_read_lock(&kvm->srcu);
482 	spin_lock(&kvm->mmu_lock);
483 
484 	slots = kvm_memslots(kvm);
485 	kvm_for_each_memslot(memslot, slots)
486 		stage2_flush_memslot(kvm, memslot);
487 
488 	spin_unlock(&kvm->mmu_lock);
489 	srcu_read_unlock(&kvm->srcu, idx);
490 }
491 
492 static void clear_hyp_pgd_entry(pgd_t *pgd)
493 {
494 	p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
495 	pgd_clear(pgd);
496 	p4d_free(NULL, p4d_table);
497 	put_page(virt_to_page(pgd));
498 }
499 
500 static void clear_hyp_p4d_entry(p4d_t *p4d)
501 {
502 	pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
503 	VM_BUG_ON(p4d_huge(*p4d));
504 	p4d_clear(p4d);
505 	pud_free(NULL, pud_table);
506 	put_page(virt_to_page(p4d));
507 }
508 
509 static void clear_hyp_pud_entry(pud_t *pud)
510 {
511 	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
512 	VM_BUG_ON(pud_huge(*pud));
513 	pud_clear(pud);
514 	pmd_free(NULL, pmd_table);
515 	put_page(virt_to_page(pud));
516 }
517 
518 static void clear_hyp_pmd_entry(pmd_t *pmd)
519 {
520 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
521 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
522 	pmd_clear(pmd);
523 	pte_free_kernel(NULL, pte_table);
524 	put_page(virt_to_page(pmd));
525 }
526 
527 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
528 {
529 	pte_t *pte, *start_pte;
530 
531 	start_pte = pte = pte_offset_kernel(pmd, addr);
532 	do {
533 		if (!pte_none(*pte)) {
534 			kvm_set_pte(pte, __pte(0));
535 			put_page(virt_to_page(pte));
536 		}
537 	} while (pte++, addr += PAGE_SIZE, addr != end);
538 
539 	if (hyp_pte_table_empty(start_pte))
540 		clear_hyp_pmd_entry(pmd);
541 }
542 
543 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
544 {
545 	phys_addr_t next;
546 	pmd_t *pmd, *start_pmd;
547 
548 	start_pmd = pmd = pmd_offset(pud, addr);
549 	do {
550 		next = pmd_addr_end(addr, end);
551 		/* Hyp doesn't use huge pmds */
552 		if (!pmd_none(*pmd))
553 			unmap_hyp_ptes(pmd, addr, next);
554 	} while (pmd++, addr = next, addr != end);
555 
556 	if (hyp_pmd_table_empty(start_pmd))
557 		clear_hyp_pud_entry(pud);
558 }
559 
560 static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
561 {
562 	phys_addr_t next;
563 	pud_t *pud, *start_pud;
564 
565 	start_pud = pud = pud_offset(p4d, addr);
566 	do {
567 		next = pud_addr_end(addr, end);
568 		/* Hyp doesn't use huge puds */
569 		if (!pud_none(*pud))
570 			unmap_hyp_pmds(pud, addr, next);
571 	} while (pud++, addr = next, addr != end);
572 
573 	if (hyp_pud_table_empty(start_pud))
574 		clear_hyp_p4d_entry(p4d);
575 }
576 
577 static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
578 {
579 	phys_addr_t next;
580 	p4d_t *p4d, *start_p4d;
581 
582 	start_p4d = p4d = p4d_offset(pgd, addr);
583 	do {
584 		next = p4d_addr_end(addr, end);
585 		/* Hyp doesn't use huge p4ds */
586 		if (!p4d_none(*p4d))
587 			unmap_hyp_puds(p4d, addr, next);
588 	} while (p4d++, addr = next, addr != end);
589 
590 	if (hyp_p4d_table_empty(start_p4d))
591 		clear_hyp_pgd_entry(pgd);
592 }
593 
594 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
595 {
596 	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
597 }
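/*
 * Worked example (illustrative; the actual shift depends on page size and
 * VA configuration): with 4K pages and 48-bit VAs, PGDIR_SHIFT == 39, so an
 * address of 0x80_0000_0000 (512GB) gives (addr >> 39) & (ptrs_per_pgd - 1)
 * == 1, i.e. the second PGD slot.
 */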
598 
599 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
600 			      phys_addr_t start, u64 size)
601 {
602 	pgd_t *pgd;
603 	phys_addr_t addr = start, end = start + size;
604 	phys_addr_t next;
605 
606 	/*
607 	 * We don't unmap anything from HYP, except at the hyp tear down.
608 	 * Hence, we don't have to invalidate the TLBs here.
609 	 */
610 	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
611 	do {
612 		next = pgd_addr_end(addr, end);
613 		if (!pgd_none(*pgd))
614 			unmap_hyp_p4ds(pgd, addr, next);
615 	} while (pgd++, addr = next, addr != end);
616 }
617 
618 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
619 {
620 	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
621 }
622 
623 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
624 {
625 	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
626 }
627 
628 /**
629  * free_hyp_pgds - free Hyp-mode page tables
630  *
631  * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
632  * therefore contains either mappings in the kernel memory area (above
633  * PAGE_OFFSET), or device mappings in the idmap range.
634  *
635  * boot_hyp_pgd should only map the idmap range, and is only used in
636  * the extended idmap case.
637  */
638 void free_hyp_pgds(void)
639 {
640 	pgd_t *id_pgd;
641 
642 	mutex_lock(&kvm_hyp_pgd_mutex);
643 
644 	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
645 
646 	if (id_pgd) {
647 		/* In case we never called hyp_mmu_init() */
648 		if (!io_map_base)
649 			io_map_base = hyp_idmap_start;
650 		unmap_hyp_idmap_range(id_pgd, io_map_base,
651 				      hyp_idmap_start + PAGE_SIZE - io_map_base);
652 	}
653 
654 	if (boot_hyp_pgd) {
655 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
656 		boot_hyp_pgd = NULL;
657 	}
658 
659 	if (hyp_pgd) {
660 		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
661 				(uintptr_t)high_memory - PAGE_OFFSET);
662 
663 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
664 		hyp_pgd = NULL;
665 	}
666 	if (merged_hyp_pgd) {
667 		clear_page(merged_hyp_pgd);
668 		free_page((unsigned long)merged_hyp_pgd);
669 		merged_hyp_pgd = NULL;
670 	}
671 
672 	mutex_unlock(&kvm_hyp_pgd_mutex);
673 }
674 
675 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
676 				    unsigned long end, unsigned long pfn,
677 				    pgprot_t prot)
678 {
679 	pte_t *pte;
680 	unsigned long addr;
681 
682 	addr = start;
683 	do {
684 		pte = pte_offset_kernel(pmd, addr);
685 		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
686 		get_page(virt_to_page(pte));
687 		pfn++;
688 	} while (addr += PAGE_SIZE, addr != end);
689 }
690 
691 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
692 				   unsigned long end, unsigned long pfn,
693 				   pgprot_t prot)
694 {
695 	pmd_t *pmd;
696 	pte_t *pte;
697 	unsigned long addr, next;
698 
699 	addr = start;
700 	do {
701 		pmd = pmd_offset(pud, addr);
702 
703 		BUG_ON(pmd_sect(*pmd));
704 
705 		if (pmd_none(*pmd)) {
706 			pte = pte_alloc_one_kernel(NULL);
707 			if (!pte) {
708 				kvm_err("Cannot allocate Hyp pte\n");
709 				return -ENOMEM;
710 			}
711 			kvm_pmd_populate(pmd, pte);
712 			get_page(virt_to_page(pmd));
713 		}
714 
715 		next = pmd_addr_end(addr, end);
716 
717 		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
718 		pfn += (next - addr) >> PAGE_SHIFT;
719 	} while (addr = next, addr != end);
720 
721 	return 0;
722 }
723 
724 static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
725 				   unsigned long end, unsigned long pfn,
726 				   pgprot_t prot)
727 {
728 	pud_t *pud;
729 	pmd_t *pmd;
730 	unsigned long addr, next;
731 	int ret;
732 
733 	addr = start;
734 	do {
735 		pud = pud_offset(p4d, addr);
736 
737 		if (pud_none_or_clear_bad(pud)) {
738 			pmd = pmd_alloc_one(NULL, addr);
739 			if (!pmd) {
740 				kvm_err("Cannot allocate Hyp pmd\n");
741 				return -ENOMEM;
742 			}
743 			kvm_pud_populate(pud, pmd);
744 			get_page(virt_to_page(pud));
745 		}
746 
747 		next = pud_addr_end(addr, end);
748 		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
749 		if (ret)
750 			return ret;
751 		pfn += (next - addr) >> PAGE_SHIFT;
752 	} while (addr = next, addr != end);
753 
754 	return 0;
755 }
756 
757 static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
758 				   unsigned long end, unsigned long pfn,
759 				   pgprot_t prot)
760 {
761 	p4d_t *p4d;
762 	pud_t *pud;
763 	unsigned long addr, next;
764 	int ret;
765 
766 	addr = start;
767 	do {
768 		p4d = p4d_offset(pgd, addr);
769 
770 		if (p4d_none(*p4d)) {
771 			pud = pud_alloc_one(NULL, addr);
772 			if (!pud) {
773 				kvm_err("Cannot allocate Hyp pud\n");
774 				return -ENOMEM;
775 			}
776 			kvm_p4d_populate(p4d, pud);
777 			get_page(virt_to_page(p4d));
778 		}
779 
780 		next = p4d_addr_end(addr, end);
781 		ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
782 		if (ret)
783 			return ret;
784 		pfn += (next - addr) >> PAGE_SHIFT;
785 	} while (addr = next, addr != end);
786 
787 	return 0;
788 }
789 
790 static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
791 				 unsigned long start, unsigned long end,
792 				 unsigned long pfn, pgprot_t prot)
793 {
794 	pgd_t *pgd;
795 	p4d_t *p4d;
796 	unsigned long addr, next;
797 	int err = 0;
798 
799 	mutex_lock(&kvm_hyp_pgd_mutex);
800 	addr = start & PAGE_MASK;
801 	end = PAGE_ALIGN(end);
802 	do {
803 		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
804 
805 		if (pgd_none(*pgd)) {
806 			p4d = p4d_alloc_one(NULL, addr);
807 			if (!p4d) {
808 				kvm_err("Cannot allocate Hyp p4d\n");
809 				err = -ENOMEM;
810 				goto out;
811 			}
812 			kvm_pgd_populate(pgd, p4d);
813 			get_page(virt_to_page(pgd));
814 		}
815 
816 		next = pgd_addr_end(addr, end);
817 		err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
818 		if (err)
819 			goto out;
820 		pfn += (next - addr) >> PAGE_SHIFT;
821 	} while (addr = next, addr != end);
822 out:
823 	mutex_unlock(&kvm_hyp_pgd_mutex);
824 	return err;
825 }
826 
827 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
828 {
829 	if (!is_vmalloc_addr(kaddr)) {
830 		BUG_ON(!virt_addr_valid(kaddr));
831 		return __pa(kaddr);
832 	} else {
833 		return page_to_phys(vmalloc_to_page(kaddr)) +
834 		       offset_in_page(kaddr);
835 	}
836 }
837 
838 /**
839  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
840  * @from:	The virtual kernel start address of the range
841  * @to:		The virtual kernel end address of the range (exclusive)
842  * @prot:	The protection to be applied to this range
843  *
844  * The same virtual address as the kernel virtual address is also used
845  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
846  * physical pages.
847  */
848 int create_hyp_mappings(void *from, void *to, pgprot_t prot)
849 {
850 	phys_addr_t phys_addr;
851 	unsigned long virt_addr;
852 	unsigned long start = kern_hyp_va((unsigned long)from);
853 	unsigned long end = kern_hyp_va((unsigned long)to);
854 
855 	if (is_kernel_in_hyp_mode())
856 		return 0;
857 
858 	start = start & PAGE_MASK;
859 	end = PAGE_ALIGN(end);
860 
861 	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
862 		int err;
863 
864 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
865 		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
866 					    virt_addr, virt_addr + PAGE_SIZE,
867 					    __phys_to_pfn(phys_addr),
868 					    prot);
869 		if (err)
870 			return err;
871 	}
872 
873 	return 0;
874 }
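/*
 * Illustrative usage (sketch; stack_page stands for a hypothetical per-cpu
 * allocation, it is not defined in this file):
 *
 *	err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, PAGE_HYP);
 *
 * The Hyp VA is derived from the kernel VA via kern_hyp_va(), which is why
 * no mapped address needs to be returned to the caller.
 */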
875 
876 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
877 					unsigned long *haddr, pgprot_t prot)
878 {
879 	pgd_t *pgd = hyp_pgd;
880 	unsigned long base;
881 	int ret = 0;
882 
883 	mutex_lock(&kvm_hyp_pgd_mutex);
884 
885 	/*
886 	 * This assumes that we have enough space below the idmap
887 	 * page to allocate our VAs. If not, the check below will
888 	 * kick in. A potential alternative would be to detect that
889 	 * overflow and switch to an allocation above the idmap.
890 	 *
891 	 * The allocated size is always a multiple of PAGE_SIZE.
892 	 */
893 	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
894 	base = io_map_base - size;
895 
896 	/*
897 	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
898 	 * allocating the new area, as it would indicate we've
899 	 * overflowed the idmap/IO address range.
900 	 */
901 	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
902 		ret = -ENOMEM;
903 	else
904 		io_map_base = base;
905 
906 	mutex_unlock(&kvm_hyp_pgd_mutex);
907 
908 	if (ret)
909 		goto out;
910 
911 	if (__kvm_cpu_uses_extended_idmap())
912 		pgd = boot_hyp_pgd;
913 
914 	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
915 				    base, base + size,
916 				    __phys_to_pfn(phys_addr), prot);
917 	if (ret)
918 		goto out;
919 
920 	*haddr = base + offset_in_page(phys_addr);
921 
922 out:
923 	return ret;
924 }
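/*
 * Worked example of the overflow check above (made-up numbers): with
 * VA_BITS == 48, suppose io_map_base == BIT(47) + 0x1000. An allocation of
 * 0x2000 bytes would give base == BIT(47) - 0x1000, so (base ^ io_map_base)
 * has BIT(47) set and the request is refused with -ENOMEM instead of
 * silently wrapping below the idmap/IO half of the address space.
 */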
925 
926 /**
927  * create_hyp_io_mappings - Map IO into both kernel and HYP
928  * @phys_addr:	The physical start address which gets mapped
929  * @size:	Size of the region being mapped
930  * @kaddr:	Kernel VA for this mapping
931  * @haddr:	HYP VA for this mapping
932  */
933 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
934 			   void __iomem **kaddr,
935 			   void __iomem **haddr)
936 {
937 	unsigned long addr;
938 	int ret;
939 
940 	*kaddr = ioremap(phys_addr, size);
941 	if (!*kaddr)
942 		return -ENOMEM;
943 
944 	if (is_kernel_in_hyp_mode()) {
945 		*haddr = *kaddr;
946 		return 0;
947 	}
948 
949 	ret = __create_hyp_private_mapping(phys_addr, size,
950 					   &addr, PAGE_HYP_DEVICE);
951 	if (ret) {
952 		iounmap(*kaddr);
953 		*kaddr = NULL;
954 		*haddr = NULL;
955 		return ret;
956 	}
957 
958 	*haddr = (void __iomem *)addr;
959 	return 0;
960 }
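/*
 * Illustrative usage (sketch; dev_phys/dev_size are hypothetical, real
 * callers include the vGIC probe code):
 *
 *	void __iomem *kaddr, *haddr;
 *	int err = create_hyp_io_mappings(dev_phys, dev_size, &kaddr, &haddr);
 *
 * On VHE systems (is_kernel_in_hyp_mode()), haddr simply aliases kaddr since
 * the kernel already runs at EL2.
 */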
961 
962 /**
963  * create_hyp_exec_mappings - Map an executable range into HYP
964  * @phys_addr:	The physical start address which gets mapped
965  * @size:	Size of the region being mapped
966  * @haddr:	HYP VA for this mapping
967  */
968 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
969 			     void **haddr)
970 {
971 	unsigned long addr;
972 	int ret;
973 
974 	BUG_ON(is_kernel_in_hyp_mode());
975 
976 	ret = __create_hyp_private_mapping(phys_addr, size,
977 					   &addr, PAGE_HYP_EXEC);
978 	if (ret) {
979 		*haddr = NULL;
980 		return ret;
981 	}
982 
983 	*haddr = (void *)addr;
984 	return 0;
985 }
986 
987 /**
988  * kvm_init_stage2_mmu - Initialise an S2 MMU structure
989  * @kvm:	The pointer to the KVM structure
990  * @mmu:	The pointer to the s2 MMU structure
991  *
992  * Allocates only the stage-2 HW PGD level table(s) of size defined by
993  * stage2_pgd_size(mmu->kvm).
994  *
995  * Note we don't need locking here as this is only called when the VM is
996  * created, which can only be done once.
997  */
998 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
999 {
1000 	phys_addr_t pgd_phys;
1001 	pgd_t *pgd;
1002 	int cpu;
1003 
1004 	if (mmu->pgd != NULL) {
1005 		kvm_err("kvm_arch already initialized?\n");
1006 		return -EINVAL;
1007 	}
1008 
1009 	/* Allocate the HW PGD, making sure that each page gets its own refcount */
1010 	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
1011 	if (!pgd)
1012 		return -ENOMEM;
1013 
1014 	pgd_phys = virt_to_phys(pgd);
1015 	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) {
		/* Don't leak the PGD pages allocated above */
		free_pages_exact(pgd, stage2_pgd_size(kvm));
1016 		return -EINVAL;
	}
1017 
1018 	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
1019 	if (!mmu->last_vcpu_ran) {
1020 		free_pages_exact(pgd, stage2_pgd_size(kvm));
1021 		return -ENOMEM;
1022 	}
1023 
1024 	for_each_possible_cpu(cpu)
1025 		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
1026 
1027 	mmu->kvm = kvm;
1028 	mmu->pgd = pgd;
1029 	mmu->pgd_phys = pgd_phys;
1030 	mmu->vmid.vmid_gen = 0;
1031 
1032 	return 0;
1033 }
1034 
1035 static void stage2_unmap_memslot(struct kvm *kvm,
1036 				 struct kvm_memory_slot *memslot)
1037 {
1038 	hva_t hva = memslot->userspace_addr;
1039 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
1040 	phys_addr_t size = PAGE_SIZE * memslot->npages;
1041 	hva_t reg_end = hva + size;
1042 
1043 	/*
1044 	 * A memory region could potentially cover multiple VMAs, and any holes
1045 	 * between them, so iterate over all of them to find out if we should
1046 	 * unmap any of them.
1047 	 *
1048 	 *     +--------------------------------------------+
1049 	 * +---------------+----------------+   +----------------+
1050 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1051 	 * +---------------+----------------+   +----------------+
1052 	 *     |               memory region                |
1053 	 *     +--------------------------------------------+
1054 	 */
1055 	do {
1056 		struct vm_area_struct *vma = find_vma(current->mm, hva);
1057 		hva_t vm_start, vm_end;
1058 
1059 		if (!vma || vma->vm_start >= reg_end)
1060 			break;
1061 
1062 		/*
1063 		 * Take the intersection of this VMA with the memory region
1064 		 */
1065 		vm_start = max(hva, vma->vm_start);
1066 		vm_end = min(reg_end, vma->vm_end);
1067 
1068 		if (!(vma->vm_flags & VM_PFNMAP)) {
1069 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
1070 			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
1071 		}
1072 		hva = vm_end;
1073 	} while (hva < reg_end);
1074 }
1075 
1076 /**
1077  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
1078  * @kvm: The struct kvm pointer
1079  *
1080  * Go through the memregions and unmap any regular RAM
1081  * backing memory already mapped to the VM.
1082  */
1083 void stage2_unmap_vm(struct kvm *kvm)
1084 {
1085 	struct kvm_memslots *slots;
1086 	struct kvm_memory_slot *memslot;
1087 	int idx;
1088 
1089 	idx = srcu_read_lock(&kvm->srcu);
1090 	mmap_read_lock(current->mm);
1091 	spin_lock(&kvm->mmu_lock);
1092 
1093 	slots = kvm_memslots(kvm);
1094 	kvm_for_each_memslot(memslot, slots)
1095 		stage2_unmap_memslot(kvm, memslot);
1096 
1097 	spin_unlock(&kvm->mmu_lock);
1098 	mmap_read_unlock(current->mm);
1099 	srcu_read_unlock(&kvm->srcu, idx);
1100 }
1101 
1102 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
1103 {
1104 	struct kvm *kvm = mmu->kvm;
1105 	void *pgd = NULL;
1106 
1107 	spin_lock(&kvm->mmu_lock);
1108 	if (mmu->pgd) {
1109 		unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
1110 		pgd = READ_ONCE(mmu->pgd);
1111 		mmu->pgd = NULL;
1112 	}
1113 	spin_unlock(&kvm->mmu_lock);
1114 
1115 	/* Free the HW pgd, one page at a time */
1116 	if (pgd) {
1117 		free_pages_exact(pgd, stage2_pgd_size(kvm));
1118 		free_percpu(mmu->last_vcpu_ran);
1119 	}
1120 }
1121 
1122 static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
1123 			     phys_addr_t addr)
1124 {
1125 	struct kvm *kvm = mmu->kvm;
1126 	pgd_t *pgd;
1127 	p4d_t *p4d;
1128 
1129 	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
1130 	if (stage2_pgd_none(kvm, *pgd)) {
1131 		if (!cache)
1132 			return NULL;
1133 		p4d = kvm_mmu_memory_cache_alloc(cache);
1134 		stage2_pgd_populate(kvm, pgd, p4d);
1135 		get_page(virt_to_page(pgd));
1136 	}
1137 
1138 	return stage2_p4d_offset(kvm, pgd, addr);
1139 }
1140 
1141 static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
1142 			     phys_addr_t addr)
1143 {
1144 	struct kvm *kvm = mmu->kvm;
1145 	p4d_t *p4d;
1146 	pud_t *pud;
1147 
1148 	p4d = stage2_get_p4d(mmu, cache, addr);
1149 	if (stage2_p4d_none(kvm, *p4d)) {
1150 		if (!cache)
1151 			return NULL;
1152 		pud = kvm_mmu_memory_cache_alloc(cache);
1153 		stage2_p4d_populate(kvm, p4d, pud);
1154 		get_page(virt_to_page(p4d));
1155 	}
1156 
1157 	return stage2_pud_offset(kvm, p4d, addr);
1158 }
1159 
1160 static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
1161 			     phys_addr_t addr)
1162 {
1163 	struct kvm *kvm = mmu->kvm;
1164 	pud_t *pud;
1165 	pmd_t *pmd;
1166 
1167 	pud = stage2_get_pud(mmu, cache, addr);
1168 	if (!pud || stage2_pud_huge(kvm, *pud))
1169 		return NULL;
1170 
1171 	if (stage2_pud_none(kvm, *pud)) {
1172 		if (!cache)
1173 			return NULL;
1174 		pmd = kvm_mmu_memory_cache_alloc(cache);
1175 		stage2_pud_populate(kvm, pud, pmd);
1176 		get_page(virt_to_page(pud));
1177 	}
1178 
1179 	return stage2_pmd_offset(kvm, pud, addr);
1180 }
1181 
1182 static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
1183 			       struct kvm_mmu_memory_cache *cache,
1184 			       phys_addr_t addr, const pmd_t *new_pmd)
1185 {
1186 	pmd_t *pmd, old_pmd;
1187 
1188 retry:
1189 	pmd = stage2_get_pmd(mmu, cache, addr);
1190 	VM_BUG_ON(!pmd);
1191 
1192 	old_pmd = *pmd;
1193 	/*
1194 	 * Multiple vcpus faulting on the same PMD entry can
1195 	 * lead to them sequentially updating the PMD with the
1196 	 * same value. Following the break-before-make
1197 	 * (pmd_clear() followed by tlb_flush()) process can
1198 	 * hinder forward progress due to refaults generated
1199 	 * on missing translations.
1200 	 *
1201 	 * Skip updating the page table if the entry is
1202 	 * unchanged.
1203 	 */
1204 	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
1205 		return 0;
1206 
1207 	if (pmd_present(old_pmd)) {
1208 		/*
1209 		 * If we already have PTE level mapping for this block,
1210 		 * we must unmap it to avoid inconsistent TLB state and
1211 		 * leaking the table page. We could end up in this situation
1212 		 * if the memory slot was marked for dirty logging and was
1213 		 * reverted, leaving PTE level mappings for the pages accessed
1214 		 * during the period. So, unmap the PTE level mapping for this
1215 		 * block and retry, as we could have released the upper level
1216 		 * table in the process.
1217 		 *
1218 		 * Normal THP split/merge follows mmu_notifier callbacks and
1219 		 * gets handled accordingly.
1220 		 */
1221 		if (!pmd_thp_or_huge(old_pmd)) {
1222 			unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
1223 			goto retry;
1224 		}
1225 		/*
1226 		 * Mapping in huge pages should only happen through a
1227 		 * fault.  If a page is merged into a transparent huge
1228 		 * page, the individual subpages of that huge page
1229 		 * should be unmapped through MMU notifiers before we
1230 		 * get here.
1231 		 *
1232 		 * Merging of CompoundPages is not supported; they
1233 		 * should be split first, then unmapped, merged,
1234 		 * and mapped back in on-demand.
1235 		 */
1236 		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
1237 		pmd_clear(pmd);
1238 		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
1239 	} else {
1240 		get_page(virt_to_page(pmd));
1241 	}
1242 
1243 	kvm_set_pmd(pmd, *new_pmd);
1244 	return 0;
1245 }
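/*
 * The pmd_clear() + kvm_tlb_flush_vmid_ipa() sequence above follows the
 * architectural break-before-make requirement: an existing block mapping
 * must be invalidated and its TLB entries removed before a new mapping for
 * the same IPA range is written, otherwise TLB conflicts could occur.
 */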
1246 
1247 static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
1248 			       struct kvm_mmu_memory_cache *cache,
1249 			       phys_addr_t addr, const pud_t *new_pudp)
1250 {
1251 	struct kvm *kvm = mmu->kvm;
1252 	pud_t *pudp, old_pud;
1253 
1254 retry:
1255 	pudp = stage2_get_pud(mmu, cache, addr);
1256 	VM_BUG_ON(!pudp);
1257 
1258 	old_pud = *pudp;
1259 
1260 	/*
1261 	 * A large number of vcpus faulting on the same stage 2 entry
1262 	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
1263 	 * Skip updating the page tables if there is no change.
1264 	 */
1265 	if (pud_val(old_pud) == pud_val(*new_pudp))
1266 		return 0;
1267 
1268 	if (stage2_pud_present(kvm, old_pud)) {
1269 		/*
1270 		 * If we already have table level mapping for this block, unmap
1271 		 * the range for this block and retry.
1272 		 */
1273 		if (!stage2_pud_huge(kvm, old_pud)) {
1274 			unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
1275 			goto retry;
1276 		}
1277 
1278 		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
1279 		stage2_pud_clear(kvm, pudp);
1280 		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
1281 	} else {
1282 		get_page(virt_to_page(pudp));
1283 	}
1284 
1285 	kvm_set_pud(pudp, *new_pudp);
1286 	return 0;
1287 }
1288 
1289 /*
1290  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1291  * true if a valid and present leaf-entry is found. A pointer to the
1292  * leaf-entry is returned in the appropriate level variable - pudpp,
1293  * pmdpp, ptepp.
1294  */
1295 static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
1296 				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1297 {
1298 	struct kvm *kvm = mmu->kvm;
1299 	pud_t *pudp;
1300 	pmd_t *pmdp;
1301 	pte_t *ptep;
1302 
1303 	*pudpp = NULL;
1304 	*pmdpp = NULL;
1305 	*ptepp = NULL;
1306 
1307 	pudp = stage2_get_pud(mmu, NULL, addr);
1308 	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1309 		return false;
1310 
1311 	if (stage2_pud_huge(kvm, *pudp)) {
1312 		*pudpp = pudp;
1313 		return true;
1314 	}
1315 
1316 	pmdp = stage2_pmd_offset(kvm, pudp, addr);
1317 	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1318 		return false;
1319 
1320 	if (pmd_thp_or_huge(*pmdp)) {
1321 		*pmdpp = pmdp;
1322 		return true;
1323 	}
1324 
1325 	ptep = pte_offset_kernel(pmdp, addr);
1326 	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1327 		return false;
1328 
1329 	*ptepp = ptep;
1330 	return true;
1331 }
1332 
1333 static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
1334 {
1335 	pud_t *pudp;
1336 	pmd_t *pmdp;
1337 	pte_t *ptep;
1338 	bool found;
1339 
1340 	found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
1341 	if (!found)
1342 		return false;
1343 
1344 	if (pudp)
1345 		return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1346 	else if (pmdp)
1347 		return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1348 	else
1349 		return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1350 }
1351 
1352 static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1353 			  struct kvm_mmu_memory_cache *cache,
1354 			  phys_addr_t addr, const pte_t *new_pte,
1355 			  unsigned long flags)
1356 {
1357 	struct kvm *kvm = mmu->kvm;
1358 	pud_t *pud;
1359 	pmd_t *pmd;
1360 	pte_t *pte, old_pte;
1361 	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1362 	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1363 
1364 	VM_BUG_ON(logging_active && !cache);
1365 
1366 	/* Create stage-2 page table mapping - Levels 0 and 1 */
1367 	pud = stage2_get_pud(mmu, cache, addr);
1368 	if (!pud) {
1369 		/*
1370 		 * Ignore calls from kvm_set_spte_hva for unallocated
1371 		 * address ranges.
1372 		 */
1373 		return 0;
1374 	}
1375 
1376 	/*
1377 	 * While dirty page logging - dissolve huge PUD, then continue
1378 	 * on to allocate page.
1379 	 */
1380 	if (logging_active)
1381 		stage2_dissolve_pud(mmu, addr, pud);
1382 
1383 	if (stage2_pud_none(kvm, *pud)) {
1384 		if (!cache)
1385 			return 0; /* ignore calls from kvm_set_spte_hva */
1386 		pmd = kvm_mmu_memory_cache_alloc(cache);
1387 		stage2_pud_populate(kvm, pud, pmd);
1388 		get_page(virt_to_page(pud));
1389 	}
1390 
1391 	pmd = stage2_pmd_offset(kvm, pud, addr);
1392 	if (!pmd) {
1393 		/*
1394 		 * Ignore calls from kvm_set_spte_hva for unallocated
1395 		 * address ranges.
1396 		 */
1397 		return 0;
1398 	}
1399 
1400 	/*
1401 	 * While dirty page logging - dissolve huge PMD, then continue on to
1402 	 * allocate page.
1403 	 */
1404 	if (logging_active)
1405 		stage2_dissolve_pmd(mmu, addr, pmd);
1406 
1407 	/* Create stage-2 page mappings - Level 2 */
1408 	if (pmd_none(*pmd)) {
1409 		if (!cache)
1410 			return 0; /* ignore calls from kvm_set_spte_hva */
1411 		pte = kvm_mmu_memory_cache_alloc(cache);
1412 		kvm_pmd_populate(pmd, pte);
1413 		get_page(virt_to_page(pmd));
1414 	}
1415 
1416 	pte = pte_offset_kernel(pmd, addr);
1417 
1418 	if (iomap && pte_present(*pte))
1419 		return -EFAULT;
1420 
1421 	/* Create 2nd stage page table mapping - Level 3 */
1422 	old_pte = *pte;
1423 	if (pte_present(old_pte)) {
1424 		/* Skip page table update if there is no change */
1425 		if (pte_val(old_pte) == pte_val(*new_pte))
1426 			return 0;
1427 
1428 		kvm_set_pte(pte, __pte(0));
1429 		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
1430 	} else {
1431 		get_page(virt_to_page(pte));
1432 	}
1433 
1434 	kvm_set_pte(pte, *new_pte);
1435 	return 0;
1436 }
1437 
1438 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1439 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1440 {
1441 	if (pte_young(*pte)) {
1442 		*pte = pte_mkold(*pte);
1443 		return 1;
1444 	}
1445 	return 0;
1446 }
1447 #else
1448 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1449 {
1450 	return __ptep_test_and_clear_young(pte);
1451 }
1452 #endif
1453 
1454 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1455 {
1456 	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1457 }
1458 
1459 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1460 {
1461 	return stage2_ptep_test_and_clear_young((pte_t *)pud);
1462 }
1463 
1464 /**
1465  * kvm_phys_addr_ioremap - map a device range to guest IPA
1466  *
1467  * @kvm:	The KVM pointer
1468  * @guest_ipa:	The IPA at which to insert the mapping
1469  * @pa:		The physical address of the device
1470  * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
1471  */
1472 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1473 			  phys_addr_t pa, unsigned long size, bool writable)
1474 {
1475 	phys_addr_t addr, end;
1476 	int ret = 0;
1477 	unsigned long pfn;
1478 	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
1479 
1480 	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1481 	pfn = __phys_to_pfn(pa);
1482 
1483 	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1484 		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1485 
1486 		if (writable)
1487 			pte = kvm_s2pte_mkwrite(pte);
1488 
1489 		ret = kvm_mmu_topup_memory_cache(&cache,
1490 						 kvm_mmu_cache_min_pages(kvm));
1491 		if (ret)
1492 			goto out;
1493 		spin_lock(&kvm->mmu_lock);
1494 		ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
1495 				     KVM_S2PTE_FLAG_IS_IOMAP);
1496 		spin_unlock(&kvm->mmu_lock);
1497 		if (ret)
1498 			goto out;
1499 
1500 		pfn++;
1501 	}
1502 
1503 out:
1504 	kvm_mmu_free_memory_cache(&cache);
1505 	return ret;
1506 }
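/*
 * Illustrative usage (sketch with made-up addresses): exposing one host
 * device page read-only at guest IPA 0x8000000 would look like:
 *
 *	ret = kvm_phys_addr_ioremap(kvm, 0x8000000, dev_pa, PAGE_SIZE, false);
 *
 * Each page is installed as a PAGE_S2_DEVICE PTE through stage2_set_pte()
 * with KVM_S2PTE_FLAG_IS_IOMAP set.
 */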
1507 
1508 /**
1509  * stage2_wp_ptes - write protect PMD range
1510  * @pmd:	pointer to pmd entry
1511  * @addr:	range start address
1512  * @end:	range end address
1513  */
1514 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1515 {
1516 	pte_t *pte;
1517 
1518 	pte = pte_offset_kernel(pmd, addr);
1519 	do {
1520 		if (!pte_none(*pte)) {
1521 			if (!kvm_s2pte_readonly(pte))
1522 				kvm_set_s2pte_readonly(pte);
1523 		}
1524 	} while (pte++, addr += PAGE_SIZE, addr != end);
1525 }
1526 
1527 /**
1528  * stage2_wp_pmds - write protect PUD range
1529  * @mmu:	stage-2 MMU instance of the VM
1530  * @pud:	pointer to pud entry
1531  * @addr:	range start address
1532  * @end:	range end address
1533  */
1534 static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
1535 			   phys_addr_t addr, phys_addr_t end)
1536 {
1537 	struct kvm *kvm = mmu->kvm;
1538 	pmd_t *pmd;
1539 	phys_addr_t next;
1540 
1541 	pmd = stage2_pmd_offset(kvm, pud, addr);
1542 
1543 	do {
1544 		next = stage2_pmd_addr_end(kvm, addr, end);
1545 		if (!pmd_none(*pmd)) {
1546 			if (pmd_thp_or_huge(*pmd)) {
1547 				if (!kvm_s2pmd_readonly(pmd))
1548 					kvm_set_s2pmd_readonly(pmd);
1549 			} else {
1550 				stage2_wp_ptes(pmd, addr, next);
1551 			}
1552 		}
1553 	} while (pmd++, addr = next, addr != end);
1554 }
1555 
1556 /**
1557  * stage2_wp_puds - write protect P4D range
1558  * @p4d:	pointer to p4d entry
1559  * @addr:	range start address
1560  * @end:	range end address
1561  */
1562 static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
1563 			    phys_addr_t addr, phys_addr_t end)
1564 {
1565 	struct kvm *kvm = mmu->kvm;
1566 	pud_t *pud;
1567 	phys_addr_t next;
1568 
1569 	pud = stage2_pud_offset(kvm, p4d, addr);
1570 	do {
1571 		next = stage2_pud_addr_end(kvm, addr, end);
1572 		if (!stage2_pud_none(kvm, *pud)) {
1573 			if (stage2_pud_huge(kvm, *pud)) {
1574 				if (!kvm_s2pud_readonly(pud))
1575 					kvm_set_s2pud_readonly(pud);
1576 			} else {
1577 				stage2_wp_pmds(mmu, pud, addr, next);
1578 			}
1579 		}
1580 	} while (pud++, addr = next, addr != end);
1581 }
1582 
1583 /**
1584  * stage2_wp_p4ds - write protect PGD range
1585  * @pgd:	pointer to pgd entry
1586  * @addr:	range start address
1587  * @end:	range end address
1588  */
1589 static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
1590 			    phys_addr_t addr, phys_addr_t end)
1591 {
1592 	struct kvm *kvm = mmu->kvm;
1593 	p4d_t *p4d;
1594 	phys_addr_t next;
1595 
1596 	p4d = stage2_p4d_offset(kvm, pgd, addr);
1597 	do {
1598 		next = stage2_p4d_addr_end(kvm, addr, end);
1599 		if (!stage2_p4d_none(kvm, *p4d))
1600 			stage2_wp_puds(mmu, p4d, addr, next);
1601 	} while (p4d++, addr = next, addr != end);
1602 }
1603 
1604 /**
1605  * stage2_wp_range() - write protect stage2 memory region range
1606  * @mmu:	The KVM stage-2 MMU pointer
1607  * @addr:	Start address of range
1608  * @end:	End address of range
1609  */
1610 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1611 {
1612 	struct kvm *kvm = mmu->kvm;
1613 	pgd_t *pgd;
1614 	phys_addr_t next;
1615 
1616 	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
1617 	do {
1618 		/*
1619 		 * Release kvm_mmu_lock periodically if the memory region is
1620 		 * large. Otherwise, we may see kernel panics with
1621 		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1622 		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1623 		 * will also starve other vCPUs. We have to also make sure
1624 		 * that the page tables are not freed while we released
1625 		 * the lock.
1626 		 */
1627 		cond_resched_lock(&kvm->mmu_lock);
1628 		if (!READ_ONCE(mmu->pgd))
1629 			break;
1630 		next = stage2_pgd_addr_end(kvm, addr, end);
1631 		if (stage2_pgd_present(kvm, *pgd))
1632 			stage2_wp_p4ds(mmu, pgd, addr, next);
1633 	} while (pgd++, addr = next, addr != end);
1634 }
1635 
1636 /**
1637  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1638  * @kvm:	The KVM pointer
1639  * @slot:	The memory slot to write protect
1640  *
1641  * Called to start logging dirty pages after the memory region's
1642  * KVM_MEM_LOG_DIRTY_PAGES flag is set. After this function returns,
1643  * all present PUDs, PMDs and PTEs in the memory region are write protected.
1644  * Afterwards, the dirty page log can be read.
1645  *
1646  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1647  * serializing operations for VM memory regions.
1648  */
1649 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1650 {
1651 	struct kvm_memslots *slots = kvm_memslots(kvm);
1652 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1653 	phys_addr_t start, end;
1654 
1655 	if (WARN_ON_ONCE(!memslot))
1656 		return;
1657 
1658 	start = memslot->base_gfn << PAGE_SHIFT;
1659 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1660 
1661 	spin_lock(&kvm->mmu_lock);
1662 	stage2_wp_range(&kvm->arch.mmu, start, end);
1663 	spin_unlock(&kvm->mmu_lock);
1664 	kvm_flush_remote_tlbs(kvm);
1665 }
1666 
1667 /**
1668  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1669  * @kvm:	The KVM pointer
1670  * @slot:	The memory slot associated with mask
1671  * @gfn_offset:	The gfn offset in memory slot
1672  * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1673  *		slot to be write protected
1674  *
1675  * Walks the bits set in mask and write protects the associated PTEs. The
1676  * caller must acquire kvm_mmu_lock.
1677  */
1678 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1679 		struct kvm_memory_slot *slot,
1680 		gfn_t gfn_offset, unsigned long mask)
1681 {
1682 	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1683 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1684 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1685 
1686 	stage2_wp_range(&kvm->arch.mmu, start, end);
1687 }
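/*
 * Worked example (illustrative): with gfn_offset == 0 and mask == 0x0b
 * (bits 0, 1 and 3 set), __ffs(mask) == 0 and __fls(mask) == 3, so the
 * write-protected range spans gfns base_gfn .. base_gfn + 3. Pages whose
 * bit is clear inside that span are simply re-write-protected, which is
 * harmless.
 */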
1688 
1689 /*
1690  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1691  * dirty pages.
1692  *
1693  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1694  * enable dirty logging for them.
1695  */
1696 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1697 		struct kvm_memory_slot *slot,
1698 		gfn_t gfn_offset, unsigned long mask)
1699 {
1700 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1701 }
1702 
1703 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1704 {
1705 	__clean_dcache_guest_page(pfn, size);
1706 }
1707 
1708 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1709 {
1710 	__invalidate_icache_guest_page(pfn, size);
1711 }
1712 
1713 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1714 {
1715 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1716 }
1717 
1718 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1719 					       unsigned long hva,
1720 					       unsigned long map_size)
1721 {
1722 	gpa_t gpa_start;
1723 	hva_t uaddr_start, uaddr_end;
1724 	size_t size;
1725 
1726 	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1727 	if (map_size == PAGE_SIZE)
1728 		return true;
1729 
1730 	size = memslot->npages * PAGE_SIZE;
1731 
1732 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
1733 
1734 	uaddr_start = memslot->userspace_addr;
1735 	uaddr_end = uaddr_start + size;
1736 
1737 	/*
1738 	 * Pages belonging to memslots that don't have the same alignment
1739 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1740 	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1741 	 *
1742 	 * Consider a layout like the following:
1743 	 *
1744 	 *    memslot->userspace_addr:
1745 	 *    +-----+--------------------+--------------------+---+
1746 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1747 	 *    +-----+--------------------+--------------------+---+
1748 	 *
1749 	 *    memslot->base_gfn << PAGE_SHIFT:
1750 	 *      +---+--------------------+--------------------+-----+
1751 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1752 	 *      +---+--------------------+--------------------+-----+
1753 	 *
1754 	 * If we create those stage-2 blocks, we'll end up with this incorrect
1755 	 * mapping:
1756 	 *   d -> f
1757 	 *   e -> g
1758 	 *   f -> h
1759 	 */
1760 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1761 		return false;
1762 
1763 	/*
1764 	 * Next, let's make sure we're not trying to map anything not covered
1765 	 * by the memslot. This means we have to prohibit block size mappings
1766 	 * for the beginning and end of a non-block aligned and non-block sized
1767 	 * memory slot (illustrated by the head and tail parts of the
1768 	 * userspace view above containing pages 'abcde' and 'xyz',
1769 	 * respectively).
1770 	 *
1771 	 * Note that it doesn't matter if we do the check using the
1772 	 * userspace_addr or the base_gfn, as both are equally aligned (per
1773 	 * the check above) and equally sized.
1774 	 */
1775 	return (hva & ~(map_size - 1)) >= uaddr_start &&
1776 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1777 }
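/*
 * Worked example of the alignment check above (made-up addresses, 4K pages
 * so PMD_SIZE == 2MB): a memslot whose userspace_addr starts at 0x40001000
 * but whose base_gfn corresponds to IPA 0x80000000 has
 * (uaddr_start & 0x1fffff) == 0x1000 while (gpa_start & 0x1fffff) == 0, so
 * PMD block mappings are refused and the fault falls back to PAGE_SIZE.
 */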
1778 
1779 /*
1780  * Check if the given hva is backed by a transparent huge page (THP) and
1781  * whether it can be mapped using block mapping in stage2. If so, adjust
1782  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1783  * supported. This will need to be updated to support other THP sizes.
1784  *
1785  * Returns the size of the mapping.
1786  */
1787 static unsigned long
1788 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1789 			    unsigned long hva, kvm_pfn_t *pfnp,
1790 			    phys_addr_t *ipap)
1791 {
1792 	kvm_pfn_t pfn = *pfnp;
1793 
1794 	/*
1795 	 * Make sure the adjustment is done only for THP pages. Also make
1796 	 * sure that the HVA and IPA are sufficiently aligned and that the
1797 	 * block map is contained within the memslot.
1798 	 */
1799 	if (kvm_is_transparent_hugepage(pfn) &&
1800 	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1801 		/*
1802 		 * The address we faulted on is backed by a transparent huge
1803 		 * page.  However, because we map the compound huge page and
1804 		 * not the individual tail page, we need to transfer the
1805 		 * refcount to the head page.  We have to be careful that the
1806 		 * THP doesn't start to split while we are adjusting the
1807 		 * refcounts.
1808 		 *
1809 		 * We are sure this doesn't happen, because mmu_notifier_retry
1810 		 * was successful and we are holding the mmu_lock, so if this
1811 		 * THP is trying to split, it will be blocked in the mmu
1812 		 * notifier before touching any of the pages, specifically
1813 		 * before being able to call __split_huge_page_refcount().
1814 		 *
1815 		 * We can therefore safely transfer the refcount from PG_tail
1816 		 * to PG_head and switch the pfn from a tail page to the head
1817 		 * page accordingly.
1818 		 */
1819 		*ipap &= PMD_MASK;
1820 		kvm_release_pfn_clean(pfn);
1821 		pfn &= ~(PTRS_PER_PMD - 1);
1822 		kvm_get_pfn(pfn);
1823 		*pfnp = pfn;
1824 
1825 		return PMD_SIZE;
1826 	}
1827 
1828 	/* Use page mapping if we cannot use block mapping. */
1829 	return PAGE_SIZE;
1830 }
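/*
 * Worked example (illustrative, 4K pages so PTRS_PER_PMD == 512): for a
 * faulting pfn of 0x12345 inside a 2MB THP, pfn &= ~(PTRS_PER_PMD - 1)
 * yields 0x12200, the head page of the huge page, and *ipap &= PMD_MASK
 * rounds the IPA down to the matching 2MB boundary.
 */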
1831 
1832 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1833 			  struct kvm_memory_slot *memslot, unsigned long hva,
1834 			  unsigned long fault_status)
1835 {
1836 	int ret;
1837 	bool write_fault, writable, force_pte = false;
1838 	bool exec_fault, needs_exec;
1839 	unsigned long mmu_seq;
1840 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1841 	struct kvm *kvm = vcpu->kvm;
1842 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1843 	struct vm_area_struct *vma;
1844 	short vma_shift;
1845 	kvm_pfn_t pfn;
1846 	pgprot_t mem_type = PAGE_S2;
1847 	bool logging_active = memslot_is_logging(memslot);
1848 	unsigned long vma_pagesize, flags = 0;
1849 	struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1850 
1851 	write_fault = kvm_is_write_fault(vcpu);
1852 	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1853 	VM_BUG_ON(write_fault && exec_fault);
1854 
1855 	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1856 		kvm_err("Unexpected L2 read permission error\n");
1857 		return -EFAULT;
1858 	}
1859 
1860 	/* Let's check if we will get back a huge page backed by hugetlbfs */
1861 	mmap_read_lock(current->mm);
1862 	vma = find_vma_intersection(current->mm, hva, hva + 1);
1863 	if (unlikely(!vma)) {
1864 		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1865 		mmap_read_unlock(current->mm);
1866 		return -EFAULT;
1867 	}
1868 
1869 	if (is_vm_hugetlb_page(vma))
1870 		vma_shift = huge_page_shift(hstate_vma(vma));
1871 	else
1872 		vma_shift = PAGE_SHIFT;
1873 
1874 	vma_pagesize = 1ULL << vma_shift;
1875 	if (logging_active ||
1876 	    (vma->vm_flags & VM_PFNMAP) ||
1877 	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1878 		force_pte = true;
1879 		vma_pagesize = PAGE_SIZE;
1880 	}
1881 
1882 	/*
1883 	 * The stage2 page tables have a minimum of two levels (for arm64, see
1884 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1885 	 * use PMD_SIZE huge mappings (even when the PMD is folded into the PGD).
1886 	 * As for PUD huge mappings, we must make sure that we have at
1887 	 * least three levels, i.e., that the PMD is not folded.
1888 	 */
1889 	if (vma_pagesize == PMD_SIZE ||
1890 	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1891 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1892 	mmap_read_unlock(current->mm);
1893 
1894 	/* We need minimum second+third level pages */
1895 	ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
1896 	if (ret)
1897 		return ret;
1898 
1899 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1900 	/*
1901 	 * Ensure the read of mmu_notifier_seq happens before we call
1902 	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1903 	 * the page we just got a reference to being unmapped before we have a
1904 	 * chance to grab the mmu_lock, which ensures that if the page gets
1905 	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1906 	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1907 	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1908 	 */
1909 	smp_rmb();
1910 
1911 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1912 	if (pfn == KVM_PFN_ERR_HWPOISON) {
1913 		kvm_send_hwpoison_signal(hva, vma_shift);
1914 		return 0;
1915 	}
1916 	if (is_error_noslot_pfn(pfn))
1917 		return -EFAULT;
1918 
1919 	if (kvm_is_device_pfn(pfn)) {
1920 		mem_type = PAGE_S2_DEVICE;
1921 		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1922 	} else if (logging_active) {
1923 		/*
1924 		 * Faults on pages in a memslot with logging enabled
1925 		 * should not be mapped with huge pages (it introduces churn
1926 		 * and performance degradation), so force a pte mapping.
1927 		 */
1928 		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1929 
1930 		/*
1931 		 * Only actually map the page as writable if this was a write
1932 		 * fault.
1933 		 */
1934 		if (!write_fault)
1935 			writable = false;
1936 	}
1937 
1938 	if (exec_fault && is_iomap(flags))
1939 		return -ENOEXEC;
1940 
1941 	spin_lock(&kvm->mmu_lock);
1942 	if (mmu_notifier_retry(kvm, mmu_seq))
1943 		goto out_unlock;
1944 
1945 	/*
1946 	 * If we are not forced to use page mapping, check if we are
1947 	 * backed by a THP and thus use block mapping if possible.
1948 	 */
1949 	if (vma_pagesize == PAGE_SIZE && !force_pte)
1950 		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1951 							   &pfn, &fault_ipa);
1952 	if (writable)
1953 		kvm_set_pfn_dirty(pfn);
1954 
1955 	if (fault_status != FSC_PERM && !is_iomap(flags))
1956 		clean_dcache_guest_page(pfn, vma_pagesize);
1957 
1958 	if (exec_fault)
1959 		invalidate_icache_guest_page(pfn, vma_pagesize);
1960 
1961 	/*
1962 	 * If we took an execution fault we have made the
1963 	 * icache/dcache coherent above and should now let the s2
1964 	 * mapping be executable.
1965 	 *
1966 	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1967 	 * execute permissions, and we preserve whatever we have.
1968 	 */
1969 	needs_exec = exec_fault ||
1970 		(fault_status == FSC_PERM &&
1971 		 stage2_is_exec(mmu, fault_ipa, vma_pagesize));
1972 
1973 	if (vma_pagesize == PUD_SIZE) {
1974 		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1975 
1976 		new_pud = kvm_pud_mkhuge(new_pud);
1977 		if (writable)
1978 			new_pud = kvm_s2pud_mkwrite(new_pud);
1979 
1980 		if (needs_exec)
1981 			new_pud = kvm_s2pud_mkexec(new_pud);
1982 
1983 		ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
1984 	} else if (vma_pagesize == PMD_SIZE) {
1985 		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1986 
1987 		new_pmd = kvm_pmd_mkhuge(new_pmd);
1988 
1989 		if (writable)
1990 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1991 
1992 		if (needs_exec)
1993 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
1994 
1995 		ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
1996 	} else {
1997 		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1998 
1999 		if (writable) {
2000 			new_pte = kvm_s2pte_mkwrite(new_pte);
2001 			mark_page_dirty(kvm, gfn);
2002 		}
2003 
2004 		if (needs_exec)
2005 			new_pte = kvm_s2pte_mkexec(new_pte);
2006 
2007 		ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
2008 	}
2009 
2010 out_unlock:
2011 	spin_unlock(&kvm->mmu_lock);
2012 	kvm_set_pfn_accessed(pfn);
2013 	kvm_release_pfn_clean(pfn);
2014 	return ret;
2015 }
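
/*
 * In outline, and eliding the device and dirty-logging special cases, the
 * mapping granule chosen above can be paraphrased as the following sketch
 * (not literal code):
 *
 *	if (logging_active || the VMA is VM_PFNMAP ||
 *	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize))
 *		map a single page (force_pte);
 *	else if (the VMA is a hugetlb VMA)
 *		map a block of 1ULL << huge_page_shift() (PUD or PMD);
 *	else if (the page is THP-backed and sufficiently aligned)
 *		map a PMD_SIZE block (transparent_hugepage_adjust());
 *	else
 *		map a single page;
 */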
2016 
2017 /*
2018  * Resolve the access fault by making the page young again.
2019  * Note that because the faulting entry is guaranteed not to be
2020  * cached in the TLB, we don't need to invalidate anything.
2021  * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
2022  * so there is no need for atomic (pte|pmd)_mkyoung operations.
2023  */
2024 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
2025 {
2026 	pud_t *pud;
2027 	pmd_t *pmd;
2028 	pte_t *pte;
2029 	kvm_pfn_t pfn;
2030 	bool pfn_valid = false;
2031 
2032 	trace_kvm_access_fault(fault_ipa);
2033 
2034 	spin_lock(&vcpu->kvm->mmu_lock);
2035 
2036 	if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
2037 		goto out;
2038 
2039 	if (pud) {		/* HugeTLB */
2040 		*pud = kvm_s2pud_mkyoung(*pud);
2041 		pfn = kvm_pud_pfn(*pud);
2042 		pfn_valid = true;
2043 	} else if (pmd) {	/* THP, HugeTLB */
2044 		*pmd = pmd_mkyoung(*pmd);
2045 		pfn = pmd_pfn(*pmd);
2046 		pfn_valid = true;
2047 	} else {
2048 		*pte = pte_mkyoung(*pte);	/* Just a page... */
2049 		pfn = pte_pfn(*pte);
2050 		pfn_valid = true;
2051 	}
2052 
2053 out:
2054 	spin_unlock(&vcpu->kvm->mmu_lock);
2055 	if (pfn_valid)
2056 		kvm_set_pfn_accessed(pfn);
2057 }
2058 
2059 /**
2060  * kvm_handle_guest_abort - handles all 2nd stage aborts
2061  * @vcpu:	the VCPU pointer
2062  *
2063  * Any abort that gets to the host is almost guaranteed to be caused by a
2064  * missing second stage translation table entry. This can mean either that the
2065  * guest simply needs more memory and we must allocate an appropriate page, or
2066  * that the guest tried to access I/O memory, which is emulated by user
2067  * space. The distinction is based on the IPA causing the fault and whether this
2068  * memory region has been registered as standard RAM by user space.
2069  */
2070 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
2071 {
2072 	unsigned long fault_status;
2073 	phys_addr_t fault_ipa;
2074 	struct kvm_memory_slot *memslot;
2075 	unsigned long hva;
2076 	bool is_iabt, write_fault, writable;
2077 	gfn_t gfn;
2078 	int ret, idx;
2079 
2080 	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
2081 
2082 	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
2083 	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
2084 
2085 	/* Synchronous External Abort? */
2086 	if (kvm_vcpu_abt_issea(vcpu)) {
2087 		/*
2088 		 * For RAS the host kernel may handle this abort.
2089 		 * There is no need to pass the error into the guest.
2090 		 */
2091 		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
2092 			kvm_inject_vabt(vcpu);
2093 
2094 		return 1;
2095 	}
2096 
2097 	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
2098 			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
2099 
2100 	/* Check that the stage-2 fault is a translation, permission or access fault */
2101 	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
2102 	    fault_status != FSC_ACCESS) {
2103 		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
2104 			kvm_vcpu_trap_get_class(vcpu),
2105 			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
2106 			(unsigned long)kvm_vcpu_get_esr(vcpu));
2107 		return -EFAULT;
2108 	}
2109 
2110 	idx = srcu_read_lock(&vcpu->kvm->srcu);
2111 
2112 	gfn = fault_ipa >> PAGE_SHIFT;
2113 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
2114 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
2115 	write_fault = kvm_is_write_fault(vcpu);
2116 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
2117 		/*
2118 		 * The guest has put either its instructions or its page-tables
2119 		 * somewhere it shouldn't have. Userspace won't be able to do
2120 		 * anything about this (there's no syndrome for a start), so
2121 		 * re-inject the abort back into the guest.
2122 		 */
2123 		if (is_iabt) {
2124 			ret = -ENOEXEC;
2125 			goto out;
2126 		}
2127 
2128 		if (kvm_vcpu_dabt_iss1tw(vcpu)) {
2129 			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2130 			ret = 1;
2131 			goto out_unlock;
2132 		}
2133 
2134 		/*
2135 		 * Check for a cache maintenance operation. Since we
2136 		 * ended-up here, we know it is outside of any memory
2137 		 * slot. But we can't find out if that is for a device,
2138 		 * or if the guest is just being stupid. The only thing
2139 		 * we know for sure is that this range cannot be cached.
2140 		 *
2141 		 * So let's assume that the guest is just being
2142 		 * cautious, and skip the instruction.
2143 		 */
2144 		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
2145 			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
2146 			ret = 1;
2147 			goto out_unlock;
2148 		}
2149 
2150 		/*
2151 		 * The IPA is reported as [MAX:12], so we need to
2152 		 * complement it with the bottom 12 bits from the
2153 		 * faulting VA. This is always 12 bits, irrespective
2154 		 * of the page size.
2155 		 */
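		/*
		 * For example (hypothetical values): if HPFAR reports
		 * fault_ipa == 0x80123000 and the faulting VA in HFAR ends
		 * in 0x2c8, the combined IPA handed to io_mem_abort() is
		 * 0x801232c8.
		 */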
2156 		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
2157 		ret = io_mem_abort(vcpu, fault_ipa);
2158 		goto out_unlock;
2159 	}
2160 
2161 	/* Userspace should not be able to register out-of-bounds IPAs */
2162 	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
2163 
2164 	if (fault_status == FSC_ACCESS) {
2165 		handle_access_fault(vcpu, fault_ipa);
2166 		ret = 1;
2167 		goto out_unlock;
2168 	}
2169 
2170 	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
2171 	if (ret == 0)
2172 		ret = 1;
2173 out:
2174 	if (ret == -ENOEXEC) {
2175 		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2176 		ret = 1;
2177 	}
2178 out_unlock:
2179 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2180 	return ret;
2181 }
2182 
2183 static int handle_hva_to_gpa(struct kvm *kvm,
2184 			     unsigned long start,
2185 			     unsigned long end,
2186 			     int (*handler)(struct kvm *kvm,
2187 					    gpa_t gpa, u64 size,
2188 					    void *data),
2189 			     void *data)
2190 {
2191 	struct kvm_memslots *slots;
2192 	struct kvm_memory_slot *memslot;
2193 	int ret = 0;
2194 
2195 	slots = kvm_memslots(kvm);
2196 
2197 	/* we only care about the pages that the guest sees */
2198 	kvm_for_each_memslot(memslot, slots) {
2199 		unsigned long hva_start, hva_end;
2200 		gpa_t gpa;
2201 
2202 		hva_start = max(start, memslot->userspace_addr);
2203 		hva_end = min(end, memslot->userspace_addr +
2204 					(memslot->npages << PAGE_SHIFT));
2205 		if (hva_start >= hva_end)
2206 			continue;
2207 
2208 		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
2209 		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
2210 	}
2211 
2212 	return ret;
2213 }
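
/*
 * A worked example of the iteration above, with hypothetical numbers: for a
 * memslot with userspace_addr = 0x7f0000000000, npages = 0x100 and
 * base_gfn = 0x80000, a notifier range of [0x7f0000004000, 0x7f0000008000)
 * lies entirely inside the slot, so hva_start/hva_end equal the range
 * itself, hva_to_gfn_memslot() yields gfn 0x80004, and the handler is
 * invoked with gpa = 0x80004000 and size = 0x4000.
 */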
2214 
2215 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2216 {
2217 	unsigned flags = *(unsigned *)data;
2218 	bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
2219 
2220 	__unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
2221 	return 0;
2222 }
2223 
2224 int kvm_unmap_hva_range(struct kvm *kvm,
2225 			unsigned long start, unsigned long end, unsigned flags)
2226 {
2227 	if (!kvm->arch.mmu.pgd)
2228 		return 0;
2229 
2230 	trace_kvm_unmap_hva_range(start, end);
2231 	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
2232 	return 0;
2233 }
2234 
2235 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2236 {
2237 	pte_t *pte = (pte_t *)data;
2238 
2239 	WARN_ON(size != PAGE_SIZE);
2240 	/*
2241 	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
2242 	 * flag clear because MMU notifiers will have unmapped a huge PMD before
2243 	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2244 	 * therefore stage2_set_pte() never needs to clear out a huge PMD
2245 	 * through this calling path.
2246 	 */
2247 	stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
2248 	return 0;
2249 }
2250 
2251 
2252 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2253 {
2254 	unsigned long end = hva + PAGE_SIZE;
2255 	kvm_pfn_t pfn = pte_pfn(pte);
2256 	pte_t stage2_pte;
2257 
2258 	if (!kvm->arch.mmu.pgd)
2259 		return 0;
2260 
2261 	trace_kvm_set_spte_hva(hva);
2262 
2263 	/*
2264 	 * We've moved a page around, probably through CoW, so let's treat it
2265 	 * just like a translation fault and clean the cache to the PoC.
2266 	 */
2267 	clean_dcache_guest_page(pfn, PAGE_SIZE);
2268 	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
2269 	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2270 
2271 	return 0;
2272 }
2273 
2274 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2275 {
2276 	pud_t *pud;
2277 	pmd_t *pmd;
2278 	pte_t *pte;
2279 
2280 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2281 	if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
2282 		return 0;
2283 
2284 	if (pud)
2285 		return stage2_pudp_test_and_clear_young(pud);
2286 	else if (pmd)
2287 		return stage2_pmdp_test_and_clear_young(pmd);
2288 	else
2289 		return stage2_ptep_test_and_clear_young(pte);
2290 }
2291 
2292 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2293 {
2294 	pud_t *pud;
2295 	pmd_t *pmd;
2296 	pte_t *pte;
2297 
2298 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2299 	if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
2300 		return 0;
2301 
2302 	if (pud)
2303 		return kvm_s2pud_young(*pud);
2304 	else if (pmd)
2305 		return pmd_young(*pmd);
2306 	else
2307 		return pte_young(*pte);
2308 }
2309 
2310 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2311 {
2312 	if (!kvm->arch.mmu.pgd)
2313 		return 0;
2314 	trace_kvm_age_hva(start, end);
2315 	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2316 }
2317 
2318 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2319 {
2320 	if (!kvm->arch.mmu.pgd)
2321 		return 0;
2322 	trace_kvm_test_age_hva(hva);
2323 	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2324 				 kvm_test_age_hva_handler, NULL);
2325 }
2326 
2327 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2328 {
2329 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
2330 }
2331 
2332 phys_addr_t kvm_mmu_get_httbr(void)
2333 {
2334 	if (__kvm_cpu_uses_extended_idmap())
2335 		return virt_to_phys(merged_hyp_pgd);
2336 	else
2337 		return virt_to_phys(hyp_pgd);
2338 }
2339 
2340 phys_addr_t kvm_get_idmap_vector(void)
2341 {
2342 	return hyp_idmap_vector;
2343 }
2344 
2345 static int kvm_map_idmap_text(pgd_t *pgd)
2346 {
2347 	int err;
2348 
2349 	/* Create the idmap in the boot page tables */
2350 	err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
2351 				    hyp_idmap_start, hyp_idmap_end,
2352 				    __phys_to_pfn(hyp_idmap_start),
2353 				    PAGE_HYP_EXEC);
2354 	if (err)
2355 		kvm_err("Failed to idmap %lx-%lx\n",
2356 			hyp_idmap_start, hyp_idmap_end);
2357 
2358 	return err;
2359 }
2360 
2361 int kvm_mmu_init(void)
2362 {
2363 	int err;
2364 
2365 	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2366 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2367 	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2368 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2369 	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2370 
2371 	/*
2372 	 * We rely on the linker script to ensure at build time that the HYP
2373 	 * init code does not cross a page boundary.
2374 	 */
2375 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
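
	/*
	 * The XOR check above verifies that the first and last byte of the
	 * idmap text share the same page frame: e.g. (hypothetically) with
	 * 4K pages, start = 0x40321000 and end = 0x40322000 give
	 * (0x40321000 ^ 0x40321fff) & PAGE_MASK == 0, whereas an end of
	 * 0x40323000 would leave bits 12 and 13 set after the mask and
	 * trigger the BUG.
	 */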
2376 
2377 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2378 	kvm_debug("HYP VA range: %lx:%lx\n",
2379 		  kern_hyp_va(PAGE_OFFSET),
2380 		  kern_hyp_va((unsigned long)high_memory - 1));
2381 
2382 	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2383 	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2384 	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2385 		/*
2386 		 * The idmap page intersects with the HYP VA space,
2387 		 * so it is not safe to continue further.
2388 		 */
2389 		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2390 		err = -EINVAL;
2391 		goto out;
2392 	}
2393 
2394 	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
2395 	if (!hyp_pgd) {
2396 		kvm_err("Hyp mode PGD not allocated\n");
2397 		err = -ENOMEM;
2398 		goto out;
2399 	}
2400 
2401 	if (__kvm_cpu_uses_extended_idmap()) {
2402 		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2403 							 hyp_pgd_order);
2404 		if (!boot_hyp_pgd) {
2405 			kvm_err("Hyp boot PGD not allocated\n");
2406 			err = -ENOMEM;
2407 			goto out;
2408 		}
2409 
2410 		err = kvm_map_idmap_text(boot_hyp_pgd);
2411 		if (err)
2412 			goto out;
2413 
2414 		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
2415 		if (!merged_hyp_pgd) {
2416 			kvm_err("Failed to allocate extra HYP pgd\n");
			err = -ENOMEM;
2417 			goto out;
2418 		}
2419 		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
2420 				    hyp_idmap_start);
2421 	} else {
2422 		err = kvm_map_idmap_text(hyp_pgd);
2423 		if (err)
2424 			goto out;
2425 	}
2426 
2427 	io_map_base = hyp_idmap_start;
2428 	return 0;
2429 out:
2430 	free_hyp_pgds();
2431 	return err;
2432 }
2433 
2434 void kvm_arch_commit_memory_region(struct kvm *kvm,
2435 				   const struct kvm_userspace_memory_region *mem,
2436 				   struct kvm_memory_slot *old,
2437 				   const struct kvm_memory_slot *new,
2438 				   enum kvm_mr_change change)
2439 {
2440 	/*
2441 	 * At this point memslot has been committed and there is an
2442 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
2443 	 * memory slot is write protected.
2444 	 */
2445 	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2446 		/*
2447 		 * With initial-all-set, we don't need to write-protect
2448 		 * any pages because they're all reported as dirty.
2449 		 * Huge pages and normal pages will be write-protected gradually.
2450 		 */
2451 		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2452 			kvm_mmu_wp_memory_region(kvm, mem->slot);
2453 		}
2454 	}
2455 }
2456 
2457 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2458 				   struct kvm_memory_slot *memslot,
2459 				   const struct kvm_userspace_memory_region *mem,
2460 				   enum kvm_mr_change change)
2461 {
2462 	hva_t hva = mem->userspace_addr;
2463 	hva_t reg_end = hva + mem->memory_size;
2464 	bool writable = !(mem->flags & KVM_MEM_READONLY);
2465 	int ret = 0;
2466 
2467 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2468 			change != KVM_MR_FLAGS_ONLY)
2469 		return 0;
2470 
2471 	/*
2472 	 * Prevent userspace from creating a memory region outside of the IPA
2473 	 * space addressable by the KVM guest.
2474 	 */
2475 	if (memslot->base_gfn + memslot->npages >=
2476 	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
2477 		return -EFAULT;
2478 
2479 	mmap_read_lock(current->mm);
2480 	/*
2481 	 * A memory region could potentially cover multiple VMAs, and any holes
2482 	 * between them, so iterate over all of them to find out if we can map
2483 	 * any of them right now.
2484 	 *
2485 	 *     +--------------------------------------------+
2486 	 * +---------------+----------------+   +----------------+
2487 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2488 	 * +---------------+----------------+   +----------------+
2489 	 *     |               memory region                |
2490 	 *     +--------------------------------------------+
2491 	 */
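	/*
	 * As a hypothetical example of the loop below: if VMA 2 in the
	 * picture spans [0x7f0000200000, 0x7f0000600000) while the region
	 * starts at userspace_addr = 0x7f0000100000 with
	 * guest_phys_addr = 0x80000000, then for that VMA
	 * vm_start = 0x7f0000200000 and, if it is VM_PFNMAP with
	 * vm_pgoff = 0x100000, it is ioremapped at
	 * gpa = 0x80000000 + 0x100000 = 0x80100000 from
	 * pa = (0x100000 << 12) + 0 = 0x100000000.
	 */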
2492 	do {
2493 		struct vm_area_struct *vma = find_vma(current->mm, hva);
2494 		hva_t vm_start, vm_end;
2495 
2496 		if (!vma || vma->vm_start >= reg_end)
2497 			break;
2498 
2499 		/*
2500 		 * Take the intersection of this VMA with the memory region
2501 		 */
2502 		vm_start = max(hva, vma->vm_start);
2503 		vm_end = min(reg_end, vma->vm_end);
2504 
2505 		if (vma->vm_flags & VM_PFNMAP) {
2506 			gpa_t gpa = mem->guest_phys_addr +
2507 				    (vm_start - mem->userspace_addr);
2508 			phys_addr_t pa;
2509 
2510 			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2511 			pa += vm_start - vma->vm_start;
2512 
2513 			/* IO region dirty page logging not allowed */
2514 			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2515 				ret = -EINVAL;
2516 				goto out;
2517 			}
2518 
2519 			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2520 						    vm_end - vm_start,
2521 						    writable);
2522 			if (ret)
2523 				break;
2524 		}
2525 		hva = vm_end;
2526 	} while (hva < reg_end);
2527 
2528 	if (change == KVM_MR_FLAGS_ONLY)
2529 		goto out;
2530 
2531 	spin_lock(&kvm->mmu_lock);
2532 	if (ret)
2533 		unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
2534 	else
2535 		stage2_flush_memslot(kvm, memslot);
2536 	spin_unlock(&kvm->mmu_lock);
2537 out:
2538 	mmap_read_unlock(current->mm);
2539 	return ret;
2540 }
2541 
2542 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2543 {
2544 }
2545 
2546 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2547 {
2548 }
2549 
2550 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2551 {
2552 	kvm_free_stage2_pgd(&kvm->arch.mmu);
2553 }
2554 
2555 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2556 				   struct kvm_memory_slot *slot)
2557 {
2558 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2559 	phys_addr_t size = slot->npages << PAGE_SHIFT;
2560 
2561 	spin_lock(&kvm->mmu_lock);
2562 	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2563 	spin_unlock(&kvm->mmu_lock);
2564 }
2565 
2566 /*
2567  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2568  *
2569  * Main problems:
2570  * - S/W ops are local to a CPU (not broadcast)
2571  * - We have line migration behind our back (speculation)
2572  * - System caches don't support S/W at all (damn!)
2573  *
2574  * In the face of the above, the best we can do is to try and convert
2575  * S/W ops to VA ops. Because the guest is not allowed to infer the
2576  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2577  * which is a rather good thing for us.
2578  *
2579  * Also, it is only used when turning caches on/off ("The expected
2580  * usage of the cache maintenance instructions that operate by set/way
2581  * is associated with the cache maintenance instructions associated
2582  * with the powerdown and powerup of caches, if this is required by
2583  * the implementation.").
2584  *
2585  * We use the following policy:
2586  *
2587  * - If we trap a S/W operation, we enable VM trapping to detect
2588  *   caches being turned on/off, and do a full clean.
2589  *
2590  * - We flush the caches both when they are turned on and when turned off.
2591  *
2592  * - Once the caches are enabled, we stop trapping VM ops.
2593  */
2594 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2595 {
2596 	unsigned long hcr = *vcpu_hcr(vcpu);
2597 
2598 	/*
2599 	 * If this is the first time we do a S/W operation
2600 	 * (i.e. HCR_TVM not set), flush the whole VM and enable
2601 	 * VM trapping.
2602 	 *
2603 	 * Otherwise, rely on the VM trapping to wait for the MMU +
2604 	 * Caches to be turned off. At that point, we'll be able to
2605 	 * clean the caches again.
2606 	 */
2607 	if (!(hcr & HCR_TVM)) {
2608 		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2609 					vcpu_has_cache_enabled(vcpu));
2610 		stage2_flush_vm(vcpu->kvm);
2611 		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2612 	}
2613 }
2614 
2615 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2616 {
2617 	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2618 
2619 	/*
2620 	 * If the MMU+caches are being switched on, we need to invalidate
2621 	 * the caches. If they are being switched off, we need to clean the
2622 	 * caches. Clean + invalidate always does the trick.
2623 	 */
2624 	if (now_enabled != was_enabled)
2625 		stage2_flush_vm(vcpu->kvm);
2626 
2627 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2628 	if (now_enabled)
2629 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2630 
2631 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2632 }
2633