xref: /openbmc/linux/arch/arm64/kvm/mmu.c (revision 62975d27)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6 
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_ras.h>
18 #include <asm/kvm_asm.h>
19 #include <asm/kvm_emulate.h>
20 #include <asm/virt.h>
21 
22 #include "trace.h"
23 
24 static pgd_t *boot_hyp_pgd;
25 static pgd_t *hyp_pgd;
26 static pgd_t *merged_hyp_pgd;
27 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
28 
29 static unsigned long hyp_idmap_start;
30 static unsigned long hyp_idmap_end;
31 static phys_addr_t hyp_idmap_vector;
32 
33 static unsigned long io_map_base;
34 
35 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
36 
37 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
38 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
39 
40 static bool is_iomap(unsigned long flags)
41 {
42 	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
43 }
44 
45 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
46 {
47 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
48 }
49 
50 /**
51  * kvm_flush_remote_tlbs() - flush all VM TLB entries
52  * @kvm:	pointer to kvm structure.
53  *
54  * Interface to HYP function to flush all VM TLB entries
55  */
56 void kvm_flush_remote_tlbs(struct kvm *kvm)
57 {
58 	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
59 }
60 
61 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
62 {
63 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
64 }
65 
66 /*
67  * D-Cache management functions. They take the page table entries by
68  * value, as they are flushing the cache using the kernel mapping (or
69  * kmap on 32bit).
70  */
71 static void kvm_flush_dcache_pte(pte_t pte)
72 {
73 	__kvm_flush_dcache_pte(pte);
74 }
75 
76 static void kvm_flush_dcache_pmd(pmd_t pmd)
77 {
78 	__kvm_flush_dcache_pmd(pmd);
79 }
80 
81 static void kvm_flush_dcache_pud(pud_t pud)
82 {
83 	__kvm_flush_dcache_pud(pud);
84 }
85 
86 static bool kvm_is_device_pfn(unsigned long pfn)
87 {
88 	return !pfn_valid(pfn);
89 }
90 
91 /**
92  * stage2_dissolve_pmd() - clear and flush huge PMD entry
93  * @kvm:	pointer to kvm structure.
94  * @addr:	IPA
95  * @pmd:	pmd pointer for IPA
96  *
97  * Clears a PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
98  */
99 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
100 {
101 	if (!pmd_thp_or_huge(*pmd))
102 		return;
103 
104 	pmd_clear(pmd);
105 	kvm_tlb_flush_vmid_ipa(kvm, addr);
106 	put_page(virt_to_page(pmd));
107 }
108 
109 /**
110  * stage2_dissolve_pud() - clear and flush huge PUD entry
111  * @kvm:	pointer to kvm structure.
112  * @addr:	IPA
113  * @pudp:	pud pointer for IPA
114  *
115  * Clears a PUD entry and flushes the 1st and 2nd stage TLBs for @addr.
116  */
117 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
118 {
119 	if (!stage2_pud_huge(kvm, *pudp))
120 		return;
121 
122 	stage2_pud_clear(kvm, pudp);
123 	kvm_tlb_flush_vmid_ipa(kvm, addr);
124 	put_page(virt_to_page(pudp));
125 }
126 
127 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
128 				  int min, int max)
129 {
130 	void *page;
131 
132 	BUG_ON(max > KVM_NR_MEM_OBJS);
133 	if (cache->nobjs >= min)
134 		return 0;
135 	while (cache->nobjs < max) {
136 		page = (void *)__get_free_page(GFP_PGTABLE_USER);
137 		if (!page)
138 			return -ENOMEM;
139 		cache->objects[cache->nobjs++] = page;
140 	}
141 	return 0;
142 }
143 
144 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
145 {
146 	while (mc->nobjs)
147 		free_page((unsigned long)mc->objects[--mc->nobjs]);
148 }
149 
150 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
151 {
152 	void *p;
153 
154 	BUG_ON(!mc || !mc->nobjs);
155 	p = mc->objects[--mc->nobjs];
156 	return p;
157 }
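
/*
 * A minimal usage sketch (not a real caller in this file) of how the three
 * cache helpers above are meant to be combined: top up the cache while it is
 * still legal to sleep, consume objects under kvm->mmu_lock where allocation
 * must not fail or sleep, and free whatever is left afterwards. The function
 * name is hypothetical; kvm_phys_addr_ioremap() below follows this pattern
 * for real.
 */
static int __maybe_unused example_cache_usage(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache cache = { 0, };
	int ret;

	/* May sleep: do this before taking any spinlock. */
	ret = mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(kvm),
				     KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	spin_lock(&kvm->mmu_lock);
	/* ... build mappings, pulling table pages via mmu_memory_cache_alloc() ... */
	spin_unlock(&kvm->mmu_lock);

	mmu_free_memory_cache(&cache);
	return 0;
}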
158 
159 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
160 {
161 	p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
162 	stage2_pgd_clear(kvm, pgd);
163 	kvm_tlb_flush_vmid_ipa(kvm, addr);
164 	stage2_p4d_free(kvm, p4d_table);
165 	put_page(virt_to_page(pgd));
166 }
167 
168 static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr)
169 {
170 	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
171 	stage2_p4d_clear(kvm, p4d);
172 	kvm_tlb_flush_vmid_ipa(kvm, addr);
173 	stage2_pud_free(kvm, pud_table);
174 	put_page(virt_to_page(p4d));
175 }
176 
177 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
178 {
179 	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
180 	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
181 	stage2_pud_clear(kvm, pud);
182 	kvm_tlb_flush_vmid_ipa(kvm, addr);
183 	stage2_pmd_free(kvm, pmd_table);
184 	put_page(virt_to_page(pud));
185 }
186 
187 static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
188 {
189 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
190 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
191 	pmd_clear(pmd);
192 	kvm_tlb_flush_vmid_ipa(kvm, addr);
193 	free_page((unsigned long)pte_table);
194 	put_page(virt_to_page(pmd));
195 }
196 
197 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
198 {
199 	WRITE_ONCE(*ptep, new_pte);
200 	dsb(ishst);
201 }
202 
203 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
204 {
205 	WRITE_ONCE(*pmdp, new_pmd);
206 	dsb(ishst);
207 }
208 
209 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
210 {
211 	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
212 }
213 
214 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
215 {
216 	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
217 	dsb(ishst);
218 }
219 
220 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
221 {
222 	WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
223 	dsb(ishst);
224 }
225 
226 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
227 {
228 #ifndef __PAGETABLE_P4D_FOLDED
229 	WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
230 	dsb(ishst);
231 #endif
232 }
233 
234 /*
235  * Unmapping vs dcache management:
236  *
237  * If a guest maps certain memory pages as uncached, all writes will
238  * bypass the data cache and go directly to RAM.  However, the CPUs
239  * can still speculate reads (not writes) and fill cache lines with
240  * data.
241  *
242  * Those cache lines will be *clean* cache lines though, so a
243  * clean+invalidate operation is equivalent to an invalidate
244  * operation, because no cache lines are marked dirty.
245  *
246  * Those clean cache lines could be filled prior to an uncached write
247  * by the guest, and the cache coherent IO subsystem would therefore
248  * end up writing old data to disk.
249  *
250  * This is why right after unmapping a page/section and invalidating
251  * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
252  * the IO subsystem will never hit in the cache.
253  *
254  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
255  * we then fully enforce cacheability of RAM, no matter what the guest
256  * does.
257  */
258 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
259 		       phys_addr_t addr, phys_addr_t end)
260 {
261 	phys_addr_t start_addr = addr;
262 	pte_t *pte, *start_pte;
263 
264 	start_pte = pte = pte_offset_kernel(pmd, addr);
265 	do {
266 		if (!pte_none(*pte)) {
267 			pte_t old_pte = *pte;
268 
269 			kvm_set_pte(pte, __pte(0));
270 			kvm_tlb_flush_vmid_ipa(kvm, addr);
271 
272 			/* No need to invalidate the cache for device mappings */
273 			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
274 				kvm_flush_dcache_pte(old_pte);
275 
276 			put_page(virt_to_page(pte));
277 		}
278 	} while (pte++, addr += PAGE_SIZE, addr != end);
279 
280 	if (stage2_pte_table_empty(kvm, start_pte))
281 		clear_stage2_pmd_entry(kvm, pmd, start_addr);
282 }
283 
284 static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
285 		       phys_addr_t addr, phys_addr_t end)
286 {
287 	phys_addr_t next, start_addr = addr;
288 	pmd_t *pmd, *start_pmd;
289 
290 	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
291 	do {
292 		next = stage2_pmd_addr_end(kvm, addr, end);
293 		if (!pmd_none(*pmd)) {
294 			if (pmd_thp_or_huge(*pmd)) {
295 				pmd_t old_pmd = *pmd;
296 
297 				pmd_clear(pmd);
298 				kvm_tlb_flush_vmid_ipa(kvm, addr);
299 
300 				kvm_flush_dcache_pmd(old_pmd);
301 
302 				put_page(virt_to_page(pmd));
303 			} else {
304 				unmap_stage2_ptes(kvm, pmd, addr, next);
305 			}
306 		}
307 	} while (pmd++, addr = next, addr != end);
308 
309 	if (stage2_pmd_table_empty(kvm, start_pmd))
310 		clear_stage2_pud_entry(kvm, pud, start_addr);
311 }
312 
313 static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d,
314 		       phys_addr_t addr, phys_addr_t end)
315 {
316 	phys_addr_t next, start_addr = addr;
317 	pud_t *pud, *start_pud;
318 
319 	start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
320 	do {
321 		next = stage2_pud_addr_end(kvm, addr, end);
322 		if (!stage2_pud_none(kvm, *pud)) {
323 			if (stage2_pud_huge(kvm, *pud)) {
324 				pud_t old_pud = *pud;
325 
326 				stage2_pud_clear(kvm, pud);
327 				kvm_tlb_flush_vmid_ipa(kvm, addr);
328 				kvm_flush_dcache_pud(old_pud);
329 				put_page(virt_to_page(pud));
330 			} else {
331 				unmap_stage2_pmds(kvm, pud, addr, next);
332 			}
333 		}
334 	} while (pud++, addr = next, addr != end);
335 
336 	if (stage2_pud_table_empty(kvm, start_pud))
337 		clear_stage2_p4d_entry(kvm, p4d, start_addr);
338 }
339 
340 static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd,
341 		       phys_addr_t addr, phys_addr_t end)
342 {
343 	phys_addr_t next, start_addr = addr;
344 	p4d_t *p4d, *start_p4d;
345 
346 	start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
347 	do {
348 		next = stage2_p4d_addr_end(kvm, addr, end);
349 		if (!stage2_p4d_none(kvm, *p4d))
350 			unmap_stage2_puds(kvm, p4d, addr, next);
351 	} while (p4d++, addr = next, addr != end);
352 
353 	if (stage2_p4d_table_empty(kvm, start_p4d))
354 		clear_stage2_pgd_entry(kvm, pgd, start_addr);
355 }
356 
357 /**
358  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
359  * @kvm:   The VM pointer
360  * @start: The intermediate physical base address of the range to unmap
361  * @size:  The size of the area to unmap
362  *
363  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
364  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
365  * destroying the VM), otherwise another faulting VCPU may come in and mess
366  * with things behind our backs.
367  */
368 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
369 {
370 	pgd_t *pgd;
371 	phys_addr_t addr = start, end = start + size;
372 	phys_addr_t next;
373 
374 	assert_spin_locked(&kvm->mmu_lock);
375 	WARN_ON(size & ~PAGE_MASK);
376 
377 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
378 	do {
379 		/*
380 		 * Make sure the page table is still active; another thread
381 		 * could have freed it while we temporarily released
382 		 * the lock.
383 		 */
384 		if (!READ_ONCE(kvm->arch.pgd))
385 			break;
386 		next = stage2_pgd_addr_end(kvm, addr, end);
387 		if (!stage2_pgd_none(kvm, *pgd))
388 			unmap_stage2_p4ds(kvm, pgd, addr, next);
389 		/*
390 		 * If the range is too large, release the kvm->mmu_lock
391 		 * to prevent starvation and lockup detector warnings.
392 		 */
393 		if (next != end)
394 			cond_resched_lock(&kvm->mmu_lock);
395 	} while (pgd++, addr = next, addr != end);
396 }
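
/*
 * Illustration only (hypothetical helper): unmap_stage2_range() asserts that
 * kvm->mmu_lock is held, so a caller wanting to drop a single page of IPA
 * space would look roughly like this.
 */
static void __maybe_unused example_unmap_one_page(struct kvm *kvm, phys_addr_t ipa)
{
	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(kvm, ipa & PAGE_MASK, PAGE_SIZE);
	spin_unlock(&kvm->mmu_lock);
}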
397 
398 static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
399 			      phys_addr_t addr, phys_addr_t end)
400 {
401 	pte_t *pte;
402 
403 	pte = pte_offset_kernel(pmd, addr);
404 	do {
405 		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
406 			kvm_flush_dcache_pte(*pte);
407 	} while (pte++, addr += PAGE_SIZE, addr != end);
408 }
409 
410 static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
411 			      phys_addr_t addr, phys_addr_t end)
412 {
413 	pmd_t *pmd;
414 	phys_addr_t next;
415 
416 	pmd = stage2_pmd_offset(kvm, pud, addr);
417 	do {
418 		next = stage2_pmd_addr_end(kvm, addr, end);
419 		if (!pmd_none(*pmd)) {
420 			if (pmd_thp_or_huge(*pmd))
421 				kvm_flush_dcache_pmd(*pmd);
422 			else
423 				stage2_flush_ptes(kvm, pmd, addr, next);
424 		}
425 	} while (pmd++, addr = next, addr != end);
426 }
427 
428 static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d,
429 			      phys_addr_t addr, phys_addr_t end)
430 {
431 	pud_t *pud;
432 	phys_addr_t next;
433 
434 	pud = stage2_pud_offset(kvm, p4d, addr);
435 	do {
436 		next = stage2_pud_addr_end(kvm, addr, end);
437 		if (!stage2_pud_none(kvm, *pud)) {
438 			if (stage2_pud_huge(kvm, *pud))
439 				kvm_flush_dcache_pud(*pud);
440 			else
441 				stage2_flush_pmds(kvm, pud, addr, next);
442 		}
443 	} while (pud++, addr = next, addr != end);
444 }
445 
446 static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd,
447 			      phys_addr_t addr, phys_addr_t end)
448 {
449 	p4d_t *p4d;
450 	phys_addr_t next;
451 
452 	p4d = stage2_p4d_offset(kvm, pgd, addr);
453 	do {
454 		next = stage2_p4d_addr_end(kvm, addr, end);
455 		if (!stage2_p4d_none(kvm, *p4d))
456 			stage2_flush_puds(kvm, p4d, addr, next);
457 	} while (p4d++, addr = next, addr != end);
458 }
459 
460 static void stage2_flush_memslot(struct kvm *kvm,
461 				 struct kvm_memory_slot *memslot)
462 {
463 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
464 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
465 	phys_addr_t next;
466 	pgd_t *pgd;
467 
468 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
469 	do {
470 		next = stage2_pgd_addr_end(kvm, addr, end);
471 		if (!stage2_pgd_none(kvm, *pgd))
472 			stage2_flush_p4ds(kvm, pgd, addr, next);
473 
474 		if (next != end)
475 			cond_resched_lock(&kvm->mmu_lock);
476 	} while (pgd++, addr = next, addr != end);
477 }
478 
479 /**
480  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
481  * @kvm: The struct kvm pointer
482  *
483  * Go through the stage 2 page tables and invalidate any cache lines
484  * backing memory already mapped to the VM.
485  */
486 static void stage2_flush_vm(struct kvm *kvm)
487 {
488 	struct kvm_memslots *slots;
489 	struct kvm_memory_slot *memslot;
490 	int idx;
491 
492 	idx = srcu_read_lock(&kvm->srcu);
493 	spin_lock(&kvm->mmu_lock);
494 
495 	slots = kvm_memslots(kvm);
496 	kvm_for_each_memslot(memslot, slots)
497 		stage2_flush_memslot(kvm, memslot);
498 
499 	spin_unlock(&kvm->mmu_lock);
500 	srcu_read_unlock(&kvm->srcu, idx);
501 }
502 
503 static void clear_hyp_pgd_entry(pgd_t *pgd)
504 {
505 	p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
506 	pgd_clear(pgd);
507 	p4d_free(NULL, p4d_table);
508 	put_page(virt_to_page(pgd));
509 }
510 
511 static void clear_hyp_p4d_entry(p4d_t *p4d)
512 {
513 	pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
514 	VM_BUG_ON(p4d_huge(*p4d));
515 	p4d_clear(p4d);
516 	pud_free(NULL, pud_table);
517 	put_page(virt_to_page(p4d));
518 }
519 
520 static void clear_hyp_pud_entry(pud_t *pud)
521 {
522 	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
523 	VM_BUG_ON(pud_huge(*pud));
524 	pud_clear(pud);
525 	pmd_free(NULL, pmd_table);
526 	put_page(virt_to_page(pud));
527 }
528 
529 static void clear_hyp_pmd_entry(pmd_t *pmd)
530 {
531 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
532 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
533 	pmd_clear(pmd);
534 	pte_free_kernel(NULL, pte_table);
535 	put_page(virt_to_page(pmd));
536 }
537 
538 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
539 {
540 	pte_t *pte, *start_pte;
541 
542 	start_pte = pte = pte_offset_kernel(pmd, addr);
543 	do {
544 		if (!pte_none(*pte)) {
545 			kvm_set_pte(pte, __pte(0));
546 			put_page(virt_to_page(pte));
547 		}
548 	} while (pte++, addr += PAGE_SIZE, addr != end);
549 
550 	if (hyp_pte_table_empty(start_pte))
551 		clear_hyp_pmd_entry(pmd);
552 }
553 
554 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
555 {
556 	phys_addr_t next;
557 	pmd_t *pmd, *start_pmd;
558 
559 	start_pmd = pmd = pmd_offset(pud, addr);
560 	do {
561 		next = pmd_addr_end(addr, end);
562 		/* Hyp doesn't use huge pmds */
563 		if (!pmd_none(*pmd))
564 			unmap_hyp_ptes(pmd, addr, next);
565 	} while (pmd++, addr = next, addr != end);
566 
567 	if (hyp_pmd_table_empty(start_pmd))
568 		clear_hyp_pud_entry(pud);
569 }
570 
571 static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
572 {
573 	phys_addr_t next;
574 	pud_t *pud, *start_pud;
575 
576 	start_pud = pud = pud_offset(p4d, addr);
577 	do {
578 		next = pud_addr_end(addr, end);
579 		/* Hyp doesn't use huge puds */
580 		if (!pud_none(*pud))
581 			unmap_hyp_pmds(pud, addr, next);
582 	} while (pud++, addr = next, addr != end);
583 
584 	if (hyp_pud_table_empty(start_pud))
585 		clear_hyp_p4d_entry(p4d);
586 }
587 
588 static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
589 {
590 	phys_addr_t next;
591 	p4d_t *p4d, *start_p4d;
592 
593 	start_p4d = p4d = p4d_offset(pgd, addr);
594 	do {
595 		next = p4d_addr_end(addr, end);
596 		/* Hyp doesn't use huge p4ds */
597 		if (!p4d_none(*p4d))
598 			unmap_hyp_puds(p4d, addr, next);
599 	} while (p4d++, addr = next, addr != end);
600 
601 	if (hyp_p4d_table_empty(start_p4d))
602 		clear_hyp_pgd_entry(pgd);
603 }
604 
605 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
606 {
607 	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
608 }
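
/*
 * Worked example (assuming 4kB pages and 48-bit VAs, i.e. PGDIR_SHIFT == 39
 * and ptrs_per_pgd == 512): addr == 0x0000804000000000 gives
 * (addr >> 39) & 511 == 256, so the mapping lives in pgd entry 256.
 */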
609 
610 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
611 			      phys_addr_t start, u64 size)
612 {
613 	pgd_t *pgd;
614 	phys_addr_t addr = start, end = start + size;
615 	phys_addr_t next;
616 
617 	/*
618 	 * We don't unmap anything from HYP, except at the hyp tear down.
619 	 * Hence, we don't have to invalidate the TLBs here.
620 	 */
621 	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
622 	do {
623 		next = pgd_addr_end(addr, end);
624 		if (!pgd_none(*pgd))
625 			unmap_hyp_p4ds(pgd, addr, next);
626 	} while (pgd++, addr = next, addr != end);
627 }
628 
629 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
630 {
631 	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
632 }
633 
634 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
635 {
636 	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
637 }
638 
639 /**
640  * free_hyp_pgds - free Hyp-mode page tables
641  *
642  * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
643  * therefore contains either mappings in the kernel memory area (above
644  * PAGE_OFFSET), or device mappings in the idmap range.
645  *
646  * boot_hyp_pgd should only map the idmap range, and is only used in
647  * the extended idmap case.
648  */
649 void free_hyp_pgds(void)
650 {
651 	pgd_t *id_pgd;
652 
653 	mutex_lock(&kvm_hyp_pgd_mutex);
654 
655 	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
656 
657 	if (id_pgd) {
658 		/* In case we never called hyp_mmu_init() */
659 		if (!io_map_base)
660 			io_map_base = hyp_idmap_start;
661 		unmap_hyp_idmap_range(id_pgd, io_map_base,
662 				      hyp_idmap_start + PAGE_SIZE - io_map_base);
663 	}
664 
665 	if (boot_hyp_pgd) {
666 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
667 		boot_hyp_pgd = NULL;
668 	}
669 
670 	if (hyp_pgd) {
671 		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
672 				(uintptr_t)high_memory - PAGE_OFFSET);
673 
674 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
675 		hyp_pgd = NULL;
676 	}
677 	if (merged_hyp_pgd) {
678 		clear_page(merged_hyp_pgd);
679 		free_page((unsigned long)merged_hyp_pgd);
680 		merged_hyp_pgd = NULL;
681 	}
682 
683 	mutex_unlock(&kvm_hyp_pgd_mutex);
684 }
685 
686 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
687 				    unsigned long end, unsigned long pfn,
688 				    pgprot_t prot)
689 {
690 	pte_t *pte;
691 	unsigned long addr;
692 
693 	addr = start;
694 	do {
695 		pte = pte_offset_kernel(pmd, addr);
696 		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
697 		get_page(virt_to_page(pte));
698 		pfn++;
699 	} while (addr += PAGE_SIZE, addr != end);
700 }
701 
702 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
703 				   unsigned long end, unsigned long pfn,
704 				   pgprot_t prot)
705 {
706 	pmd_t *pmd;
707 	pte_t *pte;
708 	unsigned long addr, next;
709 
710 	addr = start;
711 	do {
712 		pmd = pmd_offset(pud, addr);
713 
714 		BUG_ON(pmd_sect(*pmd));
715 
716 		if (pmd_none(*pmd)) {
717 			pte = pte_alloc_one_kernel(NULL);
718 			if (!pte) {
719 				kvm_err("Cannot allocate Hyp pte\n");
720 				return -ENOMEM;
721 			}
722 			kvm_pmd_populate(pmd, pte);
723 			get_page(virt_to_page(pmd));
724 		}
725 
726 		next = pmd_addr_end(addr, end);
727 
728 		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
729 		pfn += (next - addr) >> PAGE_SHIFT;
730 	} while (addr = next, addr != end);
731 
732 	return 0;
733 }
734 
735 static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
736 				   unsigned long end, unsigned long pfn,
737 				   pgprot_t prot)
738 {
739 	pud_t *pud;
740 	pmd_t *pmd;
741 	unsigned long addr, next;
742 	int ret;
743 
744 	addr = start;
745 	do {
746 		pud = pud_offset(p4d, addr);
747 
748 		if (pud_none_or_clear_bad(pud)) {
749 			pmd = pmd_alloc_one(NULL, addr);
750 			if (!pmd) {
751 				kvm_err("Cannot allocate Hyp pmd\n");
752 				return -ENOMEM;
753 			}
754 			kvm_pud_populate(pud, pmd);
755 			get_page(virt_to_page(pud));
756 		}
757 
758 		next = pud_addr_end(addr, end);
759 		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
760 		if (ret)
761 			return ret;
762 		pfn += (next - addr) >> PAGE_SHIFT;
763 	} while (addr = next, addr != end);
764 
765 	return 0;
766 }
767 
768 static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
769 				   unsigned long end, unsigned long pfn,
770 				   pgprot_t prot)
771 {
772 	p4d_t *p4d;
773 	pud_t *pud;
774 	unsigned long addr, next;
775 	int ret;
776 
777 	addr = start;
778 	do {
779 		p4d = p4d_offset(pgd, addr);
780 
781 		if (p4d_none(*p4d)) {
782 			pud = pud_alloc_one(NULL, addr);
783 			if (!pud) {
784 				kvm_err("Cannot allocate Hyp pud\n");
785 				return -ENOMEM;
786 			}
787 			kvm_p4d_populate(p4d, pud);
788 			get_page(virt_to_page(p4d));
789 		}
790 
791 		next = p4d_addr_end(addr, end);
792 		ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
793 		if (ret)
794 			return ret;
795 		pfn += (next - addr) >> PAGE_SHIFT;
796 	} while (addr = next, addr != end);
797 
798 	return 0;
799 }
800 
801 static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
802 				 unsigned long start, unsigned long end,
803 				 unsigned long pfn, pgprot_t prot)
804 {
805 	pgd_t *pgd;
806 	p4d_t *p4d;
807 	unsigned long addr, next;
808 	int err = 0;
809 
810 	mutex_lock(&kvm_hyp_pgd_mutex);
811 	addr = start & PAGE_MASK;
812 	end = PAGE_ALIGN(end);
813 	do {
814 		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
815 
816 		if (pgd_none(*pgd)) {
817 			p4d = p4d_alloc_one(NULL, addr);
818 			if (!p4d) {
819 				kvm_err("Cannot allocate Hyp p4d\n");
820 				err = -ENOMEM;
821 				goto out;
822 			}
823 			kvm_pgd_populate(pgd, p4d);
824 			get_page(virt_to_page(pgd));
825 		}
826 
827 		next = pgd_addr_end(addr, end);
828 		err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
829 		if (err)
830 			goto out;
831 		pfn += (next - addr) >> PAGE_SHIFT;
832 	} while (addr = next, addr != end);
833 out:
834 	mutex_unlock(&kvm_hyp_pgd_mutex);
835 	return err;
836 }
837 
838 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
839 {
840 	if (!is_vmalloc_addr(kaddr)) {
841 		BUG_ON(!virt_addr_valid(kaddr));
842 		return __pa(kaddr);
843 	} else {
844 		return page_to_phys(vmalloc_to_page(kaddr)) +
845 		       offset_in_page(kaddr);
846 	}
847 }
848 
849 /**
850  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
851  * @from:	The virtual kernel start address of the range
852  * @to:		The virtual kernel end address of the range (exclusive)
853  * @prot:	The protection to be applied to this range
854  *
855  * The same virtual address as the kernel virtual address is also used
856  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
857  * physical pages.
858  */
859 int create_hyp_mappings(void *from, void *to, pgprot_t prot)
860 {
861 	phys_addr_t phys_addr;
862 	unsigned long virt_addr;
863 	unsigned long start = kern_hyp_va((unsigned long)from);
864 	unsigned long end = kern_hyp_va((unsigned long)to);
865 
866 	if (is_kernel_in_hyp_mode())
867 		return 0;
868 
869 	start = start & PAGE_MASK;
870 	end = PAGE_ALIGN(end);
871 
872 	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
873 		int err;
874 
875 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
876 		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
877 					    virt_addr, virt_addr + PAGE_SIZE,
878 					    __phys_to_pfn(phys_addr),
879 					    prot);
880 		if (err)
881 			return err;
882 	}
883 
884 	return 0;
885 }
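
/*
 * Illustrative caller (hypothetical): share a kernel object with Hyp by
 * mapping the kernel VA range that covers it. The KVM init code uses the
 * same call for the kvm structure and per-cpu data.
 */
static int __maybe_unused example_share_with_hyp(struct kvm *kvm)
{
	return create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
}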
886 
887 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
888 					unsigned long *haddr, pgprot_t prot)
889 {
890 	pgd_t *pgd = hyp_pgd;
891 	unsigned long base;
892 	int ret = 0;
893 
894 	mutex_lock(&kvm_hyp_pgd_mutex);
895 
896 	/*
897 	 * This assumes that we have enough space below the idmap
898 	 * page to allocate our VAs. If not, the check below will
899 	 * kick in. A potential alternative would be to detect that
900 	 * overflow and switch to an allocation above the idmap.
901 	 *
902 	 * The allocated size is always a multiple of PAGE_SIZE.
903 	 */
904 	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
905 	base = io_map_base - size;
906 
907 	/*
908 	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
909 	 * allocating the new area, as it would indicate we've
910 	 * overflowed the idmap/IO address range.
911 	 */
912 	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
913 		ret = -ENOMEM;
914 	else
915 		io_map_base = base;
916 
917 	mutex_unlock(&kvm_hyp_pgd_mutex);
918 
919 	if (ret)
920 		goto out;
921 
922 	if (__kvm_cpu_uses_extended_idmap())
923 		pgd = boot_hyp_pgd;
924 
925 	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
926 				    base, base + size,
927 				    __phys_to_pfn(phys_addr), prot);
928 	if (ret)
929 		goto out;
930 
931 	*haddr = base + offset_in_page(phys_addr);
932 
933 out:
934 	return ret;
935 }
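
/*
 * Worked example with made-up numbers: to map a device register block of
 * three pages whose physical address is offset 0x234 into its first page,
 * the size is rounded up to PAGE_ALIGN(3 * PAGE_SIZE + 0x234) == 4 pages,
 * io_map_base moves down by those 4 pages, and the HYP VA handed back is
 * the new base plus 0x234. The allocation only fails if that subtraction
 * flips BIT(VA_BITS - 1), i.e. if we ran out of the idmap/IO VA range.
 */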
936 
937 /**
938  * create_hyp_io_mappings - Map IO into both kernel and HYP
939  * @phys_addr:	The physical start address which gets mapped
940  * @size:	Size of the region being mapped
941  * @kaddr:	Kernel VA for this mapping
942  * @haddr:	HYP VA for this mapping
943  */
944 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
945 			   void __iomem **kaddr,
946 			   void __iomem **haddr)
947 {
948 	unsigned long addr;
949 	int ret;
950 
951 	*kaddr = ioremap(phys_addr, size);
952 	if (!*kaddr)
953 		return -ENOMEM;
954 
955 	if (is_kernel_in_hyp_mode()) {
956 		*haddr = *kaddr;
957 		return 0;
958 	}
959 
960 	ret = __create_hyp_private_mapping(phys_addr, size,
961 					   &addr, PAGE_HYP_DEVICE);
962 	if (ret) {
963 		iounmap(*kaddr);
964 		*kaddr = NULL;
965 		*haddr = NULL;
966 		return ret;
967 	}
968 
969 	*haddr = (void __iomem *)addr;
970 	return 0;
971 }
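
/*
 * Illustrative caller (hypothetical device, made-up address and size): map
 * an MMIO region so it is visible both to the kernel and to Hyp. Real users
 * such as the vgic keep the two returned cookies around for later accesses.
 */
static int __maybe_unused example_map_io(void)
{
	void __iomem *kaddr, *haddr;

	return create_hyp_io_mappings(0x2c000000UL, 0x2000, &kaddr, &haddr);
}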
972 
973 /**
974  * create_hyp_exec_mappings - Map an executable range into HYP
975  * @phys_addr:	The physical start address which gets mapped
976  * @size:	Size of the region being mapped
977  * @haddr:	HYP VA for this mapping
978  */
979 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
980 			     void **haddr)
981 {
982 	unsigned long addr;
983 	int ret;
984 
985 	BUG_ON(is_kernel_in_hyp_mode());
986 
987 	ret = __create_hyp_private_mapping(phys_addr, size,
988 					   &addr, PAGE_HYP_EXEC);
989 	if (ret) {
990 		*haddr = NULL;
991 		return ret;
992 	}
993 
994 	*haddr = (void *)addr;
995 	return 0;
996 }
997 
998 /**
999  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
1000  * @kvm:	The KVM struct pointer for the VM.
1001  *
1002  * Allocates only the stage-2 HW PGD level table(s) of size defined by
1003  * stage2_pgd_size(kvm).
1004  *
1005  * Note we don't need locking here as this is only called when the VM is
1006  * created, which can only be done once.
1007  */
1008 int kvm_alloc_stage2_pgd(struct kvm *kvm)
1009 {
1010 	phys_addr_t pgd_phys;
1011 	pgd_t *pgd;
1012 
1013 	if (kvm->arch.pgd != NULL) {
1014 		kvm_err("kvm_arch already initialized?\n");
1015 		return -EINVAL;
1016 	}
1017 
1018 	/* Allocate the HW PGD, making sure that each page gets its own refcount */
1019 	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
1020 	if (!pgd)
1021 		return -ENOMEM;
1022 
1023 	pgd_phys = virt_to_phys(pgd);
1024 	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
1025 		return -EINVAL;
1026 
1027 	kvm->arch.pgd = pgd;
1028 	kvm->arch.pgd_phys = pgd_phys;
1029 	return 0;
1030 }
1031 
1032 static void stage2_unmap_memslot(struct kvm *kvm,
1033 				 struct kvm_memory_slot *memslot)
1034 {
1035 	hva_t hva = memslot->userspace_addr;
1036 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
1037 	phys_addr_t size = PAGE_SIZE * memslot->npages;
1038 	hva_t reg_end = hva + size;
1039 
1040 	/*
1041 	 * A memory region may span multiple VMAs, and any holes between
1042 	 * them, so iterate over all of the VMAs to find out whether any
1043 	 * of them need unmapping.
1044 	 *
1045 	 *     +--------------------------------------------+
1046 	 * +---------------+----------------+   +----------------+
1047 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1048 	 * +---------------+----------------+   +----------------+
1049 	 *     |               memory region                |
1050 	 *     +--------------------------------------------+
1051 	 */
1052 	do {
1053 		struct vm_area_struct *vma = find_vma(current->mm, hva);
1054 		hva_t vm_start, vm_end;
1055 
1056 		if (!vma || vma->vm_start >= reg_end)
1057 			break;
1058 
1059 		/*
1060 		 * Take the intersection of this VMA with the memory region
1061 		 */
1062 		vm_start = max(hva, vma->vm_start);
1063 		vm_end = min(reg_end, vma->vm_end);
1064 
1065 		if (!(vma->vm_flags & VM_PFNMAP)) {
1066 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
1067 			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
1068 		}
1069 		hva = vm_end;
1070 	} while (hva < reg_end);
1071 }
1072 
1073 /**
1074  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
1075  * @kvm: The struct kvm pointer
1076  *
1077  * Go through the memregions and unmap any regular RAM
1078  * backing memory already mapped to the VM.
1079  */
1080 void stage2_unmap_vm(struct kvm *kvm)
1081 {
1082 	struct kvm_memslots *slots;
1083 	struct kvm_memory_slot *memslot;
1084 	int idx;
1085 
1086 	idx = srcu_read_lock(&kvm->srcu);
1087 	mmap_read_lock(current->mm);
1088 	spin_lock(&kvm->mmu_lock);
1089 
1090 	slots = kvm_memslots(kvm);
1091 	kvm_for_each_memslot(memslot, slots)
1092 		stage2_unmap_memslot(kvm, memslot);
1093 
1094 	spin_unlock(&kvm->mmu_lock);
1095 	mmap_read_unlock(current->mm);
1096 	srcu_read_unlock(&kvm->srcu, idx);
1097 }
1098 
1099 /**
1100  * kvm_free_stage2_pgd - free all stage-2 tables
1101  * @kvm:	The KVM struct pointer for the VM.
1102  *
1103  * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
1104  * underlying level-2 and level-3 tables before freeing the actual level-1 table
1105  * and setting the struct pointer to NULL.
1106  */
1107 void kvm_free_stage2_pgd(struct kvm *kvm)
1108 {
1109 	void *pgd = NULL;
1110 
1111 	spin_lock(&kvm->mmu_lock);
1112 	if (kvm->arch.pgd) {
1113 		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
1114 		pgd = READ_ONCE(kvm->arch.pgd);
1115 		kvm->arch.pgd = NULL;
1116 		kvm->arch.pgd_phys = 0;
1117 	}
1118 	spin_unlock(&kvm->mmu_lock);
1119 
1120 	/* Free the HW pgd, one page at a time */
1121 	if (pgd)
1122 		free_pages_exact(pgd, stage2_pgd_size(kvm));
1123 }
1124 
1125 static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1126 			     phys_addr_t addr)
1127 {
1128 	pgd_t *pgd;
1129 	p4d_t *p4d;
1130 
1131 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1132 	if (stage2_pgd_none(kvm, *pgd)) {
1133 		if (!cache)
1134 			return NULL;
1135 		p4d = mmu_memory_cache_alloc(cache);
1136 		stage2_pgd_populate(kvm, pgd, p4d);
1137 		get_page(virt_to_page(pgd));
1138 	}
1139 
1140 	return stage2_p4d_offset(kvm, pgd, addr);
1141 }
1142 
1143 static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1144 			     phys_addr_t addr)
1145 {
1146 	p4d_t *p4d;
1147 	pud_t *pud;
1148 
1149 	p4d = stage2_get_p4d(kvm, cache, addr);
1150 	if (stage2_p4d_none(kvm, *p4d)) {
1151 		if (!cache)
1152 			return NULL;
1153 		pud = mmu_memory_cache_alloc(cache);
1154 		stage2_p4d_populate(kvm, p4d, pud);
1155 		get_page(virt_to_page(p4d));
1156 	}
1157 
1158 	return stage2_pud_offset(kvm, p4d, addr);
1159 }
1160 
1161 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1162 			     phys_addr_t addr)
1163 {
1164 	pud_t *pud;
1165 	pmd_t *pmd;
1166 
1167 	pud = stage2_get_pud(kvm, cache, addr);
1168 	if (!pud || stage2_pud_huge(kvm, *pud))
1169 		return NULL;
1170 
1171 	if (stage2_pud_none(kvm, *pud)) {
1172 		if (!cache)
1173 			return NULL;
1174 		pmd = mmu_memory_cache_alloc(cache);
1175 		stage2_pud_populate(kvm, pud, pmd);
1176 		get_page(virt_to_page(pud));
1177 	}
1178 
1179 	return stage2_pmd_offset(kvm, pud, addr);
1180 }
1181 
1182 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1183 			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
1184 {
1185 	pmd_t *pmd, old_pmd;
1186 
1187 retry:
1188 	pmd = stage2_get_pmd(kvm, cache, addr);
1189 	VM_BUG_ON(!pmd);
1190 
1191 	old_pmd = *pmd;
1192 	/*
1193 	 * Multiple vcpus faulting on the same PMD entry can
1194 	 * lead to them sequentially updating the PMD with the
1195 	 * same value. Following the break-before-make
1196 	 * (pmd_clear() followed by tlb_flush()) process can
1197 	 * hinder forward progress due to refaults generated
1198 	 * on missing translations.
1199 	 *
1200 	 * Skip updating the page table if the entry is
1201 	 * unchanged.
1202 	 */
1203 	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
1204 		return 0;
1205 
1206 	if (pmd_present(old_pmd)) {
1207 		/*
1208 		 * If we already have PTE level mapping for this block,
1209 		 * we must unmap it to avoid inconsistent TLB state and
1210 		 * leaking the table page. We could end up in this situation
1211 		 * if the memory slot was marked for dirty logging and was
1212 		 * reverted, leaving PTE level mappings for the pages accessed
1213 		 * during the period. So, unmap the PTE level mapping for this
1214 		 * block and retry, as we could have released the upper level
1215 		 * table in the process.
1216 		 *
1217 	 * Normal THP splits/merges follow the mmu_notifier callbacks
1218 	 * and get handled accordingly.
1219 		 */
1220 		if (!pmd_thp_or_huge(old_pmd)) {
1221 			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
1222 			goto retry;
1223 		}
1224 		/*
1225 		 * Mapping in huge pages should only happen through a
1226 		 * fault.  If a page is merged into a transparent huge
1227 		 * page, the individual subpages of that huge page
1228 		 * should be unmapped through MMU notifiers before we
1229 		 * get here.
1230 		 *
1231 	 * Merging of CompoundPages is not supported; they
1232 	 * should be split first, then unmapped, merged,
1233 	 * and mapped back in on demand.
1234 		 */
1235 		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
1236 		pmd_clear(pmd);
1237 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1238 	} else {
1239 		get_page(virt_to_page(pmd));
1240 	}
1241 
1242 	kvm_set_pmd(pmd, *new_pmd);
1243 	return 0;
1244 }
1245 
1246 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1247 			       phys_addr_t addr, const pud_t *new_pudp)
1248 {
1249 	pud_t *pudp, old_pud;
1250 
1251 retry:
1252 	pudp = stage2_get_pud(kvm, cache, addr);
1253 	VM_BUG_ON(!pudp);
1254 
1255 	old_pud = *pudp;
1256 
1257 	/*
1258 	 * A large number of vcpus faulting on the same stage 2 entry
1259 	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
1260 	 * Skip updating the page tables if there is no change.
1261 	 */
1262 	if (pud_val(old_pud) == pud_val(*new_pudp))
1263 		return 0;
1264 
1265 	if (stage2_pud_present(kvm, old_pud)) {
1266 		/*
1267 		 * If we already have table level mapping for this block, unmap
1268 		 * the range for this block and retry.
1269 		 */
1270 		if (!stage2_pud_huge(kvm, old_pud)) {
1271 			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
1272 			goto retry;
1273 		}
1274 
1275 		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
1276 		stage2_pud_clear(kvm, pudp);
1277 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1278 	} else {
1279 		get_page(virt_to_page(pudp));
1280 	}
1281 
1282 	kvm_set_pud(pudp, *new_pudp);
1283 	return 0;
1284 }
1285 
1286 /*
1287  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1288  * true if a valid and present leaf-entry is found. A pointer to the
1289  * leaf-entry is returned in the appropriate level variable - pudpp,
1290  * pmdpp, ptepp.
1291  */
1292 static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
1293 				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1294 {
1295 	pud_t *pudp;
1296 	pmd_t *pmdp;
1297 	pte_t *ptep;
1298 
1299 	*pudpp = NULL;
1300 	*pmdpp = NULL;
1301 	*ptepp = NULL;
1302 
1303 	pudp = stage2_get_pud(kvm, NULL, addr);
1304 	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1305 		return false;
1306 
1307 	if (stage2_pud_huge(kvm, *pudp)) {
1308 		*pudpp = pudp;
1309 		return true;
1310 	}
1311 
1312 	pmdp = stage2_pmd_offset(kvm, pudp, addr);
1313 	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1314 		return false;
1315 
1316 	if (pmd_thp_or_huge(*pmdp)) {
1317 		*pmdpp = pmdp;
1318 		return true;
1319 	}
1320 
1321 	ptep = pte_offset_kernel(pmdp, addr);
1322 	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1323 		return false;
1324 
1325 	*ptepp = ptep;
1326 	return true;
1327 }
1328 
1329 static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr, unsigned long sz)
1330 {
1331 	pud_t *pudp;
1332 	pmd_t *pmdp;
1333 	pte_t *ptep;
1334 	bool found;
1335 
1336 	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
1337 	if (!found)
1338 		return false;
1339 
1340 	if (pudp)
1341 		return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1342 	else if (pmdp)
1343 		return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1344 	else
1345 		return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1346 }
1347 
1348 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1349 			  phys_addr_t addr, const pte_t *new_pte,
1350 			  unsigned long flags)
1351 {
1352 	pud_t *pud;
1353 	pmd_t *pmd;
1354 	pte_t *pte, old_pte;
1355 	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1356 	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1357 
1358 	VM_BUG_ON(logging_active && !cache);
1359 
1360 	/* Create stage-2 page table mapping - Levels 0 and 1 */
1361 	pud = stage2_get_pud(kvm, cache, addr);
1362 	if (!pud) {
1363 		/*
1364 		 * Ignore calls from kvm_set_spte_hva for unallocated
1365 		 * address ranges.
1366 		 */
1367 		return 0;
1368 	}
1369 
1370 	/*
1371 	 * While dirty page logging - dissolve huge PUD, then continue
1372 	 * on to allocate page.
1373 	 */
1374 	if (logging_active)
1375 		stage2_dissolve_pud(kvm, addr, pud);
1376 
1377 	if (stage2_pud_none(kvm, *pud)) {
1378 		if (!cache)
1379 			return 0; /* ignore calls from kvm_set_spte_hva */
1380 		pmd = mmu_memory_cache_alloc(cache);
1381 		stage2_pud_populate(kvm, pud, pmd);
1382 		get_page(virt_to_page(pud));
1383 	}
1384 
1385 	pmd = stage2_pmd_offset(kvm, pud, addr);
1386 	if (!pmd) {
1387 		/*
1388 		 * Ignore calls from kvm_set_spte_hva for unallocated
1389 		 * address ranges.
1390 		 */
1391 		return 0;
1392 	}
1393 
1394 	/*
1395 	 * While dirty page logging - dissolve huge PMD, then continue on to
1396 	 * allocate page.
1397 	 */
1398 	if (logging_active)
1399 		stage2_dissolve_pmd(kvm, addr, pmd);
1400 
1401 	/* Create stage-2 page mappings - Level 2 */
1402 	if (pmd_none(*pmd)) {
1403 		if (!cache)
1404 			return 0; /* ignore calls from kvm_set_spte_hva */
1405 		pte = mmu_memory_cache_alloc(cache);
1406 		kvm_pmd_populate(pmd, pte);
1407 		get_page(virt_to_page(pmd));
1408 	}
1409 
1410 	pte = pte_offset_kernel(pmd, addr);
1411 
1412 	if (iomap && pte_present(*pte))
1413 		return -EFAULT;
1414 
1415 	/* Create 2nd stage page table mapping - Level 3 */
1416 	old_pte = *pte;
1417 	if (pte_present(old_pte)) {
1418 		/* Skip page table update if there is no change */
1419 		if (pte_val(old_pte) == pte_val(*new_pte))
1420 			return 0;
1421 
1422 		kvm_set_pte(pte, __pte(0));
1423 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1424 	} else {
1425 		get_page(virt_to_page(pte));
1426 	}
1427 
1428 	kvm_set_pte(pte, *new_pte);
1429 	return 0;
1430 }
1431 
1432 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1433 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1434 {
1435 	if (pte_young(*pte)) {
1436 		*pte = pte_mkold(*pte);
1437 		return 1;
1438 	}
1439 	return 0;
1440 }
1441 #else
1442 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1443 {
1444 	return __ptep_test_and_clear_young(pte);
1445 }
1446 #endif
1447 
1448 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1449 {
1450 	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1451 }
1452 
1453 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1454 {
1455 	return stage2_ptep_test_and_clear_young((pte_t *)pud);
1456 }
1457 
1458 /**
1459  * kvm_phys_addr_ioremap - map a device range to guest IPA
1460  *
1461  * @kvm:	The KVM pointer
1462  * @guest_ipa:	The IPA at which to insert the mapping
1463  * @pa:		The physical address of the device
1464  * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
1465  */
1466 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1467 			  phys_addr_t pa, unsigned long size, bool writable)
1468 {
1469 	phys_addr_t addr, end;
1470 	int ret = 0;
1471 	unsigned long pfn;
1472 	struct kvm_mmu_memory_cache cache = { 0, };
1473 
1474 	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1475 	pfn = __phys_to_pfn(pa);
1476 
1477 	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1478 		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1479 
1480 		if (writable)
1481 			pte = kvm_s2pte_mkwrite(pte);
1482 
1483 		ret = mmu_topup_memory_cache(&cache,
1484 					     kvm_mmu_cache_min_pages(kvm),
1485 					     KVM_NR_MEM_OBJS);
1486 		if (ret)
1487 			goto out;
1488 		spin_lock(&kvm->mmu_lock);
1489 		ret = stage2_set_pte(kvm, &cache, addr, &pte,
1490 						KVM_S2PTE_FLAG_IS_IOMAP);
1491 		spin_unlock(&kvm->mmu_lock);
1492 		if (ret)
1493 			goto out;
1494 
1495 		pfn++;
1496 	}
1497 
1498 out:
1499 	mmu_free_memory_cache(&cache);
1500 	return ret;
1501 }
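
/*
 * Illustrative caller (all addresses made up): expose 64kB of a passthrough
 * device at IPA 0x09000000, writable. The vgic uses this helper in a similar
 * way to map the GIC virtual CPU interface into the guest.
 */
static int __maybe_unused example_ioremap_for_guest(struct kvm *kvm)
{
	return kvm_phys_addr_ioremap(kvm, 0x09000000, 0x2f000000, 0x10000, true);
}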
1502 
1503 /**
1504  * stage2_wp_ptes - write protect PMD range
1505  * @pmd:	pointer to pmd entry
1506  * @addr:	range start address
1507  * @end:	range end address
1508  */
1509 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1510 {
1511 	pte_t *pte;
1512 
1513 	pte = pte_offset_kernel(pmd, addr);
1514 	do {
1515 		if (!pte_none(*pte)) {
1516 			if (!kvm_s2pte_readonly(pte))
1517 				kvm_set_s2pte_readonly(pte);
1518 		}
1519 	} while (pte++, addr += PAGE_SIZE, addr != end);
1520 }
1521 
1522 /**
1523  * stage2_wp_pmds - write protect PUD range
1524  * @kvm:	kvm instance for the VM
1525  * @pud:	pointer to pud entry
1526  * @addr:	range start address
1527  * @end:	range end address
1528  */
1529 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
1530 			   phys_addr_t addr, phys_addr_t end)
1531 {
1532 	pmd_t *pmd;
1533 	phys_addr_t next;
1534 
1535 	pmd = stage2_pmd_offset(kvm, pud, addr);
1536 
1537 	do {
1538 		next = stage2_pmd_addr_end(kvm, addr, end);
1539 		if (!pmd_none(*pmd)) {
1540 			if (pmd_thp_or_huge(*pmd)) {
1541 				if (!kvm_s2pmd_readonly(pmd))
1542 					kvm_set_s2pmd_readonly(pmd);
1543 			} else {
1544 				stage2_wp_ptes(pmd, addr, next);
1545 			}
1546 		}
1547 	} while (pmd++, addr = next, addr != end);
1548 }
1549 
1550 /**
1551  * stage2_wp_puds - write protect P4D range
1552  * @p4d:	pointer to p4d entry
1553  * @addr:	range start address
1554  * @end:	range end address
1555  */
1556 static void  stage2_wp_puds(struct kvm *kvm, p4d_t *p4d,
1557 			    phys_addr_t addr, phys_addr_t end)
1558 {
1559 	pud_t *pud;
1560 	phys_addr_t next;
1561 
1562 	pud = stage2_pud_offset(kvm, p4d, addr);
1563 	do {
1564 		next = stage2_pud_addr_end(kvm, addr, end);
1565 		if (!stage2_pud_none(kvm, *pud)) {
1566 			if (stage2_pud_huge(kvm, *pud)) {
1567 				if (!kvm_s2pud_readonly(pud))
1568 					kvm_set_s2pud_readonly(pud);
1569 			} else {
1570 				stage2_wp_pmds(kvm, pud, addr, next);
1571 			}
1572 		}
1573 	} while (pud++, addr = next, addr != end);
1574 }
1575 
1576 /**
1577  * stage2_wp_p4ds - write protect PGD range
1578  * @pgd:	pointer to pgd entry
1579  * @addr:	range start address
1580  * @end:	range end address
1581  */
1582 static void  stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd,
1583 			    phys_addr_t addr, phys_addr_t end)
1584 {
1585 	p4d_t *p4d;
1586 	phys_addr_t next;
1587 
1588 	p4d = stage2_p4d_offset(kvm, pgd, addr);
1589 	do {
1590 		next = stage2_p4d_addr_end(kvm, addr, end);
1591 		if (!stage2_p4d_none(kvm, *p4d))
1592 			stage2_wp_puds(kvm, p4d, addr, next);
1593 	} while (p4d++, addr = next, addr != end);
1594 }
1595 
1596 /**
1597  * stage2_wp_range() - write protect stage2 memory region range
1598  * @kvm:	The KVM pointer
1599  * @addr:	Start address of range
1600  * @end:	End address of range
1601  */
1602 static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1603 {
1604 	pgd_t *pgd;
1605 	phys_addr_t next;
1606 
1607 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1608 	do {
1609 		/*
1610 		 * Release kvm_mmu_lock periodically if the memory region is
1611 		 * large. Otherwise, we may see kernel panics with
1612 		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1613 		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1614 		 * will also starve other vCPUs. We also have to make sure
1615 		 * that the page tables are not freed while the lock is
1616 		 * released.
1617 		 */
1618 		cond_resched_lock(&kvm->mmu_lock);
1619 		if (!READ_ONCE(kvm->arch.pgd))
1620 			break;
1621 		next = stage2_pgd_addr_end(kvm, addr, end);
1622 		if (stage2_pgd_present(kvm, *pgd))
1623 			stage2_wp_p4ds(kvm, pgd, addr, next);
1624 	} while (pgd++, addr = next, addr != end);
1625 }
1626 
1627 /**
1628  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1629  * @kvm:	The KVM pointer
1630  * @slot:	The memory slot to write protect
1631  *
1632  * Called to start logging dirty pages after memory region
1633  * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1634  * all present PUD, PMD and PTEs are write protected in the memory region.
1635  * Afterwards read of dirty page log can be called.
1636  *
1637  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1638  * serializing operations for VM memory regions.
1639  */
1640 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1641 {
1642 	struct kvm_memslots *slots = kvm_memslots(kvm);
1643 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1644 	phys_addr_t start, end;
1645 
1646 	if (WARN_ON_ONCE(!memslot))
1647 		return;
1648 
1649 	start = memslot->base_gfn << PAGE_SHIFT;
1650 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1651 
1652 	spin_lock(&kvm->mmu_lock);
1653 	stage2_wp_range(kvm, start, end);
1654 	spin_unlock(&kvm->mmu_lock);
1655 	kvm_flush_remote_tlbs(kvm);
1656 }
1657 
1658 /**
1659  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1660  * @kvm:	The KVM pointer
1661  * @slot:	The memory slot associated with mask
1662  * @gfn_offset:	The gfn offset in memory slot
1663  * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1664  *		slot to be write protected
1665  *
1666  * Walks the bits set in @mask and write protects the associated PTEs. Caller must
1667  * acquire kvm_mmu_lock.
1668  */
1669 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1670 		struct kvm_memory_slot *slot,
1671 		gfn_t gfn_offset, unsigned long mask)
1672 {
1673 	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1674 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1675 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1676 
1677 	stage2_wp_range(kvm, start, end);
1678 }
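
/*
 * Worked example (illustrative numbers): with gfn_offset == 64 and
 * mask == 0b0110, __ffs(mask) == 1 and __fls(mask) == 2, so the range
 * [(slot->base_gfn + 65) << PAGE_SHIFT, (slot->base_gfn + 67) << PAGE_SHIFT)
 * is write protected, i.e. exactly the two pages whose mask bits are set.
 */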
1679 
1680 /*
1681  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1682  * dirty pages.
1683  *
1684  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1685  * enable dirty logging for them.
1686  */
1687 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1688 		struct kvm_memory_slot *slot,
1689 		gfn_t gfn_offset, unsigned long mask)
1690 {
1691 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1692 }
1693 
1694 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1695 {
1696 	__clean_dcache_guest_page(pfn, size);
1697 }
1698 
1699 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1700 {
1701 	__invalidate_icache_guest_page(pfn, size);
1702 }
1703 
1704 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1705 {
1706 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1707 }
1708 
1709 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1710 					       unsigned long hva,
1711 					       unsigned long map_size)
1712 {
1713 	gpa_t gpa_start;
1714 	hva_t uaddr_start, uaddr_end;
1715 	size_t size;
1716 
1717 	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1718 	if (map_size == PAGE_SIZE)
1719 		return true;
1720 
1721 	size = memslot->npages * PAGE_SIZE;
1722 
1723 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
1724 
1725 	uaddr_start = memslot->userspace_addr;
1726 	uaddr_end = uaddr_start + size;
1727 
1728 	/*
1729 	 * Pages belonging to memslots that don't have the same alignment
1730 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1731 	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1732 	 *
1733 	 * Consider a layout like the following:
1734 	 *
1735 	 *    memslot->userspace_addr:
1736 	 *    +-----+--------------------+--------------------+---+
1737 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1738 	 *    +-----+--------------------+--------------------+---+
1739 	 *
1740 	 *    memslot->base_gfn << PAGE_SHIFT:
1741 	 *      +---+--------------------+--------------------+-----+
1742 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1743 	 *      +---+--------------------+--------------------+-----+
1744 	 *
1745 	 * If we create those stage-2 blocks, we'll end up with this incorrect
1746 	 * mapping:
1747 	 *   d -> f
1748 	 *   e -> g
1749 	 *   f -> h
1750 	 */
1751 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1752 		return false;
1753 
1754 	/*
1755 	 * Next, let's make sure we're not trying to map anything not covered
1756 	 * by the memslot. This means we have to prohibit block size mappings
1757 	 * for the beginning and end of a non-block aligned and non-block sized
1758 	 * memory slot (illustrated by the head and tail parts of the
1759 	 * userspace view above containing pages 'abcde' and 'xyz',
1760 	 * respectively).
1761 	 *
1762 	 * Note that it doesn't matter if we do the check using the
1763 	 * userspace_addr or the base_gfn, as both are equally aligned (per
1764 	 * the check above) and equally sized.
1765 	 */
1766 	return (hva & ~(map_size - 1)) >= uaddr_start &&
1767 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1768 }
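
/*
 * Example (made-up layout, 4kB pages, PMD_SIZE == 2MB): a memslot whose IPA
 * starts at 0x80200000 (2MB aligned) but whose userspace address starts at
 * 0x00007f4512100000 (1MB into a 2MB block) has mismatched offsets within a
 * block (0 vs 1MB), so the first check above fails and every fault in the
 * slot falls back to PAGE_SIZE mappings.
 */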
1769 
1770 /*
1771  * Check if the given hva is backed by a transparent huge page (THP) and
1772  * whether it can be mapped using block mapping in stage2. If so, adjust
1773  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1774  * supported. This will need to be updated to support other THP sizes.
1775  *
1776  * Returns the size of the mapping.
1777  */
1778 static unsigned long
1779 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1780 			    unsigned long hva, kvm_pfn_t *pfnp,
1781 			    phys_addr_t *ipap)
1782 {
1783 	kvm_pfn_t pfn = *pfnp;
1784 
1785 	/*
1786 	 * Make sure the adjustment is done only for THP pages. Also make
1787 	 * sure that the HVA and IPA are sufficiently aligned and that the
1788 	 * block map is contained within the memslot.
1789 	 */
1790 	if (kvm_is_transparent_hugepage(pfn) &&
1791 	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1792 		/*
1793 		 * The address we faulted on is backed by a transparent huge
1794 		 * page.  However, because we map the compound huge page and
1795 		 * not the individual tail page, we need to transfer the
1796 		 * refcount to the head page.  We have to be careful that the
1797 		 * THP doesn't start to split while we are adjusting the
1798 		 * refcounts.
1799 		 *
1800 		 * We are sure this doesn't happen, because mmu_notifier_retry
1801 		 * was successful and we are holding the mmu_lock, so if this
1802 		 * THP is trying to split, it will be blocked in the mmu
1803 		 * notifier before touching any of the pages, specifically
1804 		 * before being able to call __split_huge_page_refcount().
1805 		 *
1806 		 * We can therefore safely transfer the refcount from PG_tail
1807 		 * to PG_head and switch the pfn from a tail page to the head
1808 		 * page accordingly.
1809 		 */
1810 		*ipap &= PMD_MASK;
1811 		kvm_release_pfn_clean(pfn);
1812 		pfn &= ~(PTRS_PER_PMD - 1);
1813 		kvm_get_pfn(pfn);
1814 		*pfnp = pfn;
1815 
1816 		return PMD_SIZE;
1817 	}
1818 
1819 	/* Use page mapping if we cannot use block mapping. */
1820 	return PAGE_SIZE;
1821 }
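
/*
 * Example (illustrative numbers, 4kB pages): a fault at IPA 0x80123000 that
 * is backed by a 2MB THP comes back with *ipap rounded down to 0x80000000
 * and the pfn rounded down to the 512-page aligned head of the compound
 * page, so user_mem_abort() can install a single PMD block mapping that
 * covers the whole huge page.
 */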
1822 
1823 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1824 			  struct kvm_memory_slot *memslot, unsigned long hva,
1825 			  unsigned long fault_status)
1826 {
1827 	int ret;
1828 	bool write_fault, writable, force_pte = false;
1829 	bool exec_fault, needs_exec;
1830 	unsigned long mmu_seq;
1831 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1832 	struct kvm *kvm = vcpu->kvm;
1833 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1834 	struct vm_area_struct *vma;
1835 	short vma_shift;
1836 	kvm_pfn_t pfn;
1837 	pgprot_t mem_type = PAGE_S2;
1838 	bool logging_active = memslot_is_logging(memslot);
1839 	unsigned long vma_pagesize, flags = 0;
1840 
1841 	write_fault = kvm_is_write_fault(vcpu);
1842 	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1843 	VM_BUG_ON(write_fault && exec_fault);
1844 
1845 	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1846 		kvm_err("Unexpected L2 read permission error\n");
1847 		return -EFAULT;
1848 	}
1849 
1850 	/* Let's check if we will get back a huge page backed by hugetlbfs */
1851 	mmap_read_lock(current->mm);
1852 	vma = find_vma_intersection(current->mm, hva, hva + 1);
1853 	if (unlikely(!vma)) {
1854 		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1855 		mmap_read_unlock(current->mm);
1856 		return -EFAULT;
1857 	}
1858 
1859 	if (is_vm_hugetlb_page(vma))
1860 		vma_shift = huge_page_shift(hstate_vma(vma));
1861 	else
1862 		vma_shift = PAGE_SHIFT;
1863 
1864 	vma_pagesize = 1ULL << vma_shift;
1865 	if (logging_active ||
1866 	    (vma->vm_flags & VM_PFNMAP) ||
1867 	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1868 		force_pte = true;
1869 		vma_pagesize = PAGE_SIZE;
1870 	}
1871 
1872 	/*
1873 	 * The stage2 page tables have a minimum of 2 levels (for arm64 see
1874 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1875 	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1876 	 * As for PUD huge maps, we must make sure that we have at least
1877 	 * 3 levels, i.e, PMD is not folded.
1878 	 */
1879 	if (vma_pagesize == PMD_SIZE ||
1880 	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1881 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1882 	mmap_read_unlock(current->mm);
1883 
1884 	/* We need minimum second+third level pages */
1885 	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
1886 				     KVM_NR_MEM_OBJS);
1887 	if (ret)
1888 		return ret;
1889 
1890 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1891 	/*
1892 	 * Ensure the read of mmu_notifier_seq happens before we call
1893 	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1894 	 * the page we just got a reference to getting unmapped before we have a
1895 	 * chance to grab the mmu_lock, which ensures that if the page gets
1896 	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1897 	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1898 	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1899 	 */
1900 	smp_rmb();
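	/*
	 * Illustrative note on the race closed by this ordering: if the
	 * page is unmapped between gfn_to_pfn_prot() below and taking the
	 * mmu_lock, the MMU notifier will have bumped mmu_notifier_seq,
	 * and the mmu_notifier_retry() check under the lock makes us bail
	 * out rather than install a stale translation.
	 */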
1901 
1902 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1903 	if (pfn == KVM_PFN_ERR_HWPOISON) {
1904 		kvm_send_hwpoison_signal(hva, vma_shift);
1905 		return 0;
1906 	}
1907 	if (is_error_noslot_pfn(pfn))
1908 		return -EFAULT;
1909 
1910 	if (kvm_is_device_pfn(pfn)) {
1911 		mem_type = PAGE_S2_DEVICE;
1912 		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1913 	} else if (logging_active) {
1914 		/*
1915 		 * Faults on pages in a memslot with logging enabled
1916 		 * should not be mapped with huge pages (it introduces churn
1917 		 * and performance degradation), so force a pte mapping.
1918 		 */
1919 		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1920 
1921 		/*
1922 		 * Only actually map the page as writable if this was a write
1923 		 * fault.
1924 		 */
1925 		if (!write_fault)
1926 			writable = false;
1927 	}
1928 
1929 	if (exec_fault && is_iomap(flags))
1930 		return -ENOEXEC;
1931 
1932 	spin_lock(&kvm->mmu_lock);
1933 	if (mmu_notifier_retry(kvm, mmu_seq))
1934 		goto out_unlock;
1935 
1936 	/*
1937 	 * If we are not forced to use page mapping, check if we are
1938 	 * backed by a THP and thus use block mapping if possible.
1939 	 */
1940 	if (vma_pagesize == PAGE_SIZE && !force_pte)
1941 		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1942 							   &pfn, &fault_ipa);
1943 	if (writable)
1944 		kvm_set_pfn_dirty(pfn);
1945 
1946 	if (fault_status != FSC_PERM && !is_iomap(flags))
1947 		clean_dcache_guest_page(pfn, vma_pagesize);
1948 
1949 	if (exec_fault)
1950 		invalidate_icache_guest_page(pfn, vma_pagesize);
1951 
1952 	/*
1953 	 * If we took an execution fault we have made the
1954 	 * icache/dcache coherent above and should now let the s2
1955 	 * mapping be executable.
1956 	 *
1957 	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1958 	 * execute permissions, and we preserve whatever we have.
1959 	 */
1960 	needs_exec = exec_fault ||
1961 		(fault_status == FSC_PERM &&
1962 		 stage2_is_exec(kvm, fault_ipa, vma_pagesize));
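	/*
	 * For example (editor's illustration): a write permission fault on
	 * a page already mapped executable keeps exec via stage2_is_exec(),
	 * whereas a translation fault for a data access leaves the new
	 * mapping non-executable until an execution fault occurs.
	 */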
1963 
1964 	if (vma_pagesize == PUD_SIZE) {
1965 		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1966 
1967 		new_pud = kvm_pud_mkhuge(new_pud);
1968 		if (writable)
1969 			new_pud = kvm_s2pud_mkwrite(new_pud);
1970 
1971 		if (needs_exec)
1972 			new_pud = kvm_s2pud_mkexec(new_pud);
1973 
1974 		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
1975 	} else if (vma_pagesize == PMD_SIZE) {
1976 		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1977 
1978 		new_pmd = kvm_pmd_mkhuge(new_pmd);
1979 
1980 		if (writable)
1981 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1982 
1983 		if (needs_exec)
1984 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
1985 
1986 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1987 	} else {
1988 		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1989 
1990 		if (writable) {
1991 			new_pte = kvm_s2pte_mkwrite(new_pte);
1992 			mark_page_dirty(kvm, gfn);
1993 		}
1994 
1995 		if (needs_exec)
1996 			new_pte = kvm_s2pte_mkexec(new_pte);
1997 
1998 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1999 	}
2000 
2001 out_unlock:
2002 	spin_unlock(&kvm->mmu_lock);
2003 	kvm_set_pfn_accessed(pfn);
2004 	kvm_release_pfn_clean(pfn);
2005 	return ret;
2006 }
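/*
 * Rough sketch of the flow above (an editor's summary, not original kernel
 * documentation): pick a candidate mapping size from the VMA, top up the
 * per-vCPU page table cache, resolve hva->pfn with gfn_to_pfn_prot(), then,
 * under mmu_lock and after the mmu_notifier_retry() check, install a PUD,
 * PMD or PTE stage-2 descriptor with write/exec permissions derived from
 * the fault.
 */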
2007 
2008 /*
2009  * Resolve the access fault by making the page young again.
2010  * Note that because the faulting entry is guaranteed not to be
2011  * cached in the TLB, we don't need to invalidate anything.
2012  * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
2013  * so there is no need for atomic (pte|pmd)_mkyoung operations.
2014  */
2015 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
2016 {
2017 	pud_t *pud;
2018 	pmd_t *pmd;
2019 	pte_t *pte;
2020 	kvm_pfn_t pfn;
2021 	bool pfn_valid = false;
2022 
2023 	trace_kvm_access_fault(fault_ipa);
2024 
2025 	spin_lock(&vcpu->kvm->mmu_lock);
2026 
2027 	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
2028 		goto out;
2029 
2030 	if (pud) {		/* HugeTLB */
2031 		*pud = kvm_s2pud_mkyoung(*pud);
2032 		pfn = kvm_pud_pfn(*pud);
2033 		pfn_valid = true;
2034 	} else	if (pmd) {	/* THP, HugeTLB */
2035 		*pmd = pmd_mkyoung(*pmd);
2036 		pfn = pmd_pfn(*pmd);
2037 		pfn_valid = true;
2038 	} else {
2039 		*pte = pte_mkyoung(*pte);	/* Just a page... */
2040 		pfn = pte_pfn(*pte);
2041 		pfn_valid = true;
2042 	}
2043 
2044 out:
2045 	spin_unlock(&vcpu->kvm->mmu_lock);
2046 	if (pfn_valid)
2047 		kvm_set_pfn_accessed(pfn);
2048 }
2049 
2050 /**
2051  * kvm_handle_guest_abort - handles all 2nd stage aborts
2052  * @vcpu:	the VCPU pointer
2053  * @run:	the kvm_run structure
2054  *
2055  * Any abort that gets to the host is almost guaranteed to be caused by a
2056  * missing second stage translation table entry, which can mean either that the
2057  * guest simply needs more memory and we must allocate an appropriate page, or
2058  * that the guest tried to access I/O memory, which is emulated by user space.
2059  * The distinction is based on the IPA causing the fault and whether this
2060  * memory region has been registered as standard RAM by user space.
2061  */
2062 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
2063 {
2064 	unsigned long fault_status;
2065 	phys_addr_t fault_ipa;
2066 	struct kvm_memory_slot *memslot;
2067 	unsigned long hva;
2068 	bool is_iabt, write_fault, writable;
2069 	gfn_t gfn;
2070 	int ret, idx;
2071 
2072 	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
2073 
2074 	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
2075 	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
2076 
2077 	/* Synchronous External Abort? */
2078 	if (kvm_vcpu_dabt_isextabt(vcpu)) {
2079 		/*
2080 		 * For RAS the host kernel may handle this abort.
2081 		 * There is no need to pass the error into the guest.
2082 		 */
2083 		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
2084 			return 1;
2085 
2086 		if (unlikely(!is_iabt)) {
2087 			kvm_inject_vabt(vcpu);
2088 			return 1;
2089 		}
2090 	}
2091 
2092 	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
2093 			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
2094 
2095 	/* Check the stage-2 fault is a translation, permission or access fault */
2096 	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
2097 	    fault_status != FSC_ACCESS) {
2098 		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
2099 			kvm_vcpu_trap_get_class(vcpu),
2100 			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
2101 			(unsigned long)kvm_vcpu_get_hsr(vcpu));
2102 		return -EFAULT;
2103 	}
2104 
2105 	idx = srcu_read_lock(&vcpu->kvm->srcu);
2106 
2107 	gfn = fault_ipa >> PAGE_SHIFT;
2108 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
2109 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
2110 	write_fault = kvm_is_write_fault(vcpu);
2111 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
2112 		if (is_iabt) {
2113 			/* Prefetch Abort on I/O address */
2114 			ret = -ENOEXEC;
2115 			goto out;
2116 		}
2117 
2118 		/*
2119 		 * Check for a cache maintenance operation. Since we
2120 		 * ended up here, we know it is outside of any memory
2121 		 * slot. But we can't find out if that is for a device,
2122 		 * or if the guest is just being stupid. The only thing
2123 		 * we know for sure is that this range cannot be cached.
2124 		 *
2125 		 * So let's assume that the guest is just being
2126 		 * cautious, and skip the instruction.
2127 		 */
2128 		if (kvm_vcpu_dabt_is_cm(vcpu)) {
2129 			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
2130 			ret = 1;
2131 			goto out_unlock;
2132 		}
2133 
2134 		/*
2135 		 * The IPA is reported as [MAX:12], so we need to
2136 		 * complement it with the bottom 12 bits from the
2137 		 * faulting VA. This is always 12 bits, irrespective
2138 		 * of the page size.
2139 		 */
2140 		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
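		/*
		 * Worked example (illustrative): if the reported IPA is
		 * 0x8123000 and the low 12 bits of the faulting VA are
		 * 0x456, the IPA handed to io_mem_abort() becomes
		 * 0x8123456, whatever the page size in use.
		 */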
2141 		ret = io_mem_abort(vcpu, run, fault_ipa);
2142 		goto out_unlock;
2143 	}
2144 
2145 	/* Userspace should not be able to register out-of-bounds IPAs */
2146 	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
2147 
2148 	if (fault_status == FSC_ACCESS) {
2149 		handle_access_fault(vcpu, fault_ipa);
2150 		ret = 1;
2151 		goto out_unlock;
2152 	}
2153 
2154 	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
2155 	if (ret == 0)
2156 		ret = 1;
2157 out:
2158 	if (ret == -ENOEXEC) {
2159 		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2160 		ret = 1;
2161 	}
2162 out_unlock:
2163 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2164 	return ret;
2165 }
2166 
2167 static int handle_hva_to_gpa(struct kvm *kvm,
2168 			     unsigned long start,
2169 			     unsigned long end,
2170 			     int (*handler)(struct kvm *kvm,
2171 					    gpa_t gpa, u64 size,
2172 					    void *data),
2173 			     void *data)
2174 {
2175 	struct kvm_memslots *slots;
2176 	struct kvm_memory_slot *memslot;
2177 	int ret = 0;
2178 
2179 	slots = kvm_memslots(kvm);
2180 
2181 	/* we only care about the pages that the guest sees */
2182 	kvm_for_each_memslot(memslot, slots) {
2183 		unsigned long hva_start, hva_end;
2184 		gfn_t gpa;
2185 
2186 		hva_start = max(start, memslot->userspace_addr);
2187 		hva_end = min(end, memslot->userspace_addr +
2188 					(memslot->npages << PAGE_SHIFT));
2189 		if (hva_start >= hva_end)
2190 			continue;
2191 
2192 		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
2193 		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
2194 	}
2195 
2196 	return ret;
2197 }
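/*
 * Illustrative example (editor's note, assuming 4K pages): for a memslot
 * with userspace_addr 0x7f0000000000 and base_gfn 0x80000 (guest PA
 * 0x80000000), a notification for the single page at HVA 0x7f0000003000
 * yields gpa = (0x80000 + 3) << PAGE_SHIFT = 0x80003000 and a size of one
 * page for the handler.
 */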
2198 
2199 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2200 {
2201 	unmap_stage2_range(kvm, gpa, size);
2202 	return 0;
2203 }
2204 
2205 int kvm_unmap_hva_range(struct kvm *kvm,
2206 			unsigned long start, unsigned long end)
2207 {
2208 	if (!kvm->arch.pgd)
2209 		return 0;
2210 
2211 	trace_kvm_unmap_hva_range(start, end);
2212 	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
2213 	return 0;
2214 }
2215 
2216 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2217 {
2218 	pte_t *pte = (pte_t *)data;
2219 
2220 	WARN_ON(size != PAGE_SIZE);
2221 	/*
2222 	 * We can always call stage2_set_pte with the KVM_S2_FLAG_LOGGING_ACTIVE
2223 	 * flag clear because MMU notifiers will have unmapped a huge PMD before
2224 	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2225 	 * therefore stage2_set_pte() never needs to clear out a huge PMD
2226 	 * through this calling path.
2227 	 */
2228 	stage2_set_pte(kvm, NULL, gpa, pte, 0);
2229 	return 0;
2230 }
2231 
2233 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2234 {
2235 	unsigned long end = hva + PAGE_SIZE;
2236 	kvm_pfn_t pfn = pte_pfn(pte);
2237 	pte_t stage2_pte;
2238 
2239 	if (!kvm->arch.pgd)
2240 		return 0;
2241 
2242 	trace_kvm_set_spte_hva(hva);
2243 
2244 	/*
2245 	 * We've moved a page around, probably through CoW, so let's treat it
2246 	 * just like a translation fault and clean the cache to the PoC.
2247 	 */
2248 	clean_dcache_guest_page(pfn, PAGE_SIZE);
2249 	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
2250 	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2251 
2252 	return 0;
2253 }
2254 
2255 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2256 {
2257 	pud_t *pud;
2258 	pmd_t *pmd;
2259 	pte_t *pte;
2260 
2261 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2262 	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2263 		return 0;
2264 
2265 	if (pud)
2266 		return stage2_pudp_test_and_clear_young(pud);
2267 	else if (pmd)
2268 		return stage2_pmdp_test_and_clear_young(pmd);
2269 	else
2270 		return stage2_ptep_test_and_clear_young(pte);
2271 }
2272 
2273 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2274 {
2275 	pud_t *pud;
2276 	pmd_t *pmd;
2277 	pte_t *pte;
2278 
2279 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2280 	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2281 		return 0;
2282 
2283 	if (pud)
2284 		return kvm_s2pud_young(*pud);
2285 	else if (pmd)
2286 		return pmd_young(*pmd);
2287 	else
2288 		return pte_young(*pte);
2289 }
2290 
2291 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2292 {
2293 	if (!kvm->arch.pgd)
2294 		return 0;
2295 	trace_kvm_age_hva(start, end);
2296 	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2297 }
2298 
2299 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2300 {
2301 	if (!kvm->arch.pgd)
2302 		return 0;
2303 	trace_kvm_test_age_hva(hva);
2304 	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2305 				 kvm_test_age_hva_handler, NULL);
2306 }
2307 
2308 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2309 {
2310 	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
2311 }
2312 
2313 phys_addr_t kvm_mmu_get_httbr(void)
2314 {
2315 	if (__kvm_cpu_uses_extended_idmap())
2316 		return virt_to_phys(merged_hyp_pgd);
2317 	else
2318 		return virt_to_phys(hyp_pgd);
2319 }
2320 
2321 phys_addr_t kvm_get_idmap_vector(void)
2322 {
2323 	return hyp_idmap_vector;
2324 }
2325 
2326 static int kvm_map_idmap_text(pgd_t *pgd)
2327 {
2328 	int err;
2329 
2330 	/* Create the idmap in the boot page tables */
2331 	err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
2332 				      hyp_idmap_start, hyp_idmap_end,
2333 				      __phys_to_pfn(hyp_idmap_start),
2334 				      PAGE_HYP_EXEC);
2335 	if (err)
2336 		kvm_err("Failed to idmap %lx-%lx\n",
2337 			hyp_idmap_start, hyp_idmap_end);
2338 
2339 	return err;
2340 }
2341 
2342 int kvm_mmu_init(void)
2343 {
2344 	int err;
2345 
2346 	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2347 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2348 	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2349 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2350 	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2351 
2352 	/*
2353 	 * We rely on the linker script to ensure at build time that the HYP
2354 	 * init code does not cross a page boundary.
2355 	 */
2356 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
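	/*
	 * Example of the check above (illustrative, assuming 4K pages):
	 * start = 0x40a13000 and end - 1 = 0x40a13fff share a page frame,
	 * so (start ^ (end - 1)) & PAGE_MASK is 0 and the BUG_ON() stays
	 * quiet; if the init code spilled into the next page, the XOR
	 * would expose a differing page-frame bit.
	 */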
2357 
2358 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2359 	kvm_debug("HYP VA range: %lx:%lx\n",
2360 		  kern_hyp_va(PAGE_OFFSET),
2361 		  kern_hyp_va((unsigned long)high_memory - 1));
2362 
2363 	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2364 	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2365 	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2366 		/*
2367 		 * The idmap page intersects with the VA space; it is
2368 		 * not safe to continue further.
2369 		 */
2370 		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2371 		err = -EINVAL;
2372 		goto out;
2373 	}
2374 
2375 	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
2376 	if (!hyp_pgd) {
2377 		kvm_err("Hyp mode PGD not allocated\n");
2378 		err = -ENOMEM;
2379 		goto out;
2380 	}
2381 
2382 	if (__kvm_cpu_uses_extended_idmap()) {
2383 		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2384 							 hyp_pgd_order);
2385 		if (!boot_hyp_pgd) {
2386 			kvm_err("Hyp boot PGD not allocated\n");
2387 			err = -ENOMEM;
2388 			goto out;
2389 		}
2390 
2391 		err = kvm_map_idmap_text(boot_hyp_pgd);
2392 		if (err)
2393 			goto out;
2394 
2395 		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
2396 		if (!merged_hyp_pgd) {
2397 			kvm_err("Failed to allocate extra HYP pgd\n");
			err = -ENOMEM;
2398 			goto out;
2399 		}
2400 		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
2401 				    hyp_idmap_start);
2402 	} else {
2403 		err = kvm_map_idmap_text(hyp_pgd);
2404 		if (err)
2405 			goto out;
2406 	}
2407 
2408 	io_map_base = hyp_idmap_start;
2409 	return 0;
2410 out:
2411 	free_hyp_pgds();
2412 	return err;
2413 }
2414 
2415 void kvm_arch_commit_memory_region(struct kvm *kvm,
2416 				   const struct kvm_userspace_memory_region *mem,
2417 				   struct kvm_memory_slot *old,
2418 				   const struct kvm_memory_slot *new,
2419 				   enum kvm_mr_change change)
2420 {
2421 	/*
2422 	 * At this point memslot has been committed and there is an
2423 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
2424 	 * memory slot is write protected.
2425 	 */
2426 	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2427 		/*
2428 		 * If we're using initial-all-set, we don't need to write
2429 		 * protect any pages because they're all reported as dirty.
2430 		 * Huge pages and normal pages will be write protected gradually.
2431 		 */
2432 		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2433 			kvm_mmu_wp_memory_region(kvm, mem->slot);
2434 		}
2435 	}
2436 }
2437 
2438 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2439 				   struct kvm_memory_slot *memslot,
2440 				   const struct kvm_userspace_memory_region *mem,
2441 				   enum kvm_mr_change change)
2442 {
2443 	hva_t hva = mem->userspace_addr;
2444 	hva_t reg_end = hva + mem->memory_size;
2445 	bool writable = !(mem->flags & KVM_MEM_READONLY);
2446 	int ret = 0;
2447 
2448 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2449 			change != KVM_MR_FLAGS_ONLY)
2450 		return 0;
2451 
2452 	/*
2453 	 * Prevent userspace from creating a memory region outside of the IPA
2454 	 * space addressable by the KVM guest.
2455 	 */
2456 	if (memslot->base_gfn + memslot->npages >=
2457 	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
2458 		return -EFAULT;
2459 
2460 	mmap_read_lock(current->mm);
2461 	/*
2462 	 * A memory region could potentially cover multiple VMAs, and any holes
2463 	 * between them, so iterate over all of them to find out if we can map
2464 	 * any of them right now.
2465 	 *
2466 	 *     +--------------------------------------------+
2467 	 * +---------------+----------------+   +----------------+
2468 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2469 	 * +---------------+----------------+   +----------------+
2470 	 *     |               memory region                |
2471 	 *     +--------------------------------------------+
2472 	 */
2473 	do {
2474 		struct vm_area_struct *vma = find_vma(current->mm, hva);
2475 		hva_t vm_start, vm_end;
2476 
2477 		if (!vma || vma->vm_start >= reg_end)
2478 			break;
2479 
2480 		/*
2481 		 * Take the intersection of this VMA with the memory region
2482 		 */
2483 		vm_start = max(hva, vma->vm_start);
2484 		vm_end = min(reg_end, vma->vm_end);
2485 
2486 		if (vma->vm_flags & VM_PFNMAP) {
2487 			gpa_t gpa = mem->guest_phys_addr +
2488 				    (vm_start - mem->userspace_addr);
2489 			phys_addr_t pa;
2490 
2491 			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2492 			pa += vm_start - vma->vm_start;
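			/*
			 * Illustrative numbers (editor's note): a VM_PFNMAP
			 * VMA with vm_pgoff 0x100000 describes physical
			 * address 0x100000000; if the memslot intersects it
			 * 0x2000 bytes in, pa becomes 0x100002000 and is
			 * handed to kvm_phys_addr_ioremap() below.
			 */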
2493 
2494 			/* IO region dirty page logging not allowed */
2495 			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2496 				ret = -EINVAL;
2497 				goto out;
2498 			}
2499 
2500 			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2501 						    vm_end - vm_start,
2502 						    writable);
2503 			if (ret)
2504 				break;
2505 		}
2506 		hva = vm_end;
2507 	} while (hva < reg_end);
2508 
2509 	if (change == KVM_MR_FLAGS_ONLY)
2510 		goto out;
2511 
2512 	spin_lock(&kvm->mmu_lock);
2513 	if (ret)
2514 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
2515 	else
2516 		stage2_flush_memslot(kvm, memslot);
2517 	spin_unlock(&kvm->mmu_lock);
2518 out:
2519 	mmap_read_unlock(current->mm);
2520 	return ret;
2521 }
2522 
2523 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2524 {
2525 }
2526 
2527 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2528 {
2529 }
2530 
2531 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2532 {
2533 	kvm_free_stage2_pgd(kvm);
2534 }
2535 
2536 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2537 				   struct kvm_memory_slot *slot)
2538 {
2539 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2540 	phys_addr_t size = slot->npages << PAGE_SHIFT;
2541 
2542 	spin_lock(&kvm->mmu_lock);
2543 	unmap_stage2_range(kvm, gpa, size);
2544 	spin_unlock(&kvm->mmu_lock);
2545 }
2546 
2547 /*
2548  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2549  *
2550  * Main problems:
2551  * - S/W ops are local to a CPU (not broadcast)
2552  * - We have line migration behind our back (speculation)
2553  * - System caches don't support S/W at all (damn!)
2554  *
2555  * In the face of the above, the best we can do is to try and convert
2556  * S/W ops to VA ops. Because the guest is not allowed to infer the
2557  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2558  * which is a rather good thing for us.
2559  *
2560  * Also, it is only used when turning caches on/off ("The expected
2561  * usage of the cache maintenance instructions that operate by set/way
2562  * is associated with the cache maintenance instructions associated
2563  * with the powerdown and powerup of caches, if this is required by
2564  * the implementation.").
2565  *
2566  * We use the following policy:
2567  *
2568  * - If we trap a S/W operation, we enable VM trapping to detect
2569  *   caches being turned on/off, and do a full clean.
2570  *
2571  * - We flush the caches when they are turned on and when they are turned off.
2572  *
2573  * - Once the caches are enabled, we stop trapping VM ops.
2574  */
2575 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2576 {
2577 	unsigned long hcr = *vcpu_hcr(vcpu);
2578 
2579 	/*
2580 	 * If this is the first time we do a S/W operation
2581 	 * (i.e. HCR_TVM not set), flush the whole memory, and enable
2582 	 * VM trapping.
2583 	 *
2584 	 * Otherwise, rely on the VM trapping to wait for the MMU +
2585 	 * Caches to be turned off. At that point, we'll be able to
2586 	 * clean the caches again.
2587 	 */
2588 	if (!(hcr & HCR_TVM)) {
2589 		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2590 					vcpu_has_cache_enabled(vcpu));
2591 		stage2_flush_vm(vcpu->kvm);
2592 		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2593 	}
2594 }
2595 
2596 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2597 {
2598 	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2599 
2600 	/*
2601 	 * If switching the MMU+caches on, we need to invalidate the caches.
2602 	 * If switching them off, we need to clean the caches.
2603 	 * Clean + invalidate always does the trick.
2604 	 */
2605 	if (now_enabled != was_enabled)
2606 		stage2_flush_vm(vcpu->kvm);
2607 
2608 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2609 	if (now_enabled)
2610 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2611 
2612 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2613 }
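/*
 * Putting the two helpers above together (an illustrative summary, not
 * original documentation): a guest using S/W ops around a cache power-down
 * typically hits kvm_set_way_flush() first (full stage-2 flush, HCR_TVM
 * set), then kvm_toggle_cache() when the caches are turned off (flush
 * again) and once more when they come back on, at which point HCR_TVM is
 * cleared and VM register traps stop.
 */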
2614