// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
 * No bombay mix was harmed in the writing of this file.
 *
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>

#define KVM_PGTABLE_MAX_LEVELS		4U

#define KVM_PTE_VALID			BIT(0)

#define KVM_PTE_TYPE			BIT(1)
#define KVM_PTE_TYPE_BLOCK		0
#define KVM_PTE_TYPE_PAGE		1
#define KVM_PTE_TYPE_TABLE		1

#define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_48		GENMASK(15, 12)

#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)

#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	3
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	1
#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)

#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)

#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)

#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
					 KVM_PTE_LEAF_ATTR_HI_S2_XN)

struct kvm_pgtable_walk_data {
	struct kvm_pgtable		*pgt;
	struct kvm_pgtable_walker	*walker;

	u64				addr;
	u64				end;
};

static u64 kvm_granule_shift(u32 level)
{
	/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
	return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
}

static u64 kvm_granule_size(u32 level)
{
	return BIT(kvm_granule_shift(level));
}

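/*
 * Worked example, assuming 4K pages (PAGE_SHIFT == 12):
 * ARM64_HW_PGTABLE_LEVEL_SHIFT(level) is (PAGE_SHIFT - 3) * (4 - level) + 3,
 * giving shifts of 39/30/21/12 for levels 0/1/2/3 and granule sizes of
 * 512GB, 1GB, 2MB and 4KB respectively.
 */
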
static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
{
	u64 granule = kvm_granule_size(level);

	/*
	 * Reject invalid block mappings and don't bother with 4TB mappings for
	 * 52-bit PAs.
	 */
	if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
		return false;

	if (granule > (end - addr))
		return false;

	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
}

static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
{
	u64 shift = kvm_granule_shift(level);
	u64 mask = BIT(PAGE_SHIFT - 3) - 1;

	return (data->addr >> shift) & mask;
}

static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
	u64 mask = BIT(pgt->ia_bits) - 1;

	return (addr & mask) >> shift;
}

static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
{
	return __kvm_pgd_page_idx(data->pgt, data->addr);
}

static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
{
	struct kvm_pgtable pgt = {
		.ia_bits	= ia_bits,
		.start_level	= start_level,
	};

	return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}

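/*
 * For example, a 40-bit input address space with 4K pages and a start
 * level of 1 uses kvm_granule_shift(0) == 39, so the highest address
 * gives ((1ULL << 40) - 1) >> 39 == 1 and kvm_pgd_pages() reports a PGD
 * of two concatenated pages.
 */
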
static bool kvm_pte_valid(kvm_pte_t pte)
{
	return pte & KVM_PTE_VALID;
}

static bool kvm_pte_table(kvm_pte_t pte, u32 level)
{
	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
		return false;

	if (!kvm_pte_valid(pte))
		return false;

	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}

static u64 kvm_pte_to_phys(kvm_pte_t pte)
{
	u64 pa = pte & KVM_PTE_ADDR_MASK;

	if (PAGE_SHIFT == 16)
		pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;

	return pa;
}

static kvm_pte_t kvm_phys_to_pte(u64 pa)
{
	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;

	if (PAGE_SHIFT == 16)
		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);

	return pte;
}

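/*
 * 52-bit physical addresses are only possible with 64K pages
 * (PAGE_SHIFT == 16), in which case PA bits [51:48] live in descriptor
 * bits [15:12]; the two helpers above pack and unpack them accordingly.
 */
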
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
{
	return __va(kvm_pte_to_phys(pte));
}

static void kvm_set_invalid_pte(kvm_pte_t *ptep)
{
	kvm_pte_t pte = *ptep;
	WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
}

static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
{
	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));

	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
	pte |= KVM_PTE_VALID;

	WARN_ON(kvm_pte_valid(old));
	smp_store_release(ptep, pte);
}

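/*
 * Note that the store-release in kvm_set_table_pte() orders the zeroing
 * of a newly allocated table against publishing the table PTE that
 * points to it, so no software walker can follow the new entry to
 * uninitialised memory; visibility to the hardware walker is ensured
 * separately, by the dsb(ishst) issued after the map walks below.
 */
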
static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
{
	kvm_pte_t pte = kvm_phys_to_pte(pa);
	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
							   KVM_PTE_TYPE_BLOCK;

	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
	pte |= KVM_PTE_VALID;

	return pte;
}

static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
				  u32 level, kvm_pte_t *ptep,
				  enum kvm_pgtable_walk_flags flag)
{
	struct kvm_pgtable_walker *walker = data->walker;
	return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      kvm_pte_t *pgtable, u32 level);

static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      kvm_pte_t *ptep, u32 level)
{
	int ret = 0;
	u64 addr = data->addr;
	kvm_pte_t *childp, pte = *ptep;
	bool table = kvm_pte_table(pte, level);
	enum kvm_pgtable_walk_flags flags = data->walker->flags;

	if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
					     KVM_PGTABLE_WALK_TABLE_PRE);
	}

	if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
					     KVM_PGTABLE_WALK_LEAF);
		pte = *ptep;
		table = kvm_pte_table(pte, level);
	}

	if (ret)
		goto out;

	if (!table) {
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	childp = kvm_pte_follow(pte);
	ret = __kvm_pgtable_walk(data, childp, level + 1);
	if (ret)
		goto out;

	if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
					     KVM_PGTABLE_WALK_TABLE_POST);
	}

out:
	return ret;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      kvm_pte_t *pgtable, u32 level)
{
	u32 idx;
	int ret = 0;

	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
		return -EINVAL;

	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
		kvm_pte_t *ptep = &pgtable[idx];

		if (data->addr >= data->end)
			break;

		ret = __kvm_pgtable_visit(data, ptep, level);
		if (ret)
			break;
	}

	return ret;
}

static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
{
	u32 idx;
	int ret = 0;
	struct kvm_pgtable *pgt = data->pgt;
	u64 limit = BIT(pgt->ia_bits);

	if (data->addr > limit || data->end > limit)
		return -ERANGE;

	if (!pgt->pgd)
		return -EINVAL;

	for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
		kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];

		ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
		if (ret)
			break;
	}

	return ret;
}

int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
	struct kvm_pgtable_walk_data walk_data = {
		.pgt	= pgt,
		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
		.end	= PAGE_ALIGN(walk_data.addr + size),
		.walker	= walker,
	};

	return _kvm_pgtable_walk(&walk_data);
}

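/*
 * An illustrative sketch of how a caller might use the walker API: a
 * minimal walker that counts the valid leaf entries in a range. It
 * follows the callback contract used throughout this file: return 0 to
 * continue the walk, or an error code to abort it. The example_*
 * identifiers are hypothetical and kept compiled out.
 */
#if 0	/* example only */
static int example_count_leaves(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
				enum kvm_pgtable_walk_flags flag,
				void * const arg)
{
	u64 *leaves = arg;

	/* Only count valid entries that are not table pointers. */
	if (kvm_pte_valid(*ptep) && !kvm_pte_table(*ptep, level))
		(*leaves)++;

	return 0;
}

static u64 example_count(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 leaves = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= example_count_leaves,
		.arg	= &leaves,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};

	/* Error handling elided for brevity. */
	kvm_pgtable_walk(pgt, addr, size, &walker);
	return leaves;
}
#endif
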
struct hyp_map_data {
	u64		phys;
	kvm_pte_t	attr;
};

static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
				 struct hyp_map_data *data)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;

	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_X) {
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (device)
			return -EINVAL;
	} else {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	}

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	data->attr = attr;
	return 0;
}

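/*
 * For example, a PROT_R | PROT_W hyp mapping of normal memory ends up
 * with ATTRIDX = MT_NORMAL, AP = RW, inner-shareable, AF = 1 and XN = 1:
 * execute permission must be requested explicitly, and is refused for
 * writable or device mappings.
 */
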
static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
				    kvm_pte_t *ptep, struct hyp_map_data *data)
{
	kvm_pte_t new, old = *ptep;
	u64 granule = kvm_granule_size(level), phys = data->phys;

	if (!kvm_block_mapping_supported(addr, end, phys, level))
		return false;

	/* Tolerate KVM recreating the exact same mapping */
	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
	if (old != new && !WARN_ON(kvm_pte_valid(old)))
		smp_store_release(ptep, new);

	data->phys += granule;
	return true;
}

static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			  enum kvm_pgtable_walk_flags flag, void * const arg)
{
	kvm_pte_t *childp;

	if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
		return 0;

	if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
	if (!childp)
		return -ENOMEM;

	kvm_set_table_pte(ptep, childp);
	return 0;
}

int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	int ret;
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};

	ret = hyp_map_set_prot_attr(prot, &map_data);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	isb();
	return ret;
}

int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
{
	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);

	pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= va_bits;
	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
	pgt->mmu		= NULL;
	return 0;
}

static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			   enum kvm_pgtable_walk_flags flag, void * const arg)
{
	free_page((unsigned long)kvm_pte_follow(*ptep));
	return 0;
}

void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_free_walker,
		.flags	= KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	free_page((unsigned long)pgt->pgd);
	pgt->pgd = NULL;
}

struct stage2_map_data {
	u64				phys;
	kvm_pte_t			attr;

	kvm_pte_t			*anchor;

	struct kvm_s2_mmu		*mmu;
	struct kvm_mmu_memory_cache	*memcache;
};

static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
				    struct stage2_map_data *data)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
			    PAGE_S2_MEMATTR(NORMAL);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;

	if (!(prot & KVM_PGTABLE_PROT_X))
		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
	else if (device)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
	data->attr = attr;
	return 0;
}

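/*
 * For example, a PROT_R | PROT_W stage-2 mapping of normal memory gets
 * MemAttr = NORMAL, S2AP_R | S2AP_W, inner-shareable, AF = 1 and XN = 1.
 * Unlike stage-1, there is no single RO/RW field: read and write are
 * independent S2AP bits.
 */
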
static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
				      kvm_pte_t *ptep,
				      struct stage2_map_data *data)
{
	kvm_pte_t new, old = *ptep;
	u64 granule = kvm_granule_size(level), phys = data->phys;
	struct page *page = virt_to_page(ptep);

	if (!kvm_block_mapping_supported(addr, end, phys, level))
		return -E2BIG;

	new = kvm_init_valid_leaf_pte(phys, data->attr, level);
	if (kvm_pte_valid(old)) {
		/*
		 * Skip updating the PTE if we are trying to recreate the exact
		 * same mapping or only change the access permissions. Instead,
		 * the vCPU will exit one more time from the guest if still
		 * needed and then go through the path of relaxing permissions.
		 */
		if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
			return -EAGAIN;

		/*
		 * There's an existing different valid leaf entry, so perform
		 * break-before-make.
		 */
		kvm_set_invalid_pte(ptep);
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
		put_page(page);
	}

	smp_store_release(ptep, new);
	get_page(page);
	data->phys += granule;
	return 0;
}

static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
				     kvm_pte_t *ptep,
				     struct stage2_map_data *data)
{
	if (data->anchor)
		return 0;

	if (!kvm_block_mapping_supported(addr, end, data->phys, level))
		return 0;

	kvm_set_invalid_pte(ptep);

	/*
	 * Invalidate the whole stage-2, as we may have numerous leaf
	 * entries below us which would otherwise need invalidating
	 * individually.
	 */
	kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
	data->anchor = ptep;
	return 0;
}

static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
				struct stage2_map_data *data)
{
	int ret;
	kvm_pte_t *childp, pte = *ptep;
	struct page *page = virt_to_page(ptep);

	if (data->anchor) {
		if (kvm_pte_valid(pte))
			put_page(page);

		return 0;
	}

	ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data);
	if (ret != -E2BIG)
		return ret;

	if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = kvm_mmu_memory_cache_alloc(data->memcache);
	if (!childp)
		return -ENOMEM;

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
	 */
	if (kvm_pte_valid(pte)) {
		kvm_set_invalid_pte(ptep);
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
		put_page(page);
	}

	kvm_set_table_pte(ptep, childp);
	get_page(page);

	return 0;
}

static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
				      kvm_pte_t *ptep,
				      struct stage2_map_data *data)
{
	int ret = 0;

	if (!data->anchor)
		return 0;

	free_page((unsigned long)kvm_pte_follow(*ptep));
	put_page(virt_to_page(ptep));

	if (data->anchor == ptep) {
		data->anchor = NULL;
		ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
	}

	return ret;
}

/*
 * This is a little fiddly, as we use all three of the walk flags. The idea
 * is that the TABLE_PRE callback runs for table entries on the way down,
 * looking for table entries which we could conceivably replace with a
 * block entry for this mapping. If it finds one, then it sets the 'anchor'
 * field in 'struct stage2_map_data' to point at the table entry, before
 * clearing the entry to zero and descending into the now detached table.
 *
 * The behaviour of the LEAF callback then depends on whether or not the
 * anchor has been set. If not, then we're not using a block mapping higher
 * up the table and we perform the mapping at the existing leaves instead.
 * If, on the other hand, the anchor _is_ set, then we drop references to
 * all valid leaves so that the pages beneath the anchor can be freed.
 *
 * Finally, the TABLE_POST callback does nothing if the anchor has not
 * been set, but otherwise frees the page-table pages while walking back up
 * the page-table, installing the block entry when it revisits the anchor
 * pointer and clearing the anchor to NULL.
 */
static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			     enum kvm_pgtable_walk_flags flag, void * const arg)
{
	struct stage2_map_data *data = arg;

	switch (flag) {
	case KVM_PGTABLE_WALK_TABLE_PRE:
		return stage2_map_walk_table_pre(addr, end, level, ptep, data);
	case KVM_PGTABLE_WALK_LEAF:
		return stage2_map_walk_leaf(addr, end, level, ptep, data);
	case KVM_PGTABLE_WALK_TABLE_POST:
		return stage2_map_walk_table_post(addr, end, level, ptep, data);
	}

	return -EINVAL;
}

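/*
 * A worked example of the dance above, assuming 4K pages: mapping 2MB at
 * a suitably aligned IPA that is currently covered by a level-2 table
 * entry. The TABLE_PRE callback sees that a level-2 block would fit, so
 * it invalidates the table entry, flushes the TLB for the whole VMID and
 * records the entry as the anchor. The LEAF callbacks then run over the
 * detached level-3 table, only dropping references on its valid entries.
 * Finally, the TABLE_POST callback frees the detached table page and,
 * finding itself back at the anchor, installs the level-2 block mapping
 * via stage2_map_walk_leaf().
 */
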
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   struct kvm_mmu_memory_cache *mc)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
		.mmu		= pgt->mmu,
		.memcache	= mc,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF |
				  KVM_PGTABLE_WALK_TABLE_POST,
		.arg		= &map_data,
	};

	ret = stage2_map_set_prot_attr(prot, &map_data);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	return ret;
}

static void stage2_flush_dcache(void *addr, u64 size)
{
	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
		return;

	__flush_dcache_area(addr, size);
}

static bool stage2_pte_cacheable(kvm_pte_t pte)
{
	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
	return memattr == PAGE_S2_MEMATTR(NORMAL);
}

static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			       enum kvm_pgtable_walk_flags flag,
			       void * const arg)
{
	struct kvm_s2_mmu *mmu = arg;
	kvm_pte_t pte = *ptep, *childp = NULL;
	bool need_flush = false;

	if (!kvm_pte_valid(pte))
		return 0;

	if (kvm_pte_table(pte, level)) {
		childp = kvm_pte_follow(pte);

		if (page_count(virt_to_page(childp)) != 1)
			return 0;
	} else if (stage2_pte_cacheable(pte)) {
		need_flush = true;
	}

	/*
	 * This is similar to the map() path in that we unmap the entire
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
	kvm_set_invalid_pte(ptep);
	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
	put_page(virt_to_page(ptep));

	if (need_flush) {
		stage2_flush_dcache(kvm_pte_follow(pte),
				    kvm_granule_size(level));
	}

	if (childp)
		free_page((unsigned long)childp);

	return 0;
}

int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_unmap_walker,
		.arg	= pgt->mmu,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

struct stage2_attr_data {
	kvm_pte_t	attr_set;
	kvm_pte_t	attr_clr;
	kvm_pte_t	pte;
	u32		level;
};

static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			      enum kvm_pgtable_walk_flags flag,
			      void * const arg)
{
	kvm_pte_t pte = *ptep;
	struct stage2_attr_data *data = arg;

	if (!kvm_pte_valid(pte))
		return 0;

	data->level = level;
	data->pte = pte;
	pte &= ~data->attr_clr;
	pte |= data->attr_set;

	/*
	 * We may race with the CPU trying to set the access flag here,
	 * but worst-case the access flag update gets lost and will be
	 * set on the next access instead.
	 */
	if (data->pte != pte)
		WRITE_ONCE(*ptep, pte);

	return 0;
}

static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
				    u64 size, kvm_pte_t attr_set,
				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
				    u32 *level)
{
	int ret;
	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
	struct stage2_attr_data data = {
		.attr_set	= attr_set & attr_mask,
		.attr_clr	= attr_clr & attr_mask,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_attr_walker,
		.arg		= &data,
		.flags		= KVM_PGTABLE_WALK_LEAF,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (ret)
		return ret;

	if (orig_pte)
		*orig_pte = data.pte;

	if (level)
		*level = data.level;
	return 0;
}

int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	return stage2_update_leaf_attrs(pgt, addr, size, 0,
					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
					NULL, NULL);
}

kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
				 &pte, NULL);
	dsb(ishst);
	return pte;
}

kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
				 &pte, NULL);
	/*
	 * "But where's the TLBI?!", you scream.
	 * "Over in the core code", I sigh.
	 *
	 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
	 */
	return pte;
}

bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
}

int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
				   enum kvm_pgtable_prot prot)
{
	int ret;
	u32 level;
	kvm_pte_t set = 0, clr = 0;

	if (prot & KVM_PGTABLE_PROT_R)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	if (prot & KVM_PGTABLE_PROT_X)
		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
	if (!ret)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
	return ret;
}

static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			       enum kvm_pgtable_walk_flags flag,
			       void * const arg)
{
	kvm_pte_t pte = *ptep;

	if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
		return 0;

	stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
	return 0;
}

int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_flush_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};

	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
		return 0;

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
{
	size_t pgd_sz;
	u64 vtcr = kvm->arch.vtcr;
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
	pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= ia_bits;
	pgt->start_level	= start_level;
	pgt->mmu		= &kvm->arch.mmu;

	/* Ensure zeroed PGD pages are visible to the hardware walker */
	dsb(ishst);
	return 0;
}

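/*
 * For example, with 4K pages, a VM configured with a 40-bit IPA space
 * has SL0 == 1 in its VTCR, so the walk starts at level 1 and, as
 * computed by kvm_pgd_pages(), the PGD spans two concatenated pages.
 */
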
static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			      enum kvm_pgtable_walk_flags flag,
			      void * const arg)
{
	kvm_pte_t pte = *ptep;

	if (!kvm_pte_valid(pte))
		return 0;

	put_page(virt_to_page(ptep));

	if (kvm_pte_table(pte, level))
		free_page((unsigned long)kvm_pte_follow(pte));

	return 0;
}

void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
	size_t pgd_sz;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
	free_pages_exact(pgt->pgd, pgd_sz);
	pgt->pgd = NULL;
}