xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 208012f0)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29 
30 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
31 							     bool shared)
32 {
33 	if (shared)
34 		lockdep_assert_held_read(&kvm->mmu_lock);
35 	else
36 		lockdep_assert_held_write(&kvm->mmu_lock);
37 }
38 
39 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
40 {
41 	if (!kvm->arch.tdp_mmu_enabled)
42 		return;
43 
44 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
45 
46 	/*
47 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 	 * can run before the VM is torn down.
49 	 */
50 	rcu_barrier();
51 }
52 
53 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
54 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
55 			  bool shared);
56 
57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
58 {
59 	free_page((unsigned long)sp->spt);
60 	kmem_cache_free(mmu_page_header_cache, sp);
61 }
62 
63 /*
64  * This is called through call_rcu in order to free TDP page table memory
65  * safely with respect to other kernel threads that may be operating on
66  * the memory.
67  * By only accessing TDP MMU page table memory in an RCU read critical
68  * section, and freeing it after a grace period, lockless access to that
69  * memory won't use it after it is freed.
70  */
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72 {
73 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74 					       rcu_head);
75 
76 	tdp_mmu_free_sp(sp);
77 }
78 
79 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
80 			  bool shared)
81 {
82 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
83 
84 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
85 
86 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
87 		return;
88 
89 	WARN_ON(!root->tdp_mmu_page);
90 
91 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92 	list_del_rcu(&root->link);
93 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94 
95 	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
96 
97 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
98 }
99 
100 /*
101  * Finds the next valid root after root (or the first valid root if root
102  * is NULL), takes a reference on it, and returns that next root. If root
103  * is not NULL, this thread should have already taken a reference on it, and
104  * that reference will be dropped. If no valid root is found, this
105  * function will return NULL.
106  */
107 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
108 					      struct kvm_mmu_page *prev_root,
109 					      bool shared)
110 {
111 	struct kvm_mmu_page *next_root;
112 
113 	rcu_read_lock();
114 
115 	if (prev_root)
116 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
117 						  &prev_root->link,
118 						  typeof(*prev_root), link);
119 	else
120 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
121 						   typeof(*next_root), link);
122 
123 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
124 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
125 				&next_root->link, typeof(*next_root), link);
126 
127 	rcu_read_unlock();
128 
129 	if (prev_root)
130 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
131 
132 	return next_root;
133 }
134 
135 /*
136  * Note: this iterator gets and puts references to the roots it iterates over.
137  * This makes it safe to release the MMU lock and yield within the loop, but
138  * if exiting the loop early, the caller must drop the reference to the most
139  * recent root. (Unless keeping a live reference is desirable.)
140  *
141  * If shared is set, this function is operating under the MMU lock in read
142  * mode. In the unlikely event that this thread must free a root, the lock
143  * will be temporarily dropped and reacquired in write mode.
144  */
145 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
146 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
147 	     _root;							\
148 	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
149 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
150 		} else
151 
152 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
153 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
154 				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
155 				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
156 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
157 		} else
158 
159 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
160 						   int level)
161 {
162 	union kvm_mmu_page_role role;
163 
164 	role = vcpu->arch.mmu->mmu_role.base;
165 	role.level = level;
166 	role.direct = true;
167 	role.gpte_is_8_bytes = true;
168 	role.access = ACC_ALL;
169 
170 	return role;
171 }
172 
173 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
174 					       int level)
175 {
176 	struct kvm_mmu_page *sp;
177 
178 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
179 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
180 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
181 
182 	sp->role.word = page_role_for_level(vcpu, level).word;
183 	sp->gfn = gfn;
184 	sp->tdp_mmu_page = true;
185 
186 	trace_kvm_mmu_get_page(sp, true);
187 
188 	return sp;
189 }
190 
191 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
192 {
193 	union kvm_mmu_page_role role;
194 	struct kvm *kvm = vcpu->kvm;
195 	struct kvm_mmu_page *root;
196 
197 	lockdep_assert_held_write(&kvm->mmu_lock);
198 
199 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
200 
201 	/* Check for an existing root before allocating a new one. */
202 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
203 		if (root->role.word == role.word &&
204 		    kvm_tdp_mmu_get_root(kvm, root))
205 			goto out;
206 	}
207 
208 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
209 	refcount_set(&root->tdp_mmu_root_count, 1);
210 
211 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
212 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
213 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
214 
215 out:
216 	return __pa(root->spt);
217 }
218 
219 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
220 				u64 old_spte, u64 new_spte, int level,
221 				bool shared);
222 
223 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
224 {
225 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
226 		return;
227 
228 	if (is_accessed_spte(old_spte) &&
229 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
230 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
231 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
232 }
233 
234 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
235 					  u64 old_spte, u64 new_spte, int level)
236 {
237 	bool pfn_changed;
238 	struct kvm_memory_slot *slot;
239 
240 	if (level > PG_LEVEL_4K)
241 		return;
242 
243 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
244 
245 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
246 	    is_writable_pte(new_spte)) {
247 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
248 		mark_page_dirty_in_slot(kvm, slot, gfn);
249 	}
250 }
251 
252 /**
253  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
254  *
255  * @kvm: kvm instance
256  * @sp: the new page
257  * @shared: This operation may not be running under the exclusive use of
258  *	    the MMU lock and the operation must synchronize with other
259  *	    threads that might be adding or removing pages.
260  * @account_nx: This page replaces a NX large page and should be marked for
261  *		eventual reclaim.
262  */
263 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
264 			      bool shared, bool account_nx)
265 {
266 	if (shared)
267 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
268 	else
269 		lockdep_assert_held_write(&kvm->mmu_lock);
270 
271 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
272 	if (account_nx)
273 		account_huge_nx_page(kvm, sp);
274 
275 	if (shared)
276 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
277 }
278 
279 /**
280  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
281  *
282  * @kvm: kvm instance
283  * @sp: the page to be removed
284  * @shared: This operation may not be running under the exclusive use of
285  *	    the MMU lock and the operation must synchronize with other
286  *	    threads that might be adding or removing pages.
287  */
288 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
289 				bool shared)
290 {
291 	if (shared)
292 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
293 	else
294 		lockdep_assert_held_write(&kvm->mmu_lock);
295 
296 	list_del(&sp->link);
297 	if (sp->lpage_disallowed)
298 		unaccount_huge_nx_page(kvm, sp);
299 
300 	if (shared)
301 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
302 }
303 
304 /**
305  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
306  *
307  * @kvm: kvm instance
308  * @pt: the page removed from the paging structure
309  * @shared: This operation may not be running under the exclusive use
310  *	    of the MMU lock and the operation must synchronize with other
311  *	    threads that might be modifying SPTEs.
312  *
313  * Given a page table that has been removed from the TDP paging structure,
314  * iterates through the page table to clear SPTEs and free child page tables.
315  *
316  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
317  * protection. Since this thread removed it from the paging structure,
318  * this thread will be responsible for ensuring the page is freed. Hence the
319  * early rcu_dereferences in the function.
320  */
321 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
322 					bool shared)
323 {
324 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
325 	int level = sp->role.level;
326 	gfn_t base_gfn = sp->gfn;
327 	u64 old_child_spte;
328 	u64 *sptep;
329 	gfn_t gfn;
330 	int i;
331 
332 	trace_kvm_mmu_prepare_zap_page(sp);
333 
334 	tdp_mmu_unlink_page(kvm, sp, shared);
335 
336 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
337 		sptep = rcu_dereference(pt) + i;
338 		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
339 
340 		if (shared) {
341 			/*
342 			 * Set the SPTE to a nonpresent value that other
343 			 * threads will not overwrite. If the SPTE was
344 			 * already marked as removed then another thread
345 			 * handling a page fault could overwrite it, so
346 			 * set the SPTE until it is set from some other
347 			 * value to the removed SPTE value.
348 			 */
349 			for (;;) {
350 				old_child_spte = xchg(sptep, REMOVED_SPTE);
351 				if (!is_removed_spte(old_child_spte))
352 					break;
353 				cpu_relax();
354 			}
355 		} else {
356 			/*
357 			 * If the SPTE is not MMU-present, there is no backing
358 			 * page associated with the SPTE and so no side effects
359 			 * that need to be recorded, and exclusive ownership of
360 			 * mmu_lock ensures the SPTE can't be made present.
361 			 * Note, zapping MMIO SPTEs is also unnecessary as they
362 			 * are guarded by the memslots generation, not by being
363 			 * unreachable.
364 			 */
365 			old_child_spte = READ_ONCE(*sptep);
366 			if (!is_shadow_present_pte(old_child_spte))
367 				continue;
368 
369 			/*
370 			 * Marking the SPTE as a removed SPTE is not
371 			 * strictly necessary here as the MMU lock will
372 			 * stop other threads from concurrently modifying
373 			 * this SPTE. Using the removed SPTE value keeps
374 			 * the two branches consistent and simplifies
375 			 * the function.
376 			 */
377 			WRITE_ONCE(*sptep, REMOVED_SPTE);
378 		}
379 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
380 				    old_child_spte, REMOVED_SPTE, level - 1,
381 				    shared);
382 	}
383 
384 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
385 					   KVM_PAGES_PER_HPAGE(level));
386 
387 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
388 }
389 
390 /**
391  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
392  * @kvm: kvm instance
393  * @as_id: the address space of the paging structure the SPTE was a part of
394  * @gfn: the base GFN that was mapped by the SPTE
395  * @old_spte: The value of the SPTE before the change
396  * @new_spte: The value of the SPTE after the change
397  * @level: the level of the PT the SPTE is part of in the paging structure
398  * @shared: This operation may not be running under the exclusive use of
399  *	    the MMU lock and the operation must synchronize with other
400  *	    threads that might be modifying SPTEs.
401  *
402  * Handle bookkeeping that might result from the modification of a SPTE.
403  * This function must be called for all TDP SPTE modifications.
404  */
405 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
406 				  u64 old_spte, u64 new_spte, int level,
407 				  bool shared)
408 {
409 	bool was_present = is_shadow_present_pte(old_spte);
410 	bool is_present = is_shadow_present_pte(new_spte);
411 	bool was_leaf = was_present && is_last_spte(old_spte, level);
412 	bool is_leaf = is_present && is_last_spte(new_spte, level);
413 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
414 
415 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
416 	WARN_ON(level < PG_LEVEL_4K);
417 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
418 
419 	/*
420 	 * If this warning were to trigger it would indicate that there was a
421 	 * missing MMU notifier or a race with some notifier handler.
422 	 * A present, leaf SPTE should never be directly replaced with another
423 	 * present leaf SPTE pointing to a different PFN. A notifier handler
424 	 * should be zapping the SPTE before the main MM's page table is
425 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
426 	 * thread before replacement.
427 	 */
428 	if (was_leaf && is_leaf && pfn_changed) {
429 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
430 		       "SPTE with another present leaf SPTE mapping a\n"
431 		       "different PFN!\n"
432 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
433 		       as_id, gfn, old_spte, new_spte, level);
434 
435 		/*
436 		 * Crash the host to prevent error propagation and guest data
437 		 * corruption.
438 		 */
439 		BUG();
440 	}
441 
442 	if (old_spte == new_spte)
443 		return;
444 
445 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
446 
447 	if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
448 		if (is_large_pte(old_spte))
449 			atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
450 		else
451 			atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
452 	}
453 
454 	/*
455 	 * The only times a SPTE should be changed from a non-present to
456 	 * non-present state is when an MMIO entry is installed/modified/
457 	 * removed. In that case, there is nothing to do here.
458 	 */
459 	if (!was_present && !is_present) {
460 		/*
461 		 * If this change does not involve a MMIO SPTE or removed SPTE,
462 		 * it is unexpected. Log the change, though it should not
463 		 * impact the guest since both the former and current SPTEs
464 		 * are nonpresent.
465 		 */
466 		if (WARN_ON(!is_mmio_spte(old_spte) &&
467 			    !is_mmio_spte(new_spte) &&
468 			    !is_removed_spte(new_spte)))
469 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
470 			       "should not be replaced with another,\n"
471 			       "different nonpresent SPTE, unless one or both\n"
472 			       "are MMIO SPTEs, or the new SPTE is\n"
473 			       "a temporary removed SPTE.\n"
474 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
475 			       as_id, gfn, old_spte, new_spte, level);
476 		return;
477 	}
478 
479 
480 	if (was_leaf && is_dirty_spte(old_spte) &&
481 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
482 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
483 
484 	/*
485 	 * Recursively handle child PTs if the change removed a subtree from
486 	 * the paging structure.
487 	 */
488 	if (was_present && !was_leaf && (pfn_changed || !is_present))
489 		handle_removed_tdp_mmu_page(kvm,
490 				spte_to_child_pt(old_spte, level), shared);
491 }
492 
493 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
494 				u64 old_spte, u64 new_spte, int level,
495 				bool shared)
496 {
497 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
498 			      shared);
499 	handle_changed_spte_acc_track(old_spte, new_spte, level);
500 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
501 				      new_spte, level);
502 }
503 
504 /*
505  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
506  * and handle the associated bookkeeping, but do not mark the page dirty
507  * in KVM's dirty bitmaps.
508  *
509  * @kvm: kvm instance
510  * @iter: a tdp_iter instance currently on the SPTE that should be set
511  * @new_spte: The value the SPTE should be set to
512  * Returns: true if the SPTE was set, false if it was not. If false is returned,
513  *	    this function will have no side-effects.
514  */
515 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
516 							struct tdp_iter *iter,
517 							u64 new_spte)
518 {
519 	lockdep_assert_held_read(&kvm->mmu_lock);
520 
521 	/*
522 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
523 	 * may modify it.
524 	 */
525 	if (is_removed_spte(iter->old_spte))
526 		return false;
527 
528 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
529 		      new_spte) != iter->old_spte)
530 		return false;
531 
532 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
533 			      new_spte, iter->level, true);
534 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
535 
536 	return true;
537 }
538 
539 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
540 					   struct tdp_iter *iter,
541 					   u64 new_spte)
542 {
543 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
544 		return false;
545 
546 	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
547 				      iter->old_spte, new_spte, iter->level);
548 	return true;
549 }
550 
551 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
552 					   struct tdp_iter *iter)
553 {
554 	/*
555 	 * Freeze the SPTE by setting it to a special,
556 	 * non-present value. This will stop other threads from
557 	 * immediately installing a present entry in its place
558 	 * before the TLBs are flushed.
559 	 */
560 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
561 		return false;
562 
563 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
564 					   KVM_PAGES_PER_HPAGE(iter->level));
565 
566 	/*
567 	 * No other thread can overwrite the removed SPTE as they
568 	 * must either wait on the MMU lock or use
569 	 * tdp_mmu_set_spte_atomic which will not overwrite the
570 	 * special removed SPTE value. No bookkeeping is needed
571 	 * here since the SPTE is going from non-present
572 	 * to non-present.
573 	 */
574 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
575 
576 	return true;
577 }
578 
579 
580 /*
581  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
582  * @kvm: kvm instance
583  * @iter: a tdp_iter instance currently on the SPTE that should be set
584  * @new_spte: The value the SPTE should be set to
585  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
586  *		      of the page. Should be set unless handling an MMU
587  *		      notifier for access tracking. Leaving record_acc_track
588  *		      unset in that case prevents page accesses from being
589  *		      double counted.
590  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
591  *		      appropriate for the change being made. Should be set
592  *		      unless performing certain dirty logging operations.
593  *		      Leaving record_dirty_log unset in that case prevents page
594  *		      writes from being double counted.
595  */
596 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
597 				      u64 new_spte, bool record_acc_track,
598 				      bool record_dirty_log)
599 {
600 	lockdep_assert_held_write(&kvm->mmu_lock);
601 
602 	/*
603 	 * No thread should be using this function to set SPTEs to the
604 	 * temporary removed SPTE value.
605 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
606 	 * should be used. If operating under the MMU lock in write mode, the
607 	 * use of the removed SPTE should not be necessary.
608 	 */
609 	WARN_ON(is_removed_spte(iter->old_spte));
610 
611 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
612 
613 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
614 			      new_spte, iter->level, false);
615 	if (record_acc_track)
616 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
617 					      iter->level);
618 	if (record_dirty_log)
619 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
620 					      iter->old_spte, new_spte,
621 					      iter->level);
622 }
623 
624 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
625 				    u64 new_spte)
626 {
627 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
628 }
629 
630 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
631 						 struct tdp_iter *iter,
632 						 u64 new_spte)
633 {
634 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
635 }
636 
637 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
638 						 struct tdp_iter *iter,
639 						 u64 new_spte)
640 {
641 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
642 }
643 
644 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
645 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
646 
647 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
648 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
649 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
650 		    !is_last_spte(_iter.old_spte, _iter.level))		\
651 			continue;					\
652 		else
653 
654 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
655 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
656 			 _mmu->shadow_root_level, _start, _end)
657 
658 /*
659  * Yield if the MMU lock is contended or this thread needs to return control
660  * to the scheduler.
661  *
662  * If this function should yield and flush is set, it will perform a remote
663  * TLB flush before yielding.
664  *
665  * If this function yields, it will also reset the tdp_iter's walk over the
666  * paging structure and the calling function should skip to the next
667  * iteration to allow the iterator to continue its traversal from the
668  * paging structure root.
669  *
670  * Return true if this function yielded and the iterator's traversal was reset.
671  * Return false if a yield was not needed.
672  */
673 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
674 					     struct tdp_iter *iter, bool flush,
675 					     bool shared)
676 {
677 	/* Ensure forward progress has been made before yielding. */
678 	if (iter->next_last_level_gfn == iter->yielded_gfn)
679 		return false;
680 
681 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
682 		rcu_read_unlock();
683 
684 		if (flush)
685 			kvm_flush_remote_tlbs(kvm);
686 
687 		if (shared)
688 			cond_resched_rwlock_read(&kvm->mmu_lock);
689 		else
690 			cond_resched_rwlock_write(&kvm->mmu_lock);
691 
692 		rcu_read_lock();
693 
694 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
695 
696 		tdp_iter_restart(iter);
697 
698 		return true;
699 	}
700 
701 	return false;
702 }
703 
704 /*
705  * Tears down the mappings for the range of gfns, [start, end), and frees the
706  * non-root pages mapping GFNs strictly within that range. Returns true if
707  * SPTEs have been cleared and a TLB flush is needed before releasing the
708  * MMU lock.
709  *
710  * If can_yield is true, will release the MMU lock and reschedule if the
711  * scheduler needs the CPU or there is contention on the MMU lock. If this
712  * function cannot yield, it will not release the MMU lock or reschedule and
713  * the caller must ensure it does not supply too large a GFN range, or the
714  * operation can cause a soft lockup.
715  *
716  * If shared is true, this thread holds the MMU lock in read mode and must
717  * account for the possibility that other threads are modifying the paging
718  * structures concurrently. If shared is false, this thread should hold the
719  * MMU lock in write mode.
720  */
721 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
722 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
723 			  bool shared)
724 {
725 	struct tdp_iter iter;
726 
727 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
728 
729 	rcu_read_lock();
730 
731 	tdp_root_for_each_pte(iter, root, start, end) {
732 retry:
733 		if (can_yield &&
734 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
735 			flush = false;
736 			continue;
737 		}
738 
739 		if (!is_shadow_present_pte(iter.old_spte))
740 			continue;
741 
742 		/*
743 		 * If this is a non-last-level SPTE that covers a larger range
744 		 * than should be zapped, continue, and zap the mappings at a
745 		 * lower level.
746 		 */
747 		if ((iter.gfn < start ||
748 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
749 		    !is_last_spte(iter.old_spte, iter.level))
750 			continue;
751 
752 		if (!shared) {
753 			tdp_mmu_set_spte(kvm, &iter, 0);
754 			flush = true;
755 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
756 			/*
757 			 * The iter must explicitly re-read the SPTE because
758 			 * the atomic cmpxchg failed.
759 			 */
760 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
761 			goto retry;
762 		}
763 	}
764 
765 	rcu_read_unlock();
766 	return flush;
767 }
768 
769 /*
770  * Tears down the mappings for the range of gfns, [start, end), and frees the
771  * non-root pages mapping GFNs strictly within that range. Returns true if
772  * SPTEs have been cleared and a TLB flush is needed before releasing the
773  * MMU lock.
774  *
775  * If shared is true, this thread holds the MMU lock in read mode and must
776  * account for the possibility that other threads are modifying the paging
777  * structures concurrently. If shared is false, this thread should hold the
778  * MMU in write mode.
779  */
780 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
781 				 gfn_t end, bool can_yield, bool flush,
782 				 bool shared)
783 {
784 	struct kvm_mmu_page *root;
785 
786 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
787 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
788 				      shared);
789 
790 	return flush;
791 }
792 
793 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
794 {
795 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
796 	bool flush = false;
797 	int i;
798 
799 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
800 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
801 						  flush, false);
802 
803 	if (flush)
804 		kvm_flush_remote_tlbs(kvm);
805 }
806 
807 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
808 						  struct kvm_mmu_page *prev_root)
809 {
810 	struct kvm_mmu_page *next_root;
811 
812 	if (prev_root)
813 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
814 						  &prev_root->link,
815 						  typeof(*prev_root), link);
816 	else
817 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
818 						   typeof(*next_root), link);
819 
820 	while (next_root && !(next_root->role.invalid &&
821 			      refcount_read(&next_root->tdp_mmu_root_count)))
822 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
823 						  &next_root->link,
824 						  typeof(*next_root), link);
825 
826 	return next_root;
827 }
828 
829 /*
830  * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
831  * invalidated root, they will not be freed until this function drops the
832  * reference. Before dropping that reference, tear down the paging
833  * structure so that whichever thread does drop the last reference
834  * only has to do a trivial amount of work. Since the roots are invalid,
835  * no new SPTEs should be created under them.
836  */
837 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
838 {
839 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
840 	struct kvm_mmu_page *next_root;
841 	struct kvm_mmu_page *root;
842 	bool flush = false;
843 
844 	lockdep_assert_held_read(&kvm->mmu_lock);
845 
846 	rcu_read_lock();
847 
848 	root = next_invalidated_root(kvm, NULL);
849 
850 	while (root) {
851 		next_root = next_invalidated_root(kvm, root);
852 
853 		rcu_read_unlock();
854 
855 		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
856 				      true);
857 
858 		/*
859 		 * Put the reference acquired in
860 		 * kvm_tdp_mmu_invalidate_roots
861 		 */
862 		kvm_tdp_mmu_put_root(kvm, root, true);
863 
864 		root = next_root;
865 
866 		rcu_read_lock();
867 	}
868 
869 	rcu_read_unlock();
870 
871 	if (flush)
872 		kvm_flush_remote_tlbs(kvm);
873 }
874 
875 /*
876  * Mark each TDP MMU root as invalid so that other threads
877  * will drop their references and allow the root count to
878  * go to 0.
879  *
880  * Also take a reference on all roots so that this thread
881  * can do the bulk of the work required to free the roots
882  * once they are invalidated. Without this reference, a
883  * vCPU thread might drop the last reference to a root and
884  * get stuck with tearing down the entire paging structure.
885  *
886  * Roots which have a zero refcount should be skipped as
887  * they're already being torn down.
888  * Already invalid roots should be referenced again so that
889  * they aren't freed before kvm_tdp_mmu_zap_all_fast is
890  * done with them.
891  *
892  * This has essentially the same effect for the TDP MMU
893  * as updating mmu_valid_gen does for the shadow MMU.
894  */
895 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
896 {
897 	struct kvm_mmu_page *root;
898 
899 	lockdep_assert_held_write(&kvm->mmu_lock);
900 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
901 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
902 			root->role.invalid = true;
903 }
904 
905 /*
906  * Installs a last-level SPTE to handle a TDP page fault.
907  * (NPT/EPT violation/misconfiguration)
908  */
909 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
910 					  int map_writable,
911 					  struct tdp_iter *iter,
912 					  kvm_pfn_t pfn, bool prefault)
913 {
914 	u64 new_spte;
915 	int ret = 0;
916 	int make_spte_ret = 0;
917 
918 	if (unlikely(is_noslot_pfn(pfn)))
919 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
920 	else
921 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
922 					 pfn, iter->old_spte, prefault, true,
923 					 map_writable, !shadow_accessed_mask,
924 					 &new_spte);
925 
926 	if (new_spte == iter->old_spte)
927 		ret = RET_PF_SPURIOUS;
928 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
929 		return RET_PF_RETRY;
930 
931 	/*
932 	 * If the page fault was caused by a write but the page is write
933 	 * protected, emulation is needed. If the emulation was skipped,
934 	 * the vCPU would have the same fault again.
935 	 */
936 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
937 		if (write)
938 			ret = RET_PF_EMULATE;
939 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
940 	}
941 
942 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
943 	if (unlikely(is_mmio_spte(new_spte))) {
944 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
945 				     new_spte);
946 		ret = RET_PF_EMULATE;
947 	} else {
948 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
949 				       rcu_dereference(iter->sptep));
950 	}
951 
952 	if (!prefault)
953 		vcpu->stat.pf_fixed++;
954 
955 	return ret;
956 }
957 
958 /*
959  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
960  * page tables and SPTEs to translate the faulting guest physical address.
961  */
962 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
963 		    int map_writable, int max_level, kvm_pfn_t pfn,
964 		    bool prefault)
965 {
966 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
967 	bool write = error_code & PFERR_WRITE_MASK;
968 	bool exec = error_code & PFERR_FETCH_MASK;
969 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
970 	struct kvm_mmu *mmu = vcpu->arch.mmu;
971 	struct tdp_iter iter;
972 	struct kvm_mmu_page *sp;
973 	u64 *child_pt;
974 	u64 new_spte;
975 	int ret;
976 	gfn_t gfn = gpa >> PAGE_SHIFT;
977 	int level;
978 	int req_level;
979 
980 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
981 		return RET_PF_RETRY;
982 	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
983 		return RET_PF_RETRY;
984 
985 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
986 					huge_page_disallowed, &req_level);
987 
988 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
989 
990 	rcu_read_lock();
991 
992 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
993 		if (nx_huge_page_workaround_enabled)
994 			disallowed_hugepage_adjust(iter.old_spte, gfn,
995 						   iter.level, &pfn, &level);
996 
997 		if (iter.level == level)
998 			break;
999 
1000 		/*
1001 		 * If there is an SPTE mapping a large page at a higher level
1002 		 * than the target, that SPTE must be cleared and replaced
1003 		 * with a non-leaf SPTE.
1004 		 */
1005 		if (is_shadow_present_pte(iter.old_spte) &&
1006 		    is_large_pte(iter.old_spte)) {
1007 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1008 				break;
1009 
1010 			/*
1011 			 * The iter must explicitly re-read the spte here
1012 			 * because the new value informs the !present
1013 			 * path below.
1014 			 */
1015 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1016 		}
1017 
1018 		if (!is_shadow_present_pte(iter.old_spte)) {
1019 			/*
1020 			 * If SPTE has been forzen by another thread, just
1021 			 * give up and retry, avoiding unnecessary page table
1022 			 * allocation and free.
1023 			 */
1024 			if (is_removed_spte(iter.old_spte))
1025 				break;
1026 
1027 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
1028 			child_pt = sp->spt;
1029 
1030 			new_spte = make_nonleaf_spte(child_pt,
1031 						     !shadow_accessed_mask);
1032 
1033 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
1034 						    new_spte)) {
1035 				tdp_mmu_link_page(vcpu->kvm, sp, true,
1036 						  huge_page_disallowed &&
1037 						  req_level >= iter.level);
1038 
1039 				trace_kvm_mmu_get_page(sp, true);
1040 			} else {
1041 				tdp_mmu_free_sp(sp);
1042 				break;
1043 			}
1044 		}
1045 	}
1046 
1047 	if (iter.level != level) {
1048 		rcu_read_unlock();
1049 		return RET_PF_RETRY;
1050 	}
1051 
1052 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1053 					      pfn, prefault);
1054 	rcu_read_unlock();
1055 
1056 	return ret;
1057 }
1058 
1059 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1060 				 bool flush)
1061 {
1062 	struct kvm_mmu_page *root;
1063 
1064 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1065 		flush |= zap_gfn_range(kvm, root, range->start, range->end,
1066 				       range->may_block, flush, false);
1067 
1068 	return flush;
1069 }
1070 
1071 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1072 			      struct kvm_gfn_range *range);
1073 
1074 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1075 						   struct kvm_gfn_range *range,
1076 						   tdp_handler_t handler)
1077 {
1078 	struct kvm_mmu_page *root;
1079 	struct tdp_iter iter;
1080 	bool ret = false;
1081 
1082 	rcu_read_lock();
1083 
1084 	/*
1085 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1086 	 * into this helper allow blocking; it'd be dead, wasteful code.
1087 	 */
1088 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1089 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1090 			ret |= handler(kvm, &iter, range);
1091 	}
1092 
1093 	rcu_read_unlock();
1094 
1095 	return ret;
1096 }
1097 
1098 /*
1099  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1100  * if any of the GFNs in the range have been accessed.
1101  */
1102 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1103 			  struct kvm_gfn_range *range)
1104 {
1105 	u64 new_spte = 0;
1106 
1107 	/* If we have a non-accessed entry we don't need to change the pte. */
1108 	if (!is_accessed_spte(iter->old_spte))
1109 		return false;
1110 
1111 	new_spte = iter->old_spte;
1112 
1113 	if (spte_ad_enabled(new_spte)) {
1114 		new_spte &= ~shadow_accessed_mask;
1115 	} else {
1116 		/*
1117 		 * Capture the dirty status of the page, so that it doesn't get
1118 		 * lost when the SPTE is marked for access tracking.
1119 		 */
1120 		if (is_writable_pte(new_spte))
1121 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1122 
1123 		new_spte = mark_spte_for_access_track(new_spte);
1124 	}
1125 
1126 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1127 
1128 	return true;
1129 }
1130 
1131 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1132 {
1133 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1134 }
1135 
1136 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1137 			 struct kvm_gfn_range *range)
1138 {
1139 	return is_accessed_spte(iter->old_spte);
1140 }
1141 
1142 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1143 {
1144 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1145 }
1146 
1147 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1148 			 struct kvm_gfn_range *range)
1149 {
1150 	u64 new_spte;
1151 
1152 	/* Huge pages aren't expected to be modified without first being zapped. */
1153 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1154 
1155 	if (iter->level != PG_LEVEL_4K ||
1156 	    !is_shadow_present_pte(iter->old_spte))
1157 		return false;
1158 
1159 	/*
1160 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1161 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1162 	 * invariant that the PFN of a present * leaf SPTE can never change.
1163 	 * See __handle_changed_spte().
1164 	 */
1165 	tdp_mmu_set_spte(kvm, iter, 0);
1166 
1167 	if (!pte_write(range->pte)) {
1168 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1169 								  pte_pfn(range->pte));
1170 
1171 		tdp_mmu_set_spte(kvm, iter, new_spte);
1172 	}
1173 
1174 	return true;
1175 }
1176 
1177 /*
1178  * Handle the changed_pte MMU notifier for the TDP MMU.
1179  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1180  * notifier.
1181  * Returns non-zero if a flush is needed before releasing the MMU lock.
1182  */
1183 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1184 {
1185 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1186 
1187 	/* FIXME: return 'flush' instead of flushing here. */
1188 	if (flush)
1189 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1190 
1191 	return false;
1192 }
1193 
1194 /*
1195  * Remove write access from all the SPTEs mapping GFNs [start, end). If
1196  * skip_4k is set, SPTEs that map 4k pages, will not be write-protected.
1197  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1198  */
1199 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1200 			     gfn_t start, gfn_t end, int min_level)
1201 {
1202 	struct tdp_iter iter;
1203 	u64 new_spte;
1204 	bool spte_set = false;
1205 
1206 	rcu_read_lock();
1207 
1208 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1209 
1210 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1211 				   min_level, start, end) {
1212 retry:
1213 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1214 			continue;
1215 
1216 		if (!is_shadow_present_pte(iter.old_spte) ||
1217 		    !is_last_spte(iter.old_spte, iter.level) ||
1218 		    !(iter.old_spte & PT_WRITABLE_MASK))
1219 			continue;
1220 
1221 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1222 
1223 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1224 							  new_spte)) {
1225 			/*
1226 			 * The iter must explicitly re-read the SPTE because
1227 			 * the atomic cmpxchg failed.
1228 			 */
1229 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1230 			goto retry;
1231 		}
1232 		spte_set = true;
1233 	}
1234 
1235 	rcu_read_unlock();
1236 	return spte_set;
1237 }
1238 
1239 /*
1240  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1241  * only affect leaf SPTEs down to min_level.
1242  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1243  */
1244 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1245 			     int min_level)
1246 {
1247 	struct kvm_mmu_page *root;
1248 	bool spte_set = false;
1249 
1250 	lockdep_assert_held_read(&kvm->mmu_lock);
1251 
1252 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1253 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1254 			     slot->base_gfn + slot->npages, min_level);
1255 
1256 	return spte_set;
1257 }
1258 
1259 /*
1260  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1261  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1262  * If AD bits are not enabled, this will require clearing the writable bit on
1263  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1264  * be flushed.
1265  */
1266 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1267 			   gfn_t start, gfn_t end)
1268 {
1269 	struct tdp_iter iter;
1270 	u64 new_spte;
1271 	bool spte_set = false;
1272 
1273 	rcu_read_lock();
1274 
1275 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1276 retry:
1277 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1278 			continue;
1279 
1280 		if (spte_ad_need_write_protect(iter.old_spte)) {
1281 			if (is_writable_pte(iter.old_spte))
1282 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1283 			else
1284 				continue;
1285 		} else {
1286 			if (iter.old_spte & shadow_dirty_mask)
1287 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1288 			else
1289 				continue;
1290 		}
1291 
1292 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1293 							  new_spte)) {
1294 			/*
1295 			 * The iter must explicitly re-read the SPTE because
1296 			 * the atomic cmpxchg failed.
1297 			 */
1298 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1299 			goto retry;
1300 		}
1301 		spte_set = true;
1302 	}
1303 
1304 	rcu_read_unlock();
1305 	return spte_set;
1306 }
1307 
1308 /*
1309  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1310  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1311  * If AD bits are not enabled, this will require clearing the writable bit on
1312  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1313  * be flushed.
1314  */
1315 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1316 {
1317 	struct kvm_mmu_page *root;
1318 	bool spte_set = false;
1319 
1320 	lockdep_assert_held_read(&kvm->mmu_lock);
1321 
1322 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1323 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1324 				slot->base_gfn + slot->npages);
1325 
1326 	return spte_set;
1327 }
1328 
1329 /*
1330  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1331  * set in mask, starting at gfn. The given memslot is expected to contain all
1332  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1333  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1334  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1335  */
1336 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1337 				  gfn_t gfn, unsigned long mask, bool wrprot)
1338 {
1339 	struct tdp_iter iter;
1340 	u64 new_spte;
1341 
1342 	rcu_read_lock();
1343 
1344 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1345 				    gfn + BITS_PER_LONG) {
1346 		if (!mask)
1347 			break;
1348 
1349 		if (iter.level > PG_LEVEL_4K ||
1350 		    !(mask & (1UL << (iter.gfn - gfn))))
1351 			continue;
1352 
1353 		mask &= ~(1UL << (iter.gfn - gfn));
1354 
1355 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1356 			if (is_writable_pte(iter.old_spte))
1357 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1358 			else
1359 				continue;
1360 		} else {
1361 			if (iter.old_spte & shadow_dirty_mask)
1362 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1363 			else
1364 				continue;
1365 		}
1366 
1367 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1368 	}
1369 
1370 	rcu_read_unlock();
1371 }
1372 
1373 /*
1374  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1375  * set in mask, starting at gfn. The given memslot is expected to contain all
1376  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1377  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1378  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1379  */
1380 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1381 				       struct kvm_memory_slot *slot,
1382 				       gfn_t gfn, unsigned long mask,
1383 				       bool wrprot)
1384 {
1385 	struct kvm_mmu_page *root;
1386 
1387 	lockdep_assert_held_write(&kvm->mmu_lock);
1388 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1389 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1390 }
1391 
1392 /*
1393  * Clear leaf entries which could be replaced by large mappings, for
1394  * GFNs within the slot.
1395  */
1396 static bool zap_collapsible_spte_range(struct kvm *kvm,
1397 				       struct kvm_mmu_page *root,
1398 				       const struct kvm_memory_slot *slot,
1399 				       bool flush)
1400 {
1401 	gfn_t start = slot->base_gfn;
1402 	gfn_t end = start + slot->npages;
1403 	struct tdp_iter iter;
1404 	kvm_pfn_t pfn;
1405 
1406 	rcu_read_lock();
1407 
1408 	tdp_root_for_each_pte(iter, root, start, end) {
1409 retry:
1410 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1411 			flush = false;
1412 			continue;
1413 		}
1414 
1415 		if (!is_shadow_present_pte(iter.old_spte) ||
1416 		    !is_last_spte(iter.old_spte, iter.level))
1417 			continue;
1418 
1419 		pfn = spte_to_pfn(iter.old_spte);
1420 		if (kvm_is_reserved_pfn(pfn) ||
1421 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1422 							    pfn, PG_LEVEL_NUM))
1423 			continue;
1424 
1425 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1426 			/*
1427 			 * The iter must explicitly re-read the SPTE because
1428 			 * the atomic cmpxchg failed.
1429 			 */
1430 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1431 			goto retry;
1432 		}
1433 		flush = true;
1434 	}
1435 
1436 	rcu_read_unlock();
1437 
1438 	return flush;
1439 }
1440 
1441 /*
1442  * Clear non-leaf entries (and free associated page tables) which could
1443  * be replaced by large mappings, for GFNs within the slot.
1444  */
1445 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1446 				       const struct kvm_memory_slot *slot,
1447 				       bool flush)
1448 {
1449 	struct kvm_mmu_page *root;
1450 
1451 	lockdep_assert_held_read(&kvm->mmu_lock);
1452 
1453 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1454 		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1455 
1456 	return flush;
1457 }
1458 
1459 /*
1460  * Removes write access on the last level SPTE mapping this GFN and unsets the
1461  * MMU-writable bit to ensure future writes continue to be intercepted.
1462  * Returns true if an SPTE was set and a TLB flush is needed.
1463  */
1464 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1465 			      gfn_t gfn)
1466 {
1467 	struct tdp_iter iter;
1468 	u64 new_spte;
1469 	bool spte_set = false;
1470 
1471 	rcu_read_lock();
1472 
1473 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1474 		if (!is_writable_pte(iter.old_spte))
1475 			break;
1476 
1477 		new_spte = iter.old_spte &
1478 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1479 
1480 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1481 		spte_set = true;
1482 	}
1483 
1484 	rcu_read_unlock();
1485 
1486 	return spte_set;
1487 }
1488 
1489 /*
1490  * Removes write access on the last level SPTE mapping this GFN and unsets the
1491  * MMU-writable bit to ensure future writes continue to be intercepted.
1492  * Returns true if an SPTE was set and a TLB flush is needed.
1493  */
1494 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1495 				   struct kvm_memory_slot *slot, gfn_t gfn)
1496 {
1497 	struct kvm_mmu_page *root;
1498 	bool spte_set = false;
1499 
1500 	lockdep_assert_held_write(&kvm->mmu_lock);
1501 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1502 		spte_set |= write_protect_gfn(kvm, root, gfn);
1503 
1504 	return spte_set;
1505 }
1506 
1507 /*
1508  * Return the level of the lowest level SPTE added to sptes.
1509  * That SPTE may be non-present.
1510  */
1511 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1512 			 int *root_level)
1513 {
1514 	struct tdp_iter iter;
1515 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1516 	gfn_t gfn = addr >> PAGE_SHIFT;
1517 	int leaf = -1;
1518 
1519 	*root_level = vcpu->arch.mmu->shadow_root_level;
1520 
1521 	rcu_read_lock();
1522 
1523 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1524 		leaf = iter.level;
1525 		sptes[leaf] = iter.old_spte;
1526 	}
1527 
1528 	rcu_read_unlock();
1529 
1530 	return leaf;
1531 }
1532