xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 6005a8e9)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29 
30 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
31 							     bool shared)
32 {
33 	if (shared)
34 		lockdep_assert_held_read(&kvm->mmu_lock);
35 	else
36 		lockdep_assert_held_write(&kvm->mmu_lock);
37 }
38 
39 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
40 {
41 	if (!kvm->arch.tdp_mmu_enabled)
42 		return;
43 
44 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
45 
46 	/*
47 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 	 * can run before the VM is torn down.
49 	 */
50 	rcu_barrier();
51 }
52 
53 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
54 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
55 			  bool shared);
56 
57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
58 {
59 	free_page((unsigned long)sp->spt);
60 	kmem_cache_free(mmu_page_header_cache, sp);
61 }
62 
63 /*
64  * This is called through call_rcu in order to free TDP page table memory
65  * safely with respect to other kernel threads that may be operating on
66  * the memory.
67  * Because TDP MMU page table memory is only accessed inside RCU read-side
68  * critical sections and freed after a grace period, lockless walkers are
69  * guaranteed not to touch the memory after it has been freed.
70  */
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72 {
73 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74 					       rcu_head);
75 
76 	tdp_mmu_free_sp(sp);
77 }
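/*
 * Illustrative sketch (annotation, not original code): the read side that the
 * call_rcu() above pairs with.  Walkers in this file hold rcu_read_lock() for
 * the duration of a page table walk, e.g.:
 *
 *	rcu_read_lock();
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		(iter.old_spte and the child tables it points to cannot be
 *		 freed until the matching rcu_read_unlock())
 *	}
 *	rcu_read_unlock();
 *
 * See zap_gfn_range() below for a real instance of this pattern.
 */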
78 
79 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
80 			  bool shared)
81 {
82 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
83 
84 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
85 
86 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
87 		return;
88 
89 	WARN_ON(!root->tdp_mmu_page);
90 
91 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92 	list_del_rcu(&root->link);
93 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94 
95 	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
96 
97 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
98 }
99 
100 /*
101  * Finds the next valid root after prev_root (or the first valid root if
102  * prev_root is NULL), takes a reference on it, and returns that next root.
103  * If prev_root is not NULL, this thread should have already taken a
104  * reference on it, and that reference will be dropped. If no valid root is
105  * found, this function will return NULL.
106  */
107 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
108 					      struct kvm_mmu_page *prev_root,
109 					      bool shared)
110 {
111 	struct kvm_mmu_page *next_root;
112 
113 	rcu_read_lock();
114 
115 	if (prev_root)
116 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
117 						  &prev_root->link,
118 						  typeof(*prev_root), link);
119 	else
120 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
121 						   typeof(*next_root), link);
122 
123 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
124 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
125 				&next_root->link, typeof(*next_root), link);
126 
127 	rcu_read_unlock();
128 
129 	if (prev_root)
130 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
131 
132 	return next_root;
133 }
134 
135 /*
136  * Note: this iterator gets and puts references to the roots it iterates over.
137  * This makes it safe to release the MMU lock and yield within the loop, but
138  * if exiting the loop early, the caller must drop the reference to the most
139  * recent root. (Unless keeping a live reference is desirable.)
140  *
141  * If shared is set, this function is operating under the MMU lock in read
142  * mode. In the unlikely event that this thread must free a root, the lock
143  * will be temporarily dropped and reacquired in write mode.
144  */
145 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
146 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
147 	     _root;							\
148 	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
149 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
150 		} else
151 
152 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
153 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
154 				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
155 				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
156 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
157 		} else
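/*
 * Example use of the yield-safe iterator above (sketch; mirrors
 * __kvm_tdp_mmu_zap_gfn_range() later in this file):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush,
 *				      shared);
 *
 * Breaking out of the loop early requires an explicit kvm_tdp_mmu_put_root()
 * on the current root, per the comment above the macros.
 */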
158 
159 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
160 						   int level)
161 {
162 	union kvm_mmu_page_role role;
163 
164 	role = vcpu->arch.mmu->mmu_role.base;
165 	role.level = level;
166 	role.direct = true;
167 	role.gpte_is_8_bytes = true;
168 	role.access = ACC_ALL;
169 
170 	return role;
171 }
172 
173 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
174 					       int level)
175 {
176 	struct kvm_mmu_page *sp;
177 
178 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
179 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
180 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
181 
182 	sp->role.word = page_role_for_level(vcpu, level).word;
183 	sp->gfn = gfn;
184 	sp->tdp_mmu_page = true;
185 
186 	trace_kvm_mmu_get_page(sp, true);
187 
188 	return sp;
189 }
190 
191 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
192 {
193 	union kvm_mmu_page_role role;
194 	struct kvm *kvm = vcpu->kvm;
195 	struct kvm_mmu_page *root;
196 
197 	lockdep_assert_held_write(&kvm->mmu_lock);
198 
199 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
200 
201 	/* Check for an existing root before allocating a new one. */
202 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
203 		if (root->role.word == role.word &&
204 		    kvm_tdp_mmu_get_root(kvm, root))
205 			goto out;
206 	}
207 
208 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
209 	refcount_set(&root->tdp_mmu_root_count, 1);
210 
211 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
212 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
213 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
214 
215 out:
216 	return __pa(root->spt);
217 }
218 
219 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
220 				u64 old_spte, u64 new_spte, int level,
221 				bool shared);
222 
223 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
224 {
225 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
226 		return;
227 
228 	if (is_accessed_spte(old_spte) &&
229 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
230 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
231 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
232 }
233 
234 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
235 					  u64 old_spte, u64 new_spte, int level)
236 {
237 	bool pfn_changed;
238 	struct kvm_memory_slot *slot;
239 
240 	if (level > PG_LEVEL_4K)
241 		return;
242 
243 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
244 
245 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
246 	    is_writable_pte(new_spte)) {
247 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
248 		mark_page_dirty_in_slot(kvm, slot, gfn);
249 	}
250 }
251 
252 /**
253  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
254  *
255  * @kvm: kvm instance
256  * @sp: the new page
257  * @shared: This operation may not be running under the exclusive use of
258  *	    the MMU lock and the operation must synchronize with other
259  *	    threads that might be adding or removing pages.
260  * @account_nx: This page replaces a NX large page and should be marked for
261  *		eventual reclaim.
262  */
263 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
264 			      bool shared, bool account_nx)
265 {
266 	if (shared)
267 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
268 	else
269 		lockdep_assert_held_write(&kvm->mmu_lock);
270 
271 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
272 	if (account_nx)
273 		account_huge_nx_page(kvm, sp);
274 
275 	if (shared)
276 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
277 }
278 
279 /**
280  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
281  *
282  * @kvm: kvm instance
283  * @sp: the page to be removed
284  * @shared: This operation may not be running under the exclusive use of
285  *	    the MMU lock and the operation must synchronize with other
286  *	    threads that might be adding or removing pages.
287  */
288 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
289 				bool shared)
290 {
291 	if (shared)
292 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
293 	else
294 		lockdep_assert_held_write(&kvm->mmu_lock);
295 
296 	list_del(&sp->link);
297 	if (sp->lpage_disallowed)
298 		unaccount_huge_nx_page(kvm, sp);
299 
300 	if (shared)
301 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
302 }
303 
304 /**
305  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
306  *
307  * @kvm: kvm instance
308  * @pt: the page removed from the paging structure
309  * @shared: This operation may not be running under the exclusive use
310  *	    of the MMU lock and the operation must synchronize with other
311  *	    threads that might be modifying SPTEs.
312  *
313  * Given a page table that has been removed from the TDP paging structure,
314  * iterates through the page table to clear SPTEs and free child page tables.
315  *
316  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
317  * protection. Since this thread removed it from the paging structure,
318  * this thread will be responsible for ensuring the page is freed. Hence the
319  * early rcu_dereferences in the function.
320  */
321 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
322 					bool shared)
323 {
324 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
325 	int level = sp->role.level;
326 	gfn_t base_gfn = sp->gfn;
327 	u64 old_child_spte;
328 	u64 *sptep;
329 	gfn_t gfn;
330 	int i;
331 
332 	trace_kvm_mmu_prepare_zap_page(sp);
333 
334 	tdp_mmu_unlink_page(kvm, sp, shared);
335 
336 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
337 		sptep = rcu_dereference(pt) + i;
338 		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
339 
340 		if (shared) {
341 			/*
342 			 * Set the SPTE to a nonpresent value that other
343 			 * threads will not overwrite. If the SPTE was
344 			 * already marked as removed then another thread
345 			 * handling a page fault could overwrite it, so
346 			 * keep retrying the exchange until the SPTE moves
347 			 * from some other value to the removed SPTE value.
348 			 */
349 			for (;;) {
350 				old_child_spte = xchg(sptep, REMOVED_SPTE);
351 				if (!is_removed_spte(old_child_spte))
352 					break;
353 				cpu_relax();
354 			}
355 		} else {
356 			/*
357 			 * If the SPTE is not MMU-present, there is no backing
358 			 * page associated with the SPTE and so no side effects
359 			 * that need to be recorded, and exclusive ownership of
360 			 * mmu_lock ensures the SPTE can't be made present.
361 			 * Note, zapping MMIO SPTEs is also unnecessary as they
362 			 * are guarded by the memslots generation, not by being
363 			 * unreachable.
364 			 */
365 			old_child_spte = READ_ONCE(*sptep);
366 			if (!is_shadow_present_pte(old_child_spte))
367 				continue;
368 
369 			/*
370 			 * Marking the SPTE as a removed SPTE is not
371 			 * strictly necessary here as the MMU lock will
372 			 * stop other threads from concurrently modifying
373 			 * this SPTE. Using the removed SPTE value keeps
374 			 * the two branches consistent and simplifies
375 			 * the function.
376 			 */
377 			WRITE_ONCE(*sptep, REMOVED_SPTE);
378 		}
379 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
380 				    old_child_spte, REMOVED_SPTE, level - 1,
381 				    shared);
382 	}
383 
384 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
385 					   KVM_PAGES_PER_HPAGE(level));
386 
387 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
388 }
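/*
 * Annotation (not original code): REMOVED_SPTE, defined in spte.h, acts as a
 * "frozen" marker in two situations handled in this file:
 *
 *  - In handle_removed_tdp_mmu_page() above, every child SPTE of a detached
 *    page table is left as REMOVED_SPTE until the table is freed via RCU.
 *  - In tdp_mmu_zap_spte_atomic() below, a live SPTE is frozen to
 *    REMOVED_SPTE, the TLBs are flushed, and only then is the SPTE cleared
 *    to 0 by the thread that froze it.
 *
 * Threads racing under the MMU read lock never overwrite a removed SPTE;
 * they back off and retry, see tdp_mmu_set_spte_atomic_no_dirty_log().
 */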
389 
390 /**
391  * handle_changed_spte - handle bookkeeping associated with an SPTE change
392  * @kvm: kvm instance
393  * @as_id: the address space of the paging structure the SPTE was a part of
394  * @gfn: the base GFN that was mapped by the SPTE
395  * @old_spte: The value of the SPTE before the change
396  * @new_spte: The value of the SPTE after the change
397  * @level: the level of the PT the SPTE is part of in the paging structure
398  * @shared: This operation may not be running under the exclusive use of
399  *	    the MMU lock and the operation must synchronize with other
400  *	    threads that might be modifying SPTEs.
401  *
402  * Handle bookkeeping that might result from the modification of a SPTE.
403  * This function must be called for all TDP SPTE modifications.
404  */
405 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
406 				  u64 old_spte, u64 new_spte, int level,
407 				  bool shared)
408 {
409 	bool was_present = is_shadow_present_pte(old_spte);
410 	bool is_present = is_shadow_present_pte(new_spte);
411 	bool was_leaf = was_present && is_last_spte(old_spte, level);
412 	bool is_leaf = is_present && is_last_spte(new_spte, level);
413 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
414 
415 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
416 	WARN_ON(level < PG_LEVEL_4K);
417 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
418 
419 	/*
420 	 * If this warning were to trigger it would indicate that there was a
421 	 * missing MMU notifier or a race with some notifier handler.
422 	 * A present, leaf SPTE should never be directly replaced with another
423 	 * present leaf SPTE pointing to a different PFN. A notifier handler
424 	 * should be zapping the SPTE before the main MM's page table is
425 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
426 	 * thread before replacement.
427 	 */
428 	if (was_leaf && is_leaf && pfn_changed) {
429 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
430 		       "SPTE with another present leaf SPTE mapping a\n"
431 		       "different PFN!\n"
432 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
433 		       as_id, gfn, old_spte, new_spte, level);
434 
435 		/*
436 		 * Crash the host to prevent error propagation and guest data
437 		 * corruption.
438 		 */
439 		BUG();
440 	}
441 
442 	if (old_spte == new_spte)
443 		return;
444 
445 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
446 
447 	/*
448 	 * The only times a SPTE should be changed from a non-present to
449 	 * non-present state is when an MMIO entry is installed/modified/
450 	 * removed. In that case, there is nothing to do here.
451 	 */
452 	if (!was_present && !is_present) {
453 		/*
454 		 * If this change does not involve a MMIO SPTE or removed SPTE,
455 		 * it is unexpected. Log the change, though it should not
456 		 * impact the guest since both the former and current SPTEs
457 		 * are nonpresent.
458 		 */
459 		if (WARN_ON(!is_mmio_spte(old_spte) &&
460 			    !is_mmio_spte(new_spte) &&
461 			    !is_removed_spte(new_spte)))
462 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
463 			       "should not be replaced with another,\n"
464 			       "different nonpresent SPTE, unless one or both\n"
465 			       "are MMIO SPTEs, or the new SPTE is\n"
466 			       "a temporary removed SPTE.\n"
467 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
468 			       as_id, gfn, old_spte, new_spte, level);
469 		return;
470 	}
471 
472 
473 	if (was_leaf && is_dirty_spte(old_spte) &&
474 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
475 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
476 
477 	/*
478 	 * Recursively handle child PTs if the change removed a subtree from
479 	 * the paging structure.
480 	 */
481 	if (was_present && !was_leaf && (pfn_changed || !is_present))
482 		handle_removed_tdp_mmu_page(kvm,
483 				spte_to_child_pt(old_spte, level), shared);
484 }
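/*
 * Annotation: the transitions __handle_changed_spte() distinguishes, in the
 * order they are checked above:
 *
 *	!was_present && !is_present	-> nothing to do (only legal for MMIO
 *					   or removed SPTEs)
 *	was_leaf && dirty, now gone	-> kvm_set_pfn_dirty()
 *	was_present && !was_leaf,
 *	subtree removed			-> handle_removed_tdp_mmu_page()
 *
 * Accessed-bit and dirty-log bookkeeping are layered on top by
 * handle_changed_spte() below.
 */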
485 
486 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
487 				u64 old_spte, u64 new_spte, int level,
488 				bool shared)
489 {
490 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
491 			      shared);
492 	handle_changed_spte_acc_track(old_spte, new_spte, level);
493 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
494 				      new_spte, level);
495 }
496 
497 /*
498  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
499  * and handle the associated bookkeeping, but do not mark the page dirty
500  * in KVM's dirty bitmaps.
501  *
502  * @kvm: kvm instance
503  * @iter: a tdp_iter instance currently on the SPTE that should be set
504  * @new_spte: The value the SPTE should be set to
505  * Returns: true if the SPTE was set, false if it was not. If false is returned,
506  *	    this function will have no side-effects.
507  */
508 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
509 							struct tdp_iter *iter,
510 							u64 new_spte)
511 {
512 	lockdep_assert_held_read(&kvm->mmu_lock);
513 
514 	/*
515 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
516 	 * may modify it.
517 	 */
518 	if (is_removed_spte(iter->old_spte))
519 		return false;
520 
521 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
522 		      new_spte) != iter->old_spte)
523 		return false;
524 
525 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
526 			      new_spte, iter->level, true);
527 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
528 
529 	return true;
530 }
531 
532 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
533 					   struct tdp_iter *iter,
534 					   u64 new_spte)
535 {
536 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
537 		return false;
538 
539 	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
540 				      iter->old_spte, new_spte, iter->level);
541 	return true;
542 }
543 
544 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
545 					   struct tdp_iter *iter)
546 {
547 	/*
548 	 * Freeze the SPTE by setting it to a special,
549 	 * non-present value. This will stop other threads from
550 	 * immediately installing a present entry in its place
551 	 * before the TLBs are flushed.
552 	 */
553 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
554 		return false;
555 
556 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
557 					   KVM_PAGES_PER_HPAGE(iter->level));
558 
559 	/*
560 	 * No other thread can overwrite the removed SPTE as they
561 	 * must either wait on the MMU lock or use
562 	 * tdp_mmu_set_spte_atomic which will not overwrite the
563 	 * special removed SPTE value. No bookkeeping is needed
564 	 * here since the SPTE is going from non-present
565 	 * to non-present.
566 	 */
567 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
568 
569 	return true;
570 }
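/*
 * Typical caller pattern for the atomic helpers above (sketch; mirrors the
 * retry loops in zap_gfn_range() and wrprot_gfn_range() below):
 *
 * retry:
 *	if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
 *		(lost the race: re-read the SPTE and try again)
 *		iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 *		goto retry;
 *	}
 */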
571 
572 
573 /*
574  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
575  * @kvm: kvm instance
576  * @iter: a tdp_iter instance currently on the SPTE that should be set
577  * @new_spte: The value the SPTE should be set to
578  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
579  *		      of the page. Should be set unless handling an MMU
580  *		      notifier for access tracking. Leaving record_acc_track
581  *		      unset in that case prevents page accesses from being
582  *		      double counted.
583  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
584  *		      appropriate for the change being made. Should be set
585  *		      unless performing certain dirty logging operations.
586  *		      Leaving record_dirty_log unset in that case prevents page
587  *		      writes from being double counted.
588  */
589 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
590 				      u64 new_spte, bool record_acc_track,
591 				      bool record_dirty_log)
592 {
593 	lockdep_assert_held_write(&kvm->mmu_lock);
594 
595 	/*
596 	 * No thread should be using this function to set SPTEs to the
597 	 * temporary removed SPTE value.
598 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
599 	 * should be used. If operating under the MMU lock in write mode, the
600 	 * use of the removed SPTE should not be necessary.
601 	 */
602 	WARN_ON(is_removed_spte(iter->old_spte));
603 
604 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
605 
606 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
607 			      new_spte, iter->level, false);
608 	if (record_acc_track)
609 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
610 					      iter->level);
611 	if (record_dirty_log)
612 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
613 					      iter->old_spte, new_spte,
614 					      iter->level);
615 }
616 
617 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
618 				    u64 new_spte)
619 {
620 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
621 }
622 
623 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
624 						 struct tdp_iter *iter,
625 						 u64 new_spte)
626 {
627 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
628 }
629 
630 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
631 						 struct tdp_iter *iter,
632 						 u64 new_spte)
633 {
634 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
635 }
636 
637 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
638 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
639 
640 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
641 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
642 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
643 		    !is_last_spte(_iter.old_spte, _iter.level))		\
644 			continue;					\
645 		else
646 
647 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
648 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
649 			 _mmu->shadow_root_level, _start, _end)
650 
651 /*
652  * Yield if the MMU lock is contended or this thread needs to return control
653  * to the scheduler.
654  *
655  * If this function should yield and flush is set, it will perform a remote
656  * TLB flush before yielding.
657  *
658  * If this function yields, it will also reset the tdp_iter's walk over the
659  * paging structure and the calling function should skip to the next
660  * iteration to allow the iterator to continue its traversal from the
661  * paging structure root.
662  *
663  * Return true if this function yielded and the iterator's traversal was reset.
664  * Return false if a yield was not needed.
665  */
666 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
667 					     struct tdp_iter *iter, bool flush,
668 					     bool shared)
669 {
670 	/* Ensure forward progress has been made before yielding. */
671 	if (iter->next_last_level_gfn == iter->yielded_gfn)
672 		return false;
673 
674 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
675 		rcu_read_unlock();
676 
677 		if (flush)
678 			kvm_flush_remote_tlbs(kvm);
679 
680 		if (shared)
681 			cond_resched_rwlock_read(&kvm->mmu_lock);
682 		else
683 			cond_resched_rwlock_write(&kvm->mmu_lock);
684 
685 		rcu_read_lock();
686 
687 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
688 
689 		tdp_iter_restart(iter);
690 
691 		return true;
692 	}
693 
694 	return false;
695 }
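/*
 * Sketch of the calling convention described above, as followed by
 * zap_gfn_range() and zap_collapsible_spte_range() below:
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *			flush = false;	(the yield already flushed)
 *			continue;	(the iterator was restarted)
 *		}
 *		...
 *	}
 */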
696 
697 /*
698  * Tears down the mappings for the range of gfns, [start, end), and frees the
699  * non-root pages mapping GFNs strictly within that range. Returns true if
700  * SPTEs have been cleared and a TLB flush is needed before releasing the
701  * MMU lock.
702  *
703  * If can_yield is true, will release the MMU lock and reschedule if the
704  * scheduler needs the CPU or there is contention on the MMU lock. If this
705  * function cannot yield, it will not release the MMU lock or reschedule and
706  * the caller must ensure it does not supply too large a GFN range, or the
707  * operation can cause a soft lockup.
708  *
709  * If shared is true, this thread holds the MMU lock in read mode and must
710  * account for the possibility that other threads are modifying the paging
711  * structures concurrently. If shared is false, this thread should hold the
712  * MMU lock in write mode.
713  */
714 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
715 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
716 			  bool shared)
717 {
718 	struct tdp_iter iter;
719 
720 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
721 
722 	rcu_read_lock();
723 
724 	tdp_root_for_each_pte(iter, root, start, end) {
725 retry:
726 		if (can_yield &&
727 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
728 			flush = false;
729 			continue;
730 		}
731 
732 		if (!is_shadow_present_pte(iter.old_spte))
733 			continue;
734 
735 		/*
736 		 * If this is a non-last-level SPTE that covers a larger range
737 		 * than should be zapped, continue, and zap the mappings at a
738 		 * lower level.
739 		 */
740 		if ((iter.gfn < start ||
741 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
742 		    !is_last_spte(iter.old_spte, iter.level))
743 			continue;
744 
745 		if (!shared) {
746 			tdp_mmu_set_spte(kvm, &iter, 0);
747 			flush = true;
748 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
749 			/*
750 			 * The iter must explicitly re-read the SPTE because
751 			 * the atomic cmpxchg failed.
752 			 */
753 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
754 			goto retry;
755 		}
756 	}
757 
758 	rcu_read_unlock();
759 	return flush;
760 }
761 
762 /*
763  * Tears down the mappings for the range of gfns, [start, end), and frees the
764  * non-root pages mapping GFNs strictly within that range. Returns true if
765  * SPTEs have been cleared and a TLB flush is needed before releasing the
766  * MMU lock.
767  *
768  * If shared is true, this thread holds the MMU lock in read mode and must
769  * account for the possibility that other threads are modifying the paging
770  * structures concurrently. If shared is false, this thread should hold the
771  * MMU lock in write mode.
772  */
773 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
774 				 gfn_t end, bool can_yield, bool flush,
775 				 bool shared)
776 {
777 	struct kvm_mmu_page *root;
778 
779 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
780 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
781 				      shared);
782 
783 	return flush;
784 }
785 
786 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
787 {
788 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
789 	bool flush = false;
790 	int i;
791 
792 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
793 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
794 						  flush, false);
795 
796 	if (flush)
797 		kvm_flush_remote_tlbs(kvm);
798 }
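/*
 * Worked example for the max_gfn computation used here and in
 * kvm_tdp_mmu_put_root(): on a CPU reporting 52 physical address bits,
 * shadow_phys_bits == 52 and PAGE_SHIFT == 12, so
 *
 *	max_gfn = 1ULL << (52 - 12) = 2^40
 *
 * which covers every guest frame number the TDP MMU could have mapped.
 */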
799 
800 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
801 						  struct kvm_mmu_page *prev_root)
802 {
803 	struct kvm_mmu_page *next_root;
804 
805 	if (prev_root)
806 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
807 						  &prev_root->link,
808 						  typeof(*prev_root), link);
809 	else
810 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
811 						   typeof(*next_root), link);
812 
813 	while (next_root && !(next_root->role.invalid &&
814 			      refcount_read(&next_root->tdp_mmu_root_count)))
815 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
816 						  &next_root->link,
817 						  typeof(*next_root), link);
818 
819 	return next_root;
820 }
821 
822 /*
823  * Since kvm_tdp_mmu_invalidate_all_roots has acquired a reference to each
824  * invalidated root, they will not be freed until this function drops the
825  * reference. Before dropping that reference, tear down the paging
826  * structure so that whichever thread does drop the last reference
827  * only has to do a trivial amount of work. Since the roots are invalid,
828  * no new SPTEs should be created under them.
829  */
830 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
831 {
832 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
833 	struct kvm_mmu_page *next_root;
834 	struct kvm_mmu_page *root;
835 	bool flush = false;
836 
837 	lockdep_assert_held_read(&kvm->mmu_lock);
838 
839 	rcu_read_lock();
840 
841 	root = next_invalidated_root(kvm, NULL);
842 
843 	while (root) {
844 		next_root = next_invalidated_root(kvm, root);
845 
846 		rcu_read_unlock();
847 
848 		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
849 				      true);
850 
851 		/*
852 		 * Put the reference acquired in
853 		 * kvm_tdp_mmu_invalidate_all_roots().
854 		 */
855 		kvm_tdp_mmu_put_root(kvm, root, true);
856 
857 		root = next_root;
858 
859 		rcu_read_lock();
860 	}
861 
862 	rcu_read_unlock();
863 
864 	if (flush)
865 		kvm_flush_remote_tlbs(kvm);
866 }
867 
868 /*
869  * Mark each TDP MMU root as invalid so that other threads
870  * will drop their references and allow the root count to
871  * go to 0.
872  *
873  * Also take a reference on all roots so that this thread
874  * can do the bulk of the work required to free the roots
875  * once they are invalidated. Without this reference, a
876  * vCPU thread might drop the last reference to a root and
877  * get stuck with tearing down the entire paging structure.
878  *
879  * Roots which have a zero refcount should be skipped as
880  * they're already being torn down.
881  * Already invalid roots should be referenced again so that
882  * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots is
883  * done with them.
884  *
885  * This has essentially the same effect for the TDP MMU
886  * as updating mmu_valid_gen does for the shadow MMU.
887  */
888 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
889 {
890 	struct kvm_mmu_page *root;
891 
892 	lockdep_assert_held_write(&kvm->mmu_lock);
893 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
894 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
895 			root->role.invalid = true;
896 }
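/*
 * Sketch of the intended fast-zap sequence, inferred from the lockdep
 * assertions in kvm_tdp_mmu_invalidate_all_roots() and
 * kvm_tdp_mmu_zap_invalidated_roots() (the actual caller lives in mmu.c):
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);
 *	write_unlock(&kvm->mmu_lock);
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 *	read_unlock(&kvm->mmu_lock);
 */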
897 
898 /*
899  * Installs a last-level SPTE to handle a TDP page fault.
900  * (NPT/EPT violation/misconfiguration)
901  */
902 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
903 					  int map_writable,
904 					  struct tdp_iter *iter,
905 					  kvm_pfn_t pfn, bool prefault)
906 {
907 	u64 new_spte;
908 	int ret = 0;
909 	int make_spte_ret = 0;
910 
911 	if (unlikely(is_noslot_pfn(pfn)))
912 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
913 	else
914 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
915 					 pfn, iter->old_spte, prefault, true,
916 					 map_writable, !shadow_accessed_mask,
917 					 &new_spte);
918 
919 	if (new_spte == iter->old_spte)
920 		ret = RET_PF_SPURIOUS;
921 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
922 		return RET_PF_RETRY;
923 
924 	/*
925 	 * If the page fault was caused by a write but the page is write
926 	 * protected, emulation is needed. If the emulation was skipped,
927 	 * the vCPU would have the same fault again.
928 	 */
929 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
930 		if (write)
931 			ret = RET_PF_EMULATE;
932 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
933 	}
934 
935 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
936 	if (unlikely(is_mmio_spte(new_spte))) {
937 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
938 				     new_spte);
939 		ret = RET_PF_EMULATE;
940 	} else {
941 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
942 				       rcu_dereference(iter->sptep));
943 	}
944 
945 	if (!prefault)
946 		vcpu->stat.pf_fixed++;
947 
948 	return ret;
949 }
950 
951 /*
952  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
953  * page tables and SPTEs to translate the faulting guest physical address.
954  */
955 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
956 		    int map_writable, int max_level, kvm_pfn_t pfn,
957 		    bool prefault)
958 {
959 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
960 	bool write = error_code & PFERR_WRITE_MASK;
961 	bool exec = error_code & PFERR_FETCH_MASK;
962 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
963 	struct kvm_mmu *mmu = vcpu->arch.mmu;
964 	struct tdp_iter iter;
965 	struct kvm_mmu_page *sp;
966 	u64 *child_pt;
967 	u64 new_spte;
968 	int ret;
969 	gfn_t gfn = gpa >> PAGE_SHIFT;
970 	int level;
971 	int req_level;
972 
973 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
974 		return RET_PF_RETRY;
975 	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
976 		return RET_PF_RETRY;
977 
978 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
979 					huge_page_disallowed, &req_level);
980 
981 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
982 
983 	rcu_read_lock();
984 
985 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
986 		if (nx_huge_page_workaround_enabled)
987 			disallowed_hugepage_adjust(iter.old_spte, gfn,
988 						   iter.level, &pfn, &level);
989 
990 		if (iter.level == level)
991 			break;
992 
993 		/*
994 		 * If there is an SPTE mapping a large page at a higher level
995 		 * than the target, that SPTE must be cleared and replaced
996 		 * with a non-leaf SPTE.
997 		 */
998 		if (is_shadow_present_pte(iter.old_spte) &&
999 		    is_large_pte(iter.old_spte)) {
1000 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1001 				break;
1002 
1003 			/*
1004 			 * The iter must explicitly re-read the spte here
1005 			 * because the new value informs the !present
1006 			 * path below.
1007 			 */
1008 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1009 		}
1010 
1011 		if (!is_shadow_present_pte(iter.old_spte)) {
1012 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
1013 			child_pt = sp->spt;
1014 
1015 			new_spte = make_nonleaf_spte(child_pt,
1016 						     !shadow_accessed_mask);
1017 
1018 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
1019 						    new_spte)) {
1020 				tdp_mmu_link_page(vcpu->kvm, sp, true,
1021 						  huge_page_disallowed &&
1022 						  req_level >= iter.level);
1023 
1024 				trace_kvm_mmu_get_page(sp, true);
1025 			} else {
1026 				tdp_mmu_free_sp(sp);
1027 				break;
1028 			}
1029 		}
1030 	}
1031 
1032 	if (iter.level != level) {
1033 		rcu_read_unlock();
1034 		return RET_PF_RETRY;
1035 	}
1036 
1037 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1038 					      pfn, prefault);
1039 	rcu_read_unlock();
1040 
1041 	return ret;
1042 }
1043 
1044 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1045 				 bool flush)
1046 {
1047 	struct kvm_mmu_page *root;
1048 
1049 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1050 		flush |= zap_gfn_range(kvm, root, range->start, range->end,
1051 				       range->may_block, flush, false);
1052 
1053 	return flush;
1054 }
1055 
1056 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1057 			      struct kvm_gfn_range *range);
1058 
1059 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1060 						   struct kvm_gfn_range *range,
1061 						   tdp_handler_t handler)
1062 {
1063 	struct kvm_mmu_page *root;
1064 	struct tdp_iter iter;
1065 	bool ret = false;
1066 
1067 	rcu_read_lock();
1068 
1069 	/*
1070 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1071 	 * into this helper allow blocking; it'd be dead, wasteful code.
1072 	 */
1073 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1074 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1075 			ret |= handler(kvm, &iter, range);
1076 	}
1077 
1078 	rcu_read_unlock();
1079 
1080 	return ret;
1081 }
1082 
1083 /*
1084  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1085  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
1086  * return true if any of the GFNs in the range have been accessed.
1087 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1088 			  struct kvm_gfn_range *range)
1089 {
1090 	u64 new_spte = 0;
1091 
1092 	/* If we have a non-accessed entry we don't need to change the pte. */
1093 	if (!is_accessed_spte(iter->old_spte))
1094 		return false;
1095 
1096 	new_spte = iter->old_spte;
1097 
1098 	if (spte_ad_enabled(new_spte)) {
1099 		new_spte &= ~shadow_accessed_mask;
1100 	} else {
1101 		/*
1102 		 * Capture the dirty status of the page, so that it doesn't get
1103 		 * lost when the SPTE is marked for access tracking.
1104 		 */
1105 		if (is_writable_pte(new_spte))
1106 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1107 
1108 		new_spte = mark_spte_for_access_track(new_spte);
1109 	}
1110 
1111 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1112 
1113 	return true;
1114 }
1115 
1116 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1117 {
1118 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1119 }
1120 
1121 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1122 			 struct kvm_gfn_range *range)
1123 {
1124 	return is_accessed_spte(iter->old_spte);
1125 }
1126 
1127 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1128 {
1129 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1130 }
1131 
1132 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1133 			 struct kvm_gfn_range *range)
1134 {
1135 	u64 new_spte;
1136 
1137 	/* Huge pages aren't expected to be modified without first being zapped. */
1138 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1139 
1140 	if (iter->level != PG_LEVEL_4K ||
1141 	    !is_shadow_present_pte(iter->old_spte))
1142 		return false;
1143 
1144 	/*
1145 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1146 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1147 	 * invariant that the PFN of a present leaf SPTE can never change.
1148 	 * See __handle_changed_spte().
1149 	 */
1150 	tdp_mmu_set_spte(kvm, iter, 0);
1151 
1152 	if (!pte_write(range->pte)) {
1153 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1154 								  pte_pfn(range->pte));
1155 
1156 		tdp_mmu_set_spte(kvm, iter, new_spte);
1157 	}
1158 
1159 	return true;
1160 }
1161 
1162 /*
1163  * Handle the changed_pte MMU notifier for the TDP MMU.
1164  * range->pte is the new pte_t mapping the HVA specified by the MMU
1165  * notifier.
1166  * Returns true if a flush is needed before releasing the MMU lock.
1167  */
1168 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1169 {
1170 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1171 
1172 	/* FIXME: return 'flush' instead of flushing here. */
1173 	if (flush)
1174 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1175 
1176 	return false;
1177 }
1178 
1179 /*
1180  * Remove write access from all the SPTEs mapping GFNs [start, end). If
1181  * Remove write access from all the SPTEs mapping GFNs [start, end). Only
1182  * SPTEs at or above min_level are write-protected.
1183  */
1184 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1185 			     gfn_t start, gfn_t end, int min_level)
1186 {
1187 	struct tdp_iter iter;
1188 	u64 new_spte;
1189 	bool spte_set = false;
1190 
1191 	rcu_read_lock();
1192 
1193 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1194 
1195 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1196 				   min_level, start, end) {
1197 retry:
1198 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1199 			continue;
1200 
1201 		if (!is_shadow_present_pte(iter.old_spte) ||
1202 		    !is_last_spte(iter.old_spte, iter.level) ||
1203 		    !(iter.old_spte & PT_WRITABLE_MASK))
1204 			continue;
1205 
1206 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1207 
1208 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1209 							  new_spte)) {
1210 			/*
1211 			 * The iter must explicitly re-read the SPTE because
1212 			 * the atomic cmpxchg failed.
1213 			 */
1214 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1215 			goto retry;
1216 		}
1217 		spte_set = true;
1218 	}
1219 
1220 	rcu_read_unlock();
1221 	return spte_set;
1222 }
1223 
1224 /*
1225  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1226  * only affect leaf SPTEs down to min_level.
1227  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1228  */
1229 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1230 			     int min_level)
1231 {
1232 	struct kvm_mmu_page *root;
1233 	bool spte_set = false;
1234 
1235 	lockdep_assert_held_read(&kvm->mmu_lock);
1236 
1237 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1238 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1239 			     slot->base_gfn + slot->npages, min_level);
1240 
1241 	return spte_set;
1242 }
1243 
1244 /*
1245  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1246  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1247  * If AD bits are not enabled, this will require clearing the writable bit on
1248  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1249  * be flushed.
1250  */
1251 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1252 			   gfn_t start, gfn_t end)
1253 {
1254 	struct tdp_iter iter;
1255 	u64 new_spte;
1256 	bool spte_set = false;
1257 
1258 	rcu_read_lock();
1259 
1260 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1261 retry:
1262 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1263 			continue;
1264 
1265 		if (spte_ad_need_write_protect(iter.old_spte)) {
1266 			if (is_writable_pte(iter.old_spte))
1267 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1268 			else
1269 				continue;
1270 		} else {
1271 			if (iter.old_spte & shadow_dirty_mask)
1272 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1273 			else
1274 				continue;
1275 		}
1276 
1277 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1278 							  new_spte)) {
1279 			/*
1280 			 * The iter must explicitly re-read the SPTE because
1281 			 * the atomic cmpxchg failed.
1282 			 */
1283 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1284 			goto retry;
1285 		}
1286 		spte_set = true;
1287 	}
1288 
1289 	rcu_read_unlock();
1290 	return spte_set;
1291 }
1292 
1293 /*
1294  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1295  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1296  * If AD bits are not enabled, this will require clearing the writable bit on
1297  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1298  * be flushed.
1299  */
1300 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1301 {
1302 	struct kvm_mmu_page *root;
1303 	bool spte_set = false;
1304 
1305 	lockdep_assert_held_read(&kvm->mmu_lock);
1306 
1307 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1308 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1309 				slot->base_gfn + slot->npages);
1310 
1311 	return spte_set;
1312 }
1313 
1314 /*
1315  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1316  * set in mask, starting at gfn. The given memslot is expected to contain all
1317  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1318  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1319  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1320  */
1321 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1322 				  gfn_t gfn, unsigned long mask, bool wrprot)
1323 {
1324 	struct tdp_iter iter;
1325 	u64 new_spte;
1326 
1327 	rcu_read_lock();
1328 
1329 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1330 				    gfn + BITS_PER_LONG) {
1331 		if (!mask)
1332 			break;
1333 
1334 		if (iter.level > PG_LEVEL_4K ||
1335 		    !(mask & (1UL << (iter.gfn - gfn))))
1336 			continue;
1337 
1338 		mask &= ~(1UL << (iter.gfn - gfn));
1339 
1340 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1341 			if (is_writable_pte(iter.old_spte))
1342 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1343 			else
1344 				continue;
1345 		} else {
1346 			if (iter.old_spte & shadow_dirty_mask)
1347 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1348 			else
1349 				continue;
1350 		}
1351 
1352 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1353 	}
1354 
1355 	rcu_read_unlock();
1356 }
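/*
 * Worked example for the mask handling above: with gfn == base and
 * mask == 0x30, the walk starts at base + __ffs(0x30) == base + 4 and
 * clears dirty state on the SPTEs mapping base + 4 and base + 5 (bits 4
 * and 5 of the mask), breaking out early once mask reaches 0.
 */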
1357 
1358 /*
1359  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1360  * set in mask, starting at gfn. The given memslot is expected to contain all
1361  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1362  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1363  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1364  */
1365 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1366 				       struct kvm_memory_slot *slot,
1367 				       gfn_t gfn, unsigned long mask,
1368 				       bool wrprot)
1369 {
1370 	struct kvm_mmu_page *root;
1371 
1372 	lockdep_assert_held_write(&kvm->mmu_lock);
1373 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1374 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1375 }
1376 
1377 /*
1378  * Clear leaf entries which could be replaced by large mappings, for
1379  * GFNs within the slot.
1380  */
1381 static bool zap_collapsible_spte_range(struct kvm *kvm,
1382 				       struct kvm_mmu_page *root,
1383 				       const struct kvm_memory_slot *slot,
1384 				       bool flush)
1385 {
1386 	gfn_t start = slot->base_gfn;
1387 	gfn_t end = start + slot->npages;
1388 	struct tdp_iter iter;
1389 	kvm_pfn_t pfn;
1390 
1391 	rcu_read_lock();
1392 
1393 	tdp_root_for_each_pte(iter, root, start, end) {
1394 retry:
1395 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1396 			flush = false;
1397 			continue;
1398 		}
1399 
1400 		if (!is_shadow_present_pte(iter.old_spte) ||
1401 		    !is_last_spte(iter.old_spte, iter.level))
1402 			continue;
1403 
1404 		pfn = spte_to_pfn(iter.old_spte);
1405 		if (kvm_is_reserved_pfn(pfn) ||
1406 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1407 							    pfn, PG_LEVEL_NUM))
1408 			continue;
1409 
1410 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1411 			/*
1412 			 * The iter must explicitly re-read the SPTE because
1413 			 * the atomic cmpxchg failed.
1414 			 */
1415 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1416 			goto retry;
1417 		}
1418 		flush = true;
1419 	}
1420 
1421 	rcu_read_unlock();
1422 
1423 	return flush;
1424 }
1425 
1426 /*
1427  * Zap leaf entries which could be replaced by large mappings, for
1428  * GFNs within the slot.
1429  */
1430 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1431 				       const struct kvm_memory_slot *slot,
1432 				       bool flush)
1433 {
1434 	struct kvm_mmu_page *root;
1435 
1436 	lockdep_assert_held_read(&kvm->mmu_lock);
1437 
1438 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1439 		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1440 
1441 	return flush;
1442 }
1443 
1444 /*
1445  * Removes write access on the last level SPTE mapping this GFN and unsets the
1446  * MMU-writable bit to ensure future writes continue to be intercepted.
1447  * Returns true if an SPTE was set and a TLB flush is needed.
1448  */
1449 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1450 			      gfn_t gfn)
1451 {
1452 	struct tdp_iter iter;
1453 	u64 new_spte;
1454 	bool spte_set = false;
1455 
1456 	rcu_read_lock();
1457 
1458 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1459 		if (!is_writable_pte(iter.old_spte))
1460 			break;
1461 
1462 		new_spte = iter.old_spte &
1463 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1464 
1465 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1466 		spte_set = true;
1467 	}
1468 
1469 	rcu_read_unlock();
1470 
1471 	return spte_set;
1472 }
1473 
1474 /*
1475  * Removes write access on the last level SPTE mapping this GFN and unsets the
1476  * MMU-writable bit to ensure future writes continue to be intercepted.
1477  * Returns true if an SPTE was set and a TLB flush is needed.
1478  */
1479 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1480 				   struct kvm_memory_slot *slot, gfn_t gfn)
1481 {
1482 	struct kvm_mmu_page *root;
1483 	bool spte_set = false;
1484 
1485 	lockdep_assert_held_write(&kvm->mmu_lock);
1486 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1487 		spte_set |= write_protect_gfn(kvm, root, gfn);
1488 
1489 	return spte_set;
1490 }
1491 
1492 /*
1493  * Return the level of the lowest level SPTE added to sptes.
1494  * That SPTE may be non-present.
1495  */
1496 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1497 			 int *root_level)
1498 {
1499 	struct tdp_iter iter;
1500 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1501 	gfn_t gfn = addr >> PAGE_SHIFT;
1502 	int leaf = -1;
1503 
1504 	*root_level = vcpu->arch.mmu->shadow_root_level;
1505 
1506 	rcu_read_lock();
1507 
1508 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1509 		leaf = iter.level;
1510 		sptes[leaf] = iter.old_spte;
1511 	}
1512 
1513 	rcu_read_unlock();
1514 
1515 	return leaf;
1516 }
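/*
 * Usage sketch for kvm_tdp_mmu_get_walk() (annotation; inspect() is a
 * hypothetical helper, the in-tree caller is the MMIO walk in mmu.c):
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf, level;
 *
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	if (leaf < 0)
 *		return;
 *	for (level = root_level; level >= leaf; level--)
 *		inspect(sptes[level]);
 */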
1517