xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision dbe986bd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
58 			  bool shared);
59 
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62 	free_page((unsigned long)sp->spt);
63 	kmem_cache_free(mmu_page_header_cache, sp);
64 }
65 
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
70  * By only accessing TDP MMU page table memory in an RCU read critical
71  * section, and freeing it after a grace period, lockless access to that
72  * memory won't use it after it is freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 					       rcu_head);
78 
79 	tdp_mmu_free_sp(sp);
80 }
81 
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 			  bool shared)
84 {
85 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86 
87 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 		return;
89 
90 	WARN_ON(!root->tdp_mmu_page);
91 
92 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 	list_del_rcu(&root->link);
94 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95 
96 	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97 
98 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100 
101 /*
102  * Finds the next valid root after root (or the first valid root if root
103  * is NULL), takes a reference on it, and returns that next root. If root
104  * is not NULL, this thread should have already taken a reference on it, and
105  * that reference will be dropped. If no valid root is found, this
106  * function will return NULL.
107  */
108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
109 					      struct kvm_mmu_page *prev_root,
110 					      bool shared)
111 {
112 	struct kvm_mmu_page *next_root;
113 
114 	rcu_read_lock();
115 
116 	if (prev_root)
117 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
118 						  &prev_root->link,
119 						  typeof(*prev_root), link);
120 	else
121 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122 						   typeof(*next_root), link);
123 
124 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
125 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
126 				&next_root->link, typeof(*next_root), link);
127 
128 	rcu_read_unlock();
129 
130 	if (prev_root)
131 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
132 
133 	return next_root;
134 }
135 
136 /*
137  * Note: this iterator gets and puts references to the roots it iterates over.
138  * This makes it safe to release the MMU lock and yield within the loop, but
139  * if exiting the loop early, the caller must drop the reference to the most
140  * recent root. (Unless keeping a live reference is desirable.)
141  *
142  * If shared is set, this function is operating under the MMU lock in read
143  * mode. In the unlikely event that this thread must free a root, the lock
144  * will be temporarily dropped and reacquired in write mode.
145  */
146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
147 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
148 	     _root;							\
149 	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
150 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
151 		} else
152 
153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
154 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
155 				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
156 				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
157 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
158 		} else
159 
160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
161 						   int level)
162 {
163 	union kvm_mmu_page_role role;
164 
165 	role = vcpu->arch.mmu->mmu_role.base;
166 	role.level = level;
167 	role.direct = true;
168 	role.gpte_is_8_bytes = true;
169 	role.access = ACC_ALL;
170 
171 	return role;
172 }
173 
174 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
175 					       int level)
176 {
177 	struct kvm_mmu_page *sp;
178 
179 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
180 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
181 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
182 
183 	sp->role.word = page_role_for_level(vcpu, level).word;
184 	sp->gfn = gfn;
185 	sp->tdp_mmu_page = true;
186 
187 	trace_kvm_mmu_get_page(sp, true);
188 
189 	return sp;
190 }
191 
192 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
193 {
194 	union kvm_mmu_page_role role;
195 	struct kvm *kvm = vcpu->kvm;
196 	struct kvm_mmu_page *root;
197 
198 	lockdep_assert_held_write(&kvm->mmu_lock);
199 
200 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
201 
202 	/* Check for an existing root before allocating a new one. */
203 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
204 		if (root->role.word == role.word &&
205 		    kvm_tdp_mmu_get_root(kvm, root))
206 			goto out;
207 	}
208 
209 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
210 	refcount_set(&root->tdp_mmu_root_count, 1);
211 
212 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
213 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
214 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
215 
216 out:
217 	return __pa(root->spt);
218 }
219 
220 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
221 				u64 old_spte, u64 new_spte, int level,
222 				bool shared);
223 
224 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
225 {
226 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
227 		return;
228 
229 	if (is_accessed_spte(old_spte) &&
230 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
231 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
232 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
233 }
234 
235 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
236 					  u64 old_spte, u64 new_spte, int level)
237 {
238 	bool pfn_changed;
239 	struct kvm_memory_slot *slot;
240 
241 	if (level > PG_LEVEL_4K)
242 		return;
243 
244 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
245 
246 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
247 	    is_writable_pte(new_spte)) {
248 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
249 		mark_page_dirty_in_slot(kvm, slot, gfn);
250 	}
251 }
252 
253 /**
254  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
255  *
256  * @kvm: kvm instance
257  * @sp: the new page
258  * @shared: This operation may not be running under the exclusive use of
259  *	    the MMU lock and the operation must synchronize with other
260  *	    threads that might be adding or removing pages.
261  * @account_nx: This page replaces a NX large page and should be marked for
262  *		eventual reclaim.
263  */
264 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
265 			      bool shared, bool account_nx)
266 {
267 	if (shared)
268 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
269 	else
270 		lockdep_assert_held_write(&kvm->mmu_lock);
271 
272 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
273 	if (account_nx)
274 		account_huge_nx_page(kvm, sp);
275 
276 	if (shared)
277 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
278 }
279 
280 /**
281  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
282  *
283  * @kvm: kvm instance
284  * @sp: the page to be removed
285  * @shared: This operation may not be running under the exclusive use of
286  *	    the MMU lock and the operation must synchronize with other
287  *	    threads that might be adding or removing pages.
288  */
289 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
290 				bool shared)
291 {
292 	if (shared)
293 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
294 	else
295 		lockdep_assert_held_write(&kvm->mmu_lock);
296 
297 	list_del(&sp->link);
298 	if (sp->lpage_disallowed)
299 		unaccount_huge_nx_page(kvm, sp);
300 
301 	if (shared)
302 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
303 }
304 
305 /**
306  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
307  *
308  * @kvm: kvm instance
309  * @pt: the page removed from the paging structure
310  * @shared: This operation may not be running under the exclusive use
311  *	    of the MMU lock and the operation must synchronize with other
312  *	    threads that might be modifying SPTEs.
313  *
314  * Given a page table that has been removed from the TDP paging structure,
315  * iterates through the page table to clear SPTEs and free child page tables.
316  *
317  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
318  * protection. Since this thread removed it from the paging structure,
319  * this thread will be responsible for ensuring the page is freed. Hence the
320  * early rcu_dereferences in the function.
321  */
322 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
323 					bool shared)
324 {
325 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
326 	int level = sp->role.level;
327 	gfn_t base_gfn = sp->gfn;
328 	u64 old_child_spte;
329 	u64 *sptep;
330 	gfn_t gfn;
331 	int i;
332 
333 	trace_kvm_mmu_prepare_zap_page(sp);
334 
335 	tdp_mmu_unlink_page(kvm, sp, shared);
336 
337 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
338 		sptep = rcu_dereference(pt) + i;
339 		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
340 
341 		if (shared) {
342 			/*
343 			 * Set the SPTE to a nonpresent value that other
344 			 * threads will not overwrite. If the SPTE was
345 			 * already marked as removed then another thread
346 			 * handling a page fault could overwrite it, so
347 			 * set the SPTE until it is set from some other
348 			 * value to the removed SPTE value.
349 			 */
350 			for (;;) {
351 				old_child_spte = xchg(sptep, REMOVED_SPTE);
352 				if (!is_removed_spte(old_child_spte))
353 					break;
354 				cpu_relax();
355 			}
356 		} else {
357 			/*
358 			 * If the SPTE is not MMU-present, there is no backing
359 			 * page associated with the SPTE and so no side effects
360 			 * that need to be recorded, and exclusive ownership of
361 			 * mmu_lock ensures the SPTE can't be made present.
362 			 * Note, zapping MMIO SPTEs is also unnecessary as they
363 			 * are guarded by the memslots generation, not by being
364 			 * unreachable.
365 			 */
366 			old_child_spte = READ_ONCE(*sptep);
367 			if (!is_shadow_present_pte(old_child_spte))
368 				continue;
369 
370 			/*
371 			 * Marking the SPTE as a removed SPTE is not
372 			 * strictly necessary here as the MMU lock will
373 			 * stop other threads from concurrently modifying
374 			 * this SPTE. Using the removed SPTE value keeps
375 			 * the two branches consistent and simplifies
376 			 * the function.
377 			 */
378 			WRITE_ONCE(*sptep, REMOVED_SPTE);
379 		}
380 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
381 				    old_child_spte, REMOVED_SPTE, level,
382 				    shared);
383 	}
384 
385 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
386 					   KVM_PAGES_PER_HPAGE(level + 1));
387 
388 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
389 }
390 
391 /**
392  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
393  * @kvm: kvm instance
394  * @as_id: the address space of the paging structure the SPTE was a part of
395  * @gfn: the base GFN that was mapped by the SPTE
396  * @old_spte: The value of the SPTE before the change
397  * @new_spte: The value of the SPTE after the change
398  * @level: the level of the PT the SPTE is part of in the paging structure
399  * @shared: This operation may not be running under the exclusive use of
400  *	    the MMU lock and the operation must synchronize with other
401  *	    threads that might be modifying SPTEs.
402  *
403  * Handle bookkeeping that might result from the modification of a SPTE.
404  * This function must be called for all TDP SPTE modifications.
405  */
406 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
407 				  u64 old_spte, u64 new_spte, int level,
408 				  bool shared)
409 {
410 	bool was_present = is_shadow_present_pte(old_spte);
411 	bool is_present = is_shadow_present_pte(new_spte);
412 	bool was_leaf = was_present && is_last_spte(old_spte, level);
413 	bool is_leaf = is_present && is_last_spte(new_spte, level);
414 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
415 
416 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
417 	WARN_ON(level < PG_LEVEL_4K);
418 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
419 
420 	/*
421 	 * If this warning were to trigger it would indicate that there was a
422 	 * missing MMU notifier or a race with some notifier handler.
423 	 * A present, leaf SPTE should never be directly replaced with another
424 	 * present leaf SPTE pointing to a different PFN. A notifier handler
425 	 * should be zapping the SPTE before the main MM's page table is
426 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
427 	 * thread before replacement.
428 	 */
429 	if (was_leaf && is_leaf && pfn_changed) {
430 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
431 		       "SPTE with another present leaf SPTE mapping a\n"
432 		       "different PFN!\n"
433 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
434 		       as_id, gfn, old_spte, new_spte, level);
435 
436 		/*
437 		 * Crash the host to prevent error propagation and guest data
438 		 * corruption.
439 		 */
440 		BUG();
441 	}
442 
443 	if (old_spte == new_spte)
444 		return;
445 
446 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
447 
448 	if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
449 		if (is_large_pte(old_spte))
450 			atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
451 		else
452 			atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
453 	}
454 
455 	/*
456 	 * The only times a SPTE should be changed from a non-present to
457 	 * non-present state is when an MMIO entry is installed/modified/
458 	 * removed. In that case, there is nothing to do here.
459 	 */
460 	if (!was_present && !is_present) {
461 		/*
462 		 * If this change does not involve a MMIO SPTE or removed SPTE,
463 		 * it is unexpected. Log the change, though it should not
464 		 * impact the guest since both the former and current SPTEs
465 		 * are nonpresent.
466 		 */
467 		if (WARN_ON(!is_mmio_spte(old_spte) &&
468 			    !is_mmio_spte(new_spte) &&
469 			    !is_removed_spte(new_spte)))
470 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
471 			       "should not be replaced with another,\n"
472 			       "different nonpresent SPTE, unless one or both\n"
473 			       "are MMIO SPTEs, or the new SPTE is\n"
474 			       "a temporary removed SPTE.\n"
475 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
476 			       as_id, gfn, old_spte, new_spte, level);
477 		return;
478 	}
479 
480 
481 	if (was_leaf && is_dirty_spte(old_spte) &&
482 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
483 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
484 
485 	/*
486 	 * Recursively handle child PTs if the change removed a subtree from
487 	 * the paging structure.
488 	 */
489 	if (was_present && !was_leaf && (pfn_changed || !is_present))
490 		handle_removed_tdp_mmu_page(kvm,
491 				spte_to_child_pt(old_spte, level), shared);
492 }
493 
494 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
495 				u64 old_spte, u64 new_spte, int level,
496 				bool shared)
497 {
498 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
499 			      shared);
500 	handle_changed_spte_acc_track(old_spte, new_spte, level);
501 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
502 				      new_spte, level);
503 }
504 
505 /*
506  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
507  * and handle the associated bookkeeping, but do not mark the page dirty
508  * in KVM's dirty bitmaps.
509  *
510  * @kvm: kvm instance
511  * @iter: a tdp_iter instance currently on the SPTE that should be set
512  * @new_spte: The value the SPTE should be set to
513  * Returns: true if the SPTE was set, false if it was not. If false is returned,
514  *	    this function will have no side-effects.
515  */
516 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
517 							struct tdp_iter *iter,
518 							u64 new_spte)
519 {
520 	lockdep_assert_held_read(&kvm->mmu_lock);
521 
522 	/*
523 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
524 	 * may modify it.
525 	 */
526 	if (is_removed_spte(iter->old_spte))
527 		return false;
528 
529 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
530 		      new_spte) != iter->old_spte)
531 		return false;
532 
533 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
534 			      new_spte, iter->level, true);
535 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
536 
537 	return true;
538 }
539 
540 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
541 					   struct tdp_iter *iter,
542 					   u64 new_spte)
543 {
544 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
545 		return false;
546 
547 	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
548 				      iter->old_spte, new_spte, iter->level);
549 	return true;
550 }
551 
552 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
553 					   struct tdp_iter *iter)
554 {
555 	/*
556 	 * Freeze the SPTE by setting it to a special,
557 	 * non-present value. This will stop other threads from
558 	 * immediately installing a present entry in its place
559 	 * before the TLBs are flushed.
560 	 */
561 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
562 		return false;
563 
564 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
565 					   KVM_PAGES_PER_HPAGE(iter->level));
566 
567 	/*
568 	 * No other thread can overwrite the removed SPTE as they
569 	 * must either wait on the MMU lock or use
570 	 * tdp_mmu_set_spte_atomic which will not overwrite the
571 	 * special removed SPTE value. No bookkeeping is needed
572 	 * here since the SPTE is going from non-present
573 	 * to non-present.
574 	 */
575 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
576 
577 	return true;
578 }
579 
580 
581 /*
582  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
583  * @kvm: kvm instance
584  * @iter: a tdp_iter instance currently on the SPTE that should be set
585  * @new_spte: The value the SPTE should be set to
586  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
587  *		      of the page. Should be set unless handling an MMU
588  *		      notifier for access tracking. Leaving record_acc_track
589  *		      unset in that case prevents page accesses from being
590  *		      double counted.
591  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
592  *		      appropriate for the change being made. Should be set
593  *		      unless performing certain dirty logging operations.
594  *		      Leaving record_dirty_log unset in that case prevents page
595  *		      writes from being double counted.
596  */
597 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
598 				      u64 new_spte, bool record_acc_track,
599 				      bool record_dirty_log)
600 {
601 	lockdep_assert_held_write(&kvm->mmu_lock);
602 
603 	/*
604 	 * No thread should be using this function to set SPTEs to the
605 	 * temporary removed SPTE value.
606 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
607 	 * should be used. If operating under the MMU lock in write mode, the
608 	 * use of the removed SPTE should not be necessary.
609 	 */
610 	WARN_ON(is_removed_spte(iter->old_spte));
611 
612 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
613 
614 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
615 			      new_spte, iter->level, false);
616 	if (record_acc_track)
617 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
618 					      iter->level);
619 	if (record_dirty_log)
620 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
621 					      iter->old_spte, new_spte,
622 					      iter->level);
623 }
624 
625 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
626 				    u64 new_spte)
627 {
628 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
629 }
630 
631 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
632 						 struct tdp_iter *iter,
633 						 u64 new_spte)
634 {
635 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
636 }
637 
638 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
639 						 struct tdp_iter *iter,
640 						 u64 new_spte)
641 {
642 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
643 }
644 
645 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
646 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
647 
648 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
649 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
650 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
651 		    !is_last_spte(_iter.old_spte, _iter.level))		\
652 			continue;					\
653 		else
654 
655 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
656 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
657 			 _mmu->shadow_root_level, _start, _end)
658 
659 /*
660  * Yield if the MMU lock is contended or this thread needs to return control
661  * to the scheduler.
662  *
663  * If this function should yield and flush is set, it will perform a remote
664  * TLB flush before yielding.
665  *
666  * If this function yields, it will also reset the tdp_iter's walk over the
667  * paging structure and the calling function should skip to the next
668  * iteration to allow the iterator to continue its traversal from the
669  * paging structure root.
670  *
671  * Return true if this function yielded and the iterator's traversal was reset.
672  * Return false if a yield was not needed.
673  */
674 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
675 					     struct tdp_iter *iter, bool flush,
676 					     bool shared)
677 {
678 	/* Ensure forward progress has been made before yielding. */
679 	if (iter->next_last_level_gfn == iter->yielded_gfn)
680 		return false;
681 
682 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
683 		rcu_read_unlock();
684 
685 		if (flush)
686 			kvm_flush_remote_tlbs(kvm);
687 
688 		if (shared)
689 			cond_resched_rwlock_read(&kvm->mmu_lock);
690 		else
691 			cond_resched_rwlock_write(&kvm->mmu_lock);
692 
693 		rcu_read_lock();
694 
695 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
696 
697 		tdp_iter_restart(iter);
698 
699 		return true;
700 	}
701 
702 	return false;
703 }
704 
705 /*
706  * Tears down the mappings for the range of gfns, [start, end), and frees the
707  * non-root pages mapping GFNs strictly within that range. Returns true if
708  * SPTEs have been cleared and a TLB flush is needed before releasing the
709  * MMU lock.
710  *
711  * If can_yield is true, will release the MMU lock and reschedule if the
712  * scheduler needs the CPU or there is contention on the MMU lock. If this
713  * function cannot yield, it will not release the MMU lock or reschedule and
714  * the caller must ensure it does not supply too large a GFN range, or the
715  * operation can cause a soft lockup.
716  *
717  * If shared is true, this thread holds the MMU lock in read mode and must
718  * account for the possibility that other threads are modifying the paging
719  * structures concurrently. If shared is false, this thread should hold the
720  * MMU lock in write mode.
721  */
722 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
723 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
724 			  bool shared)
725 {
726 	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
727 	bool zap_all = (start == 0 && end >= max_gfn_host);
728 	struct tdp_iter iter;
729 
730 	/*
731 	 * No need to try to step down in the iterator when zapping all SPTEs,
732 	 * zapping the top-level non-leaf SPTEs will recurse on their children.
733 	 */
734 	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
735 
736 	/*
737 	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
738 	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
739 	 * and so KVM will never install a SPTE for such addresses.
740 	 */
741 	end = min(end, max_gfn_host);
742 
743 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
744 
745 	rcu_read_lock();
746 
747 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
748 				   min_level, start, end) {
749 retry:
750 		if (can_yield &&
751 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
752 			flush = false;
753 			continue;
754 		}
755 
756 		if (!is_shadow_present_pte(iter.old_spte))
757 			continue;
758 
759 		/*
760 		 * If this is a non-last-level SPTE that covers a larger range
761 		 * than should be zapped, continue, and zap the mappings at a
762 		 * lower level, except when zapping all SPTEs.
763 		 */
764 		if (!zap_all &&
765 		    (iter.gfn < start ||
766 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
767 		    !is_last_spte(iter.old_spte, iter.level))
768 			continue;
769 
770 		if (!shared) {
771 			tdp_mmu_set_spte(kvm, &iter, 0);
772 			flush = true;
773 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
774 			/*
775 			 * The iter must explicitly re-read the SPTE because
776 			 * the atomic cmpxchg failed.
777 			 */
778 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
779 			goto retry;
780 		}
781 	}
782 
783 	rcu_read_unlock();
784 	return flush;
785 }
786 
787 /*
788  * Tears down the mappings for the range of gfns, [start, end), and frees the
789  * non-root pages mapping GFNs strictly within that range. Returns true if
790  * SPTEs have been cleared and a TLB flush is needed before releasing the
791  * MMU lock.
792  *
793  * If shared is true, this thread holds the MMU lock in read mode and must
794  * account for the possibility that other threads are modifying the paging
795  * structures concurrently. If shared is false, this thread should hold the
796  * MMU in write mode.
797  */
798 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
799 				 gfn_t end, bool can_yield, bool flush,
800 				 bool shared)
801 {
802 	struct kvm_mmu_page *root;
803 
804 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
805 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
806 				      shared);
807 
808 	return flush;
809 }
810 
811 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
812 {
813 	bool flush = false;
814 	int i;
815 
816 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
817 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
818 						  flush, false);
819 
820 	if (flush)
821 		kvm_flush_remote_tlbs(kvm);
822 }
823 
824 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
825 						  struct kvm_mmu_page *prev_root)
826 {
827 	struct kvm_mmu_page *next_root;
828 
829 	if (prev_root)
830 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
831 						  &prev_root->link,
832 						  typeof(*prev_root), link);
833 	else
834 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
835 						   typeof(*next_root), link);
836 
837 	while (next_root && !(next_root->role.invalid &&
838 			      refcount_read(&next_root->tdp_mmu_root_count)))
839 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
840 						  &next_root->link,
841 						  typeof(*next_root), link);
842 
843 	return next_root;
844 }
845 
846 /*
847  * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
848  * invalidated root, they will not be freed until this function drops the
849  * reference. Before dropping that reference, tear down the paging
850  * structure so that whichever thread does drop the last reference
851  * only has to do a trivial amount of work. Since the roots are invalid,
852  * no new SPTEs should be created under them.
853  */
854 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
855 {
856 	struct kvm_mmu_page *next_root;
857 	struct kvm_mmu_page *root;
858 	bool flush = false;
859 
860 	lockdep_assert_held_read(&kvm->mmu_lock);
861 
862 	rcu_read_lock();
863 
864 	root = next_invalidated_root(kvm, NULL);
865 
866 	while (root) {
867 		next_root = next_invalidated_root(kvm, root);
868 
869 		rcu_read_unlock();
870 
871 		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
872 
873 		/*
874 		 * Put the reference acquired in
875 		 * kvm_tdp_mmu_invalidate_roots
876 		 */
877 		kvm_tdp_mmu_put_root(kvm, root, true);
878 
879 		root = next_root;
880 
881 		rcu_read_lock();
882 	}
883 
884 	rcu_read_unlock();
885 
886 	if (flush)
887 		kvm_flush_remote_tlbs(kvm);
888 }
889 
890 /*
891  * Mark each TDP MMU root as invalid so that other threads
892  * will drop their references and allow the root count to
893  * go to 0.
894  *
895  * Also take a reference on all roots so that this thread
896  * can do the bulk of the work required to free the roots
897  * once they are invalidated. Without this reference, a
898  * vCPU thread might drop the last reference to a root and
899  * get stuck with tearing down the entire paging structure.
900  *
901  * Roots which have a zero refcount should be skipped as
902  * they're already being torn down.
903  * Already invalid roots should be referenced again so that
904  * they aren't freed before kvm_tdp_mmu_zap_all_fast is
905  * done with them.
906  *
907  * This has essentially the same effect for the TDP MMU
908  * as updating mmu_valid_gen does for the shadow MMU.
909  */
910 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
911 {
912 	struct kvm_mmu_page *root;
913 
914 	lockdep_assert_held_write(&kvm->mmu_lock);
915 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
916 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
917 			root->role.invalid = true;
918 }
919 
920 /*
921  * Installs a last-level SPTE to handle a TDP page fault.
922  * (NPT/EPT violation/misconfiguration)
923  */
924 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
925 					  int map_writable,
926 					  struct tdp_iter *iter,
927 					  kvm_pfn_t pfn, bool prefault)
928 {
929 	u64 new_spte;
930 	int ret = RET_PF_FIXED;
931 	int make_spte_ret = 0;
932 
933 	if (unlikely(is_noslot_pfn(pfn)))
934 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
935 	else
936 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
937 					 pfn, iter->old_spte, prefault, true,
938 					 map_writable, !shadow_accessed_mask,
939 					 &new_spte);
940 
941 	if (new_spte == iter->old_spte)
942 		ret = RET_PF_SPURIOUS;
943 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
944 		return RET_PF_RETRY;
945 
946 	/*
947 	 * If the page fault was caused by a write but the page is write
948 	 * protected, emulation is needed. If the emulation was skipped,
949 	 * the vCPU would have the same fault again.
950 	 */
951 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
952 		if (write)
953 			ret = RET_PF_EMULATE;
954 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
955 	}
956 
957 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
958 	if (unlikely(is_mmio_spte(new_spte))) {
959 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
960 				     new_spte);
961 		ret = RET_PF_EMULATE;
962 	} else {
963 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
964 				       rcu_dereference(iter->sptep));
965 	}
966 
967 	/*
968 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
969 	 * consistent with legacy MMU behavior.
970 	 */
971 	if (ret != RET_PF_SPURIOUS)
972 		vcpu->stat.pf_fixed++;
973 
974 	return ret;
975 }
976 
977 /*
978  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
979  * page tables and SPTEs to translate the faulting guest physical address.
980  */
981 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
982 		    int map_writable, int max_level, kvm_pfn_t pfn,
983 		    bool prefault)
984 {
985 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
986 	bool write = error_code & PFERR_WRITE_MASK;
987 	bool exec = error_code & PFERR_FETCH_MASK;
988 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
989 	struct kvm_mmu *mmu = vcpu->arch.mmu;
990 	struct tdp_iter iter;
991 	struct kvm_mmu_page *sp;
992 	u64 *child_pt;
993 	u64 new_spte;
994 	int ret;
995 	gfn_t gfn = gpa >> PAGE_SHIFT;
996 	int level;
997 	int req_level;
998 
999 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
1000 					huge_page_disallowed, &req_level);
1001 
1002 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
1003 
1004 	rcu_read_lock();
1005 
1006 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1007 		if (nx_huge_page_workaround_enabled)
1008 			disallowed_hugepage_adjust(iter.old_spte, gfn,
1009 						   iter.level, &pfn, &level);
1010 
1011 		if (iter.level == level)
1012 			break;
1013 
1014 		/*
1015 		 * If there is an SPTE mapping a large page at a higher level
1016 		 * than the target, that SPTE must be cleared and replaced
1017 		 * with a non-leaf SPTE.
1018 		 */
1019 		if (is_shadow_present_pte(iter.old_spte) &&
1020 		    is_large_pte(iter.old_spte)) {
1021 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1022 				break;
1023 
1024 			/*
1025 			 * The iter must explicitly re-read the spte here
1026 			 * because the new value informs the !present
1027 			 * path below.
1028 			 */
1029 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1030 		}
1031 
1032 		if (!is_shadow_present_pte(iter.old_spte)) {
1033 			/*
1034 			 * If SPTE has been frozen by another thread, just
1035 			 * give up and retry, avoiding unnecessary page table
1036 			 * allocation and free.
1037 			 */
1038 			if (is_removed_spte(iter.old_spte))
1039 				break;
1040 
1041 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1042 			child_pt = sp->spt;
1043 
1044 			new_spte = make_nonleaf_spte(child_pt,
1045 						     !shadow_accessed_mask);
1046 
1047 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
1048 						    new_spte)) {
1049 				tdp_mmu_link_page(vcpu->kvm, sp, true,
1050 						  huge_page_disallowed &&
1051 						  req_level >= iter.level);
1052 
1053 				trace_kvm_mmu_get_page(sp, true);
1054 			} else {
1055 				tdp_mmu_free_sp(sp);
1056 				break;
1057 			}
1058 		}
1059 	}
1060 
1061 	if (iter.level != level) {
1062 		rcu_read_unlock();
1063 		return RET_PF_RETRY;
1064 	}
1065 
1066 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1067 					      pfn, prefault);
1068 	rcu_read_unlock();
1069 
1070 	return ret;
1071 }
1072 
1073 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1074 				 bool flush)
1075 {
1076 	struct kvm_mmu_page *root;
1077 
1078 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1079 		flush |= zap_gfn_range(kvm, root, range->start, range->end,
1080 				       range->may_block, flush, false);
1081 
1082 	return flush;
1083 }
1084 
1085 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1086 			      struct kvm_gfn_range *range);
1087 
1088 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1089 						   struct kvm_gfn_range *range,
1090 						   tdp_handler_t handler)
1091 {
1092 	struct kvm_mmu_page *root;
1093 	struct tdp_iter iter;
1094 	bool ret = false;
1095 
1096 	rcu_read_lock();
1097 
1098 	/*
1099 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1100 	 * into this helper allow blocking; it'd be dead, wasteful code.
1101 	 */
1102 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1103 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1104 			ret |= handler(kvm, &iter, range);
1105 	}
1106 
1107 	rcu_read_unlock();
1108 
1109 	return ret;
1110 }
1111 
1112 /*
1113  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1114  * if any of the GFNs in the range have been accessed.
1115  */
1116 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1117 			  struct kvm_gfn_range *range)
1118 {
1119 	u64 new_spte = 0;
1120 
1121 	/* If we have a non-accessed entry we don't need to change the pte. */
1122 	if (!is_accessed_spte(iter->old_spte))
1123 		return false;
1124 
1125 	new_spte = iter->old_spte;
1126 
1127 	if (spte_ad_enabled(new_spte)) {
1128 		new_spte &= ~shadow_accessed_mask;
1129 	} else {
1130 		/*
1131 		 * Capture the dirty status of the page, so that it doesn't get
1132 		 * lost when the SPTE is marked for access tracking.
1133 		 */
1134 		if (is_writable_pte(new_spte))
1135 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1136 
1137 		new_spte = mark_spte_for_access_track(new_spte);
1138 	}
1139 
1140 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1141 
1142 	return true;
1143 }
1144 
1145 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1146 {
1147 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1148 }
1149 
1150 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1151 			 struct kvm_gfn_range *range)
1152 {
1153 	return is_accessed_spte(iter->old_spte);
1154 }
1155 
1156 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1157 {
1158 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1159 }
1160 
1161 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1162 			 struct kvm_gfn_range *range)
1163 {
1164 	u64 new_spte;
1165 
1166 	/* Huge pages aren't expected to be modified without first being zapped. */
1167 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1168 
1169 	if (iter->level != PG_LEVEL_4K ||
1170 	    !is_shadow_present_pte(iter->old_spte))
1171 		return false;
1172 
1173 	/*
1174 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1175 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1176 	 * invariant that the PFN of a present * leaf SPTE can never change.
1177 	 * See __handle_changed_spte().
1178 	 */
1179 	tdp_mmu_set_spte(kvm, iter, 0);
1180 
1181 	if (!pte_write(range->pte)) {
1182 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1183 								  pte_pfn(range->pte));
1184 
1185 		tdp_mmu_set_spte(kvm, iter, new_spte);
1186 	}
1187 
1188 	return true;
1189 }
1190 
1191 /*
1192  * Handle the changed_pte MMU notifier for the TDP MMU.
1193  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1194  * notifier.
1195  * Returns non-zero if a flush is needed before releasing the MMU lock.
1196  */
1197 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1198 {
1199 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1200 
1201 	/* FIXME: return 'flush' instead of flushing here. */
1202 	if (flush)
1203 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1204 
1205 	return false;
1206 }
1207 
1208 /*
1209  * Remove write access from all SPTEs at or above min_level that map GFNs
1210  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1211  * be flushed.
1212  */
1213 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1214 			     gfn_t start, gfn_t end, int min_level)
1215 {
1216 	struct tdp_iter iter;
1217 	u64 new_spte;
1218 	bool spte_set = false;
1219 
1220 	rcu_read_lock();
1221 
1222 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1223 
1224 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1225 				   min_level, start, end) {
1226 retry:
1227 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1228 			continue;
1229 
1230 		if (!is_shadow_present_pte(iter.old_spte) ||
1231 		    !is_last_spte(iter.old_spte, iter.level) ||
1232 		    !(iter.old_spte & PT_WRITABLE_MASK))
1233 			continue;
1234 
1235 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1236 
1237 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1238 							  new_spte)) {
1239 			/*
1240 			 * The iter must explicitly re-read the SPTE because
1241 			 * the atomic cmpxchg failed.
1242 			 */
1243 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1244 			goto retry;
1245 		}
1246 		spte_set = true;
1247 	}
1248 
1249 	rcu_read_unlock();
1250 	return spte_set;
1251 }
1252 
1253 /*
1254  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1255  * only affect leaf SPTEs down to min_level.
1256  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1257  */
1258 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1259 			     int min_level)
1260 {
1261 	struct kvm_mmu_page *root;
1262 	bool spte_set = false;
1263 
1264 	lockdep_assert_held_read(&kvm->mmu_lock);
1265 
1266 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1267 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1268 			     slot->base_gfn + slot->npages, min_level);
1269 
1270 	return spte_set;
1271 }
1272 
1273 /*
1274  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1275  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1276  * If AD bits are not enabled, this will require clearing the writable bit on
1277  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1278  * be flushed.
1279  */
1280 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1281 			   gfn_t start, gfn_t end)
1282 {
1283 	struct tdp_iter iter;
1284 	u64 new_spte;
1285 	bool spte_set = false;
1286 
1287 	rcu_read_lock();
1288 
1289 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1290 retry:
1291 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1292 			continue;
1293 
1294 		if (spte_ad_need_write_protect(iter.old_spte)) {
1295 			if (is_writable_pte(iter.old_spte))
1296 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1297 			else
1298 				continue;
1299 		} else {
1300 			if (iter.old_spte & shadow_dirty_mask)
1301 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1302 			else
1303 				continue;
1304 		}
1305 
1306 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1307 							  new_spte)) {
1308 			/*
1309 			 * The iter must explicitly re-read the SPTE because
1310 			 * the atomic cmpxchg failed.
1311 			 */
1312 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1313 			goto retry;
1314 		}
1315 		spte_set = true;
1316 	}
1317 
1318 	rcu_read_unlock();
1319 	return spte_set;
1320 }
1321 
1322 /*
1323  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1324  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1325  * If AD bits are not enabled, this will require clearing the writable bit on
1326  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1327  * be flushed.
1328  */
1329 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1330 {
1331 	struct kvm_mmu_page *root;
1332 	bool spte_set = false;
1333 
1334 	lockdep_assert_held_read(&kvm->mmu_lock);
1335 
1336 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1337 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1338 				slot->base_gfn + slot->npages);
1339 
1340 	return spte_set;
1341 }
1342 
1343 /*
1344  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1345  * set in mask, starting at gfn. The given memslot is expected to contain all
1346  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1347  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1348  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1349  */
1350 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1351 				  gfn_t gfn, unsigned long mask, bool wrprot)
1352 {
1353 	struct tdp_iter iter;
1354 	u64 new_spte;
1355 
1356 	rcu_read_lock();
1357 
1358 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1359 				    gfn + BITS_PER_LONG) {
1360 		if (!mask)
1361 			break;
1362 
1363 		if (iter.level > PG_LEVEL_4K ||
1364 		    !(mask & (1UL << (iter.gfn - gfn))))
1365 			continue;
1366 
1367 		mask &= ~(1UL << (iter.gfn - gfn));
1368 
1369 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1370 			if (is_writable_pte(iter.old_spte))
1371 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1372 			else
1373 				continue;
1374 		} else {
1375 			if (iter.old_spte & shadow_dirty_mask)
1376 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1377 			else
1378 				continue;
1379 		}
1380 
1381 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1382 	}
1383 
1384 	rcu_read_unlock();
1385 }
1386 
1387 /*
1388  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1389  * set in mask, starting at gfn. The given memslot is expected to contain all
1390  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1391  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1392  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1393  */
1394 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1395 				       struct kvm_memory_slot *slot,
1396 				       gfn_t gfn, unsigned long mask,
1397 				       bool wrprot)
1398 {
1399 	struct kvm_mmu_page *root;
1400 
1401 	lockdep_assert_held_write(&kvm->mmu_lock);
1402 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1403 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1404 }
1405 
1406 /*
1407  * Clear leaf entries which could be replaced by large mappings, for
1408  * GFNs within the slot.
1409  */
1410 static bool zap_collapsible_spte_range(struct kvm *kvm,
1411 				       struct kvm_mmu_page *root,
1412 				       const struct kvm_memory_slot *slot,
1413 				       bool flush)
1414 {
1415 	gfn_t start = slot->base_gfn;
1416 	gfn_t end = start + slot->npages;
1417 	struct tdp_iter iter;
1418 	kvm_pfn_t pfn;
1419 
1420 	rcu_read_lock();
1421 
1422 	tdp_root_for_each_pte(iter, root, start, end) {
1423 retry:
1424 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1425 			flush = false;
1426 			continue;
1427 		}
1428 
1429 		if (!is_shadow_present_pte(iter.old_spte) ||
1430 		    !is_last_spte(iter.old_spte, iter.level))
1431 			continue;
1432 
1433 		pfn = spte_to_pfn(iter.old_spte);
1434 		if (kvm_is_reserved_pfn(pfn) ||
1435 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1436 							    pfn, PG_LEVEL_NUM))
1437 			continue;
1438 
1439 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1440 			/*
1441 			 * The iter must explicitly re-read the SPTE because
1442 			 * the atomic cmpxchg failed.
1443 			 */
1444 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1445 			goto retry;
1446 		}
1447 		flush = true;
1448 	}
1449 
1450 	rcu_read_unlock();
1451 
1452 	return flush;
1453 }
1454 
1455 /*
1456  * Clear non-leaf entries (and free associated page tables) which could
1457  * be replaced by large mappings, for GFNs within the slot.
1458  */
1459 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1460 				       const struct kvm_memory_slot *slot,
1461 				       bool flush)
1462 {
1463 	struct kvm_mmu_page *root;
1464 
1465 	lockdep_assert_held_read(&kvm->mmu_lock);
1466 
1467 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1468 		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1469 
1470 	return flush;
1471 }
1472 
1473 /*
1474  * Removes write access on the last level SPTE mapping this GFN and unsets the
1475  * MMU-writable bit to ensure future writes continue to be intercepted.
1476  * Returns true if an SPTE was set and a TLB flush is needed.
1477  */
1478 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1479 			      gfn_t gfn, int min_level)
1480 {
1481 	struct tdp_iter iter;
1482 	u64 new_spte;
1483 	bool spte_set = false;
1484 
1485 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1486 
1487 	rcu_read_lock();
1488 
1489 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1490 				   min_level, gfn, gfn + 1) {
1491 		if (!is_shadow_present_pte(iter.old_spte) ||
1492 		    !is_last_spte(iter.old_spte, iter.level))
1493 			continue;
1494 
1495 		if (!is_writable_pte(iter.old_spte))
1496 			break;
1497 
1498 		new_spte = iter.old_spte &
1499 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1500 
1501 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1502 		spte_set = true;
1503 	}
1504 
1505 	rcu_read_unlock();
1506 
1507 	return spte_set;
1508 }
1509 
1510 /*
1511  * Removes write access on the last level SPTE mapping this GFN and unsets the
1512  * MMU-writable bit to ensure future writes continue to be intercepted.
1513  * Returns true if an SPTE was set and a TLB flush is needed.
1514  */
1515 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1516 				   struct kvm_memory_slot *slot, gfn_t gfn,
1517 				   int min_level)
1518 {
1519 	struct kvm_mmu_page *root;
1520 	bool spte_set = false;
1521 
1522 	lockdep_assert_held_write(&kvm->mmu_lock);
1523 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1524 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1525 
1526 	return spte_set;
1527 }
1528 
1529 /*
1530  * Return the level of the lowest level SPTE added to sptes.
1531  * That SPTE may be non-present.
1532  */
1533 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1534 			 int *root_level)
1535 {
1536 	struct tdp_iter iter;
1537 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1538 	gfn_t gfn = addr >> PAGE_SHIFT;
1539 	int leaf = -1;
1540 
1541 	*root_level = vcpu->arch.mmu->shadow_root_level;
1542 
1543 	rcu_read_lock();
1544 
1545 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1546 		leaf = iter.level;
1547 		sptes[leaf] = iter.old_spte;
1548 	}
1549 
1550 	rcu_read_unlock();
1551 
1552 	return leaf;
1553 }
1554