xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 6d425d7c)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
58 			  bool shared);
59 
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62 	free_page((unsigned long)sp->spt);
63 	kmem_cache_free(mmu_page_header_cache, sp);
64 }
65 
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
70  * Because TDP MMU page table memory is only accessed in RCU read-side
71  * critical sections and is only freed after an RCU grace period, lockless
72  * walkers will never use the memory after it has been freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 					       rcu_head);
78 
79 	tdp_mmu_free_sp(sp);
80 }
81 
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 			  bool shared)
84 {
85 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86 
87 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 		return;
89 
90 	WARN_ON(!root->tdp_mmu_page);
91 
92 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 	list_del_rcu(&root->link);
94 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95 
96 	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97 
98 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100 
101 /*
102  * Finds the next valid root after prev_root (or the first valid root if
103  * prev_root is NULL), takes a reference on it, and returns that next root.
104  * If prev_root is not NULL, this thread should have already taken a
105  * reference on it, and that reference will be dropped. If no valid root
106  * is found, this function will return NULL.
107  */
108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
109 					      struct kvm_mmu_page *prev_root,
110 					      bool shared)
111 {
112 	struct kvm_mmu_page *next_root;
113 
114 	rcu_read_lock();
115 
116 	if (prev_root)
117 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
118 						  &prev_root->link,
119 						  typeof(*prev_root), link);
120 	else
121 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122 						   typeof(*next_root), link);
123 
124 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
125 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
126 				&next_root->link, typeof(*next_root), link);
127 
128 	rcu_read_unlock();
129 
130 	if (prev_root)
131 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
132 
133 	return next_root;
134 }
135 
136 /*
137  * Note: this iterator gets and puts references to the roots it iterates over.
138  * This makes it safe to release the MMU lock and yield within the loop, but
139  * if exiting the loop early, the caller must drop the reference to the most
140  * recent root. (Unless keeping a live reference is desirable.)
141  *
142  * If shared is set, this function is operating under the MMU lock in read
143  * mode. In the unlikely event that this thread must free a root, the lock
144  * will be temporarily dropped and reacquired in write mode.
145  */
146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
147 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
148 	     _root;							\
149 	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
150 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
151 		} else
152 
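/*
 * Editor's illustrative sketch (not part of the original file): a caller
 * that exits the loop above early must drop the reference the iterator
 * still holds on the current root, per the note above.  The invalid-root
 * predicate is just an example; assumes mmu_lock is held for write
 * (shared == false).
 */
static bool __maybe_unused example_break_out_of_root_iter(struct kvm *kvm,
							   int as_id)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) {
		if (root->role.invalid) {
			/* Exiting early: put the reference taken by the iterator. */
			kvm_tdp_mmu_put_root(kvm, root, false);
			return true;
		}
	}

	return false;
}
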
153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
154 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
155 				lockdep_is_held_type(&_kvm->mmu_lock, 0) ||	\
156 				lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock))	\
157 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
158 		} else
159 
160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
161 						   int level)
162 {
163 	union kvm_mmu_page_role role;
164 
165 	role = vcpu->arch.mmu->mmu_role.base;
166 	role.level = level;
167 	role.direct = true;
168 	role.gpte_is_8_bytes = true;
169 	role.access = ACC_ALL;
170 	role.ad_disabled = !shadow_accessed_mask;
171 
172 	return role;
173 }
174 
175 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
176 					       int level)
177 {
178 	struct kvm_mmu_page *sp;
179 
180 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
181 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
182 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
183 
184 	sp->role.word = page_role_for_level(vcpu, level).word;
185 	sp->gfn = gfn;
186 	sp->tdp_mmu_page = true;
187 
188 	trace_kvm_mmu_get_page(sp, true);
189 
190 	return sp;
191 }
192 
193 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
194 {
195 	union kvm_mmu_page_role role;
196 	struct kvm *kvm = vcpu->kvm;
197 	struct kvm_mmu_page *root;
198 
199 	lockdep_assert_held_write(&kvm->mmu_lock);
200 
201 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
202 
203 	/* Check for an existing root before allocating a new one. */
204 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
205 		if (root->role.word == role.word &&
206 		    kvm_tdp_mmu_get_root(kvm, root))
207 			goto out;
208 	}
209 
210 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
211 	refcount_set(&root->tdp_mmu_root_count, 1);
212 
213 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
214 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
215 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
216 
217 out:
218 	return __pa(root->spt);
219 }
220 
221 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
222 				u64 old_spte, u64 new_spte, int level,
223 				bool shared);
224 
225 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
226 {
227 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
228 		return;
229 
230 	if (is_accessed_spte(old_spte) &&
231 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
232 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
233 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
234 }
235 
236 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
237 					  u64 old_spte, u64 new_spte, int level)
238 {
239 	bool pfn_changed;
240 	struct kvm_memory_slot *slot;
241 
242 	if (level > PG_LEVEL_4K)
243 		return;
244 
245 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
246 
247 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
248 	    is_writable_pte(new_spte)) {
249 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
250 		mark_page_dirty_in_slot(kvm, slot, gfn);
251 	}
252 }
253 
254 /**
255  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
256  *
257  * @kvm: kvm instance
258  * @sp: the new page
259  * @account_nx: This page replaces an NX large page and should be marked for
260  *		eventual reclaim.
261  */
262 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
263 			      bool account_nx)
264 {
265 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
266 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
267 	if (account_nx)
268 		account_huge_nx_page(kvm, sp);
269 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
270 }
271 
272 /**
273  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
274  *
275  * @kvm: kvm instance
276  * @sp: the page to be removed
277  * @shared: This operation may not be running under the exclusive use of
278  *	    the MMU lock and the operation must synchronize with other
279  *	    threads that might be adding or removing pages.
280  */
281 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
282 				bool shared)
283 {
284 	if (shared)
285 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
286 	else
287 		lockdep_assert_held_write(&kvm->mmu_lock);
288 
289 	list_del(&sp->link);
290 	if (sp->lpage_disallowed)
291 		unaccount_huge_nx_page(kvm, sp);
292 
293 	if (shared)
294 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
295 }
296 
297 /**
298  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
299  *
300  * @kvm: kvm instance
301  * @pt: the page removed from the paging structure
302  * @shared: This operation may not be running under the exclusive use
303  *	    of the MMU lock and the operation must synchronize with other
304  *	    threads that might be modifying SPTEs.
305  *
306  * Given a page table that has been removed from the TDP paging structure,
307  * iterates through the page table to clear SPTEs and free child page tables.
308  *
309  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
310  * protection. Since this thread removed it from the paging structure,
311  * this thread will be responsible for ensuring the page is freed. Hence the
312  * early rcu_dereferences in the function.
313  */
314 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
315 					bool shared)
316 {
317 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
318 	int level = sp->role.level;
319 	gfn_t base_gfn = sp->gfn;
320 	int i;
321 
322 	trace_kvm_mmu_prepare_zap_page(sp);
323 
324 	tdp_mmu_unlink_page(kvm, sp, shared);
325 
326 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
327 		u64 *sptep = rcu_dereference(pt) + i;
328 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
329 		u64 old_child_spte;
330 
331 		if (shared) {
332 			/*
333 			 * Set the SPTE to a nonpresent value that other
334 			 * threads will not overwrite. If the SPTE was
335 			 * already marked as removed then another thread
336 			 * handling a page fault could overwrite it, so keep
337 			 * retrying the exchange until this thread observes the
338 			 * SPTE change from some other value to the removed SPTE.
339 			 */
340 			for (;;) {
341 				old_child_spte = xchg(sptep, REMOVED_SPTE);
342 				if (!is_removed_spte(old_child_spte))
343 					break;
344 				cpu_relax();
345 			}
346 		} else {
347 			/*
348 			 * If the SPTE is not MMU-present, there is no backing
349 			 * page associated with the SPTE and so no side effects
350 			 * that need to be recorded, and exclusive ownership of
351 			 * mmu_lock ensures the SPTE can't be made present.
352 			 * Note, zapping MMIO SPTEs is also unnecessary as they
353 			 * are guarded by the memslots generation, not by being
354 			 * unreachable.
355 			 */
356 			old_child_spte = READ_ONCE(*sptep);
357 			if (!is_shadow_present_pte(old_child_spte))
358 				continue;
359 
360 			/*
361 			 * Marking the SPTE as a removed SPTE is not
362 			 * strictly necessary here as the MMU lock will
363 			 * stop other threads from concurrently modifying
364 			 * this SPTE. Using the removed SPTE value keeps
365 			 * the two branches consistent and simplifies
366 			 * the function.
367 			 */
368 			WRITE_ONCE(*sptep, REMOVED_SPTE);
369 		}
370 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
371 				    old_child_spte, REMOVED_SPTE, level,
372 				    shared);
373 	}
374 
375 	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
376 					   KVM_PAGES_PER_HPAGE(level + 1));
377 
378 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
379 }
380 
381 /**
382  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
383  * @kvm: kvm instance
384  * @as_id: the address space of the paging structure the SPTE was a part of
385  * @gfn: the base GFN that was mapped by the SPTE
386  * @old_spte: The value of the SPTE before the change
387  * @new_spte: The value of the SPTE after the change
388  * @level: the level of the PT the SPTE is part of in the paging structure
389  * @shared: This operation may not be running under the exclusive use of
390  *	    the MMU lock and the operation must synchronize with other
391  *	    threads that might be modifying SPTEs.
392  *
393  * Handle bookkeeping that might result from the modification of a SPTE.
394  * This function must be called for all TDP SPTE modifications.
395  */
396 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
397 				  u64 old_spte, u64 new_spte, int level,
398 				  bool shared)
399 {
400 	bool was_present = is_shadow_present_pte(old_spte);
401 	bool is_present = is_shadow_present_pte(new_spte);
402 	bool was_leaf = was_present && is_last_spte(old_spte, level);
403 	bool is_leaf = is_present && is_last_spte(new_spte, level);
404 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
405 
406 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
407 	WARN_ON(level < PG_LEVEL_4K);
408 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
409 
410 	/*
411 	 * If this warning were to trigger it would indicate that there was a
412 	 * missing MMU notifier or a race with some notifier handler.
413 	 * A present, leaf SPTE should never be directly replaced with another
414 	 * present leaf SPTE pointing to a different PFN. A notifier handler
415 	 * should be zapping the SPTE before the main MM's page table is
416 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
417 	 * thread before replacement.
418 	 */
419 	if (was_leaf && is_leaf && pfn_changed) {
420 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
421 		       "SPTE with another present leaf SPTE mapping a\n"
422 		       "different PFN!\n"
423 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
424 		       as_id, gfn, old_spte, new_spte, level);
425 
426 		/*
427 		 * Crash the host to prevent error propagation and guest data
428 		 * corruption.
429 		 */
430 		BUG();
431 	}
432 
433 	if (old_spte == new_spte)
434 		return;
435 
436 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
437 
438 	/*
439 	 * The only time a SPTE should change from one non-present state to
440 	 * another is when an MMIO entry is installed, modified, or removed.
441 	 * In that case, there is nothing to do here.
442 	 */
443 	if (!was_present && !is_present) {
444 		/*
445 		 * If this change does not involve a MMIO SPTE or removed SPTE,
446 		 * it is unexpected. Log the change, though it should not
447 		 * impact the guest since both the former and current SPTEs
448 		 * are nonpresent.
449 		 */
450 		if (WARN_ON(!is_mmio_spte(old_spte) &&
451 			    !is_mmio_spte(new_spte) &&
452 			    !is_removed_spte(new_spte)))
453 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
454 			       "should not be replaced with another,\n"
455 			       "different nonpresent SPTE, unless one or both\n"
456 			       "are MMIO SPTEs, or the new SPTE is\n"
457 			       "a temporary removed SPTE.\n"
458 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
459 			       as_id, gfn, old_spte, new_spte, level);
460 		return;
461 	}
462 
463 	if (is_leaf != was_leaf)
464 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
465 
466 	if (was_leaf && is_dirty_spte(old_spte) &&
467 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
468 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
469 
470 	/*
471 	 * Recursively handle child PTs if the change removed a subtree from
472 	 * the paging structure.
473 	 */
474 	if (was_present && !was_leaf && (pfn_changed || !is_present))
475 		handle_removed_tdp_mmu_page(kvm,
476 				spte_to_child_pt(old_spte, level), shared);
477 }
478 
479 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
480 				u64 old_spte, u64 new_spte, int level,
481 				bool shared)
482 {
483 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
484 			      shared);
485 	handle_changed_spte_acc_track(old_spte, new_spte, level);
486 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
487 				      new_spte, level);
488 }
489 
490 /*
491  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
492  * and handle the associated bookkeeping.  Do not mark the page dirty
493  * in KVM's dirty bitmaps.
494  *
495  * @kvm: kvm instance
496  * @iter: a tdp_iter instance currently on the SPTE that should be set
497  * @new_spte: The value the SPTE should be set to
498  * Returns: true if the SPTE was set, false if it was not. If false is returned,
499  *	    this function will have no side-effects.
500  */
501 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
502 					   struct tdp_iter *iter,
503 					   u64 new_spte)
504 {
505 	lockdep_assert_held_read(&kvm->mmu_lock);
506 
507 	/*
508 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
509 	 * may modify it.
510 	 */
511 	if (is_removed_spte(iter->old_spte))
512 		return false;
513 
514 	/*
515 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
516 	 * does not hold the mmu_lock.
517 	 */
518 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
519 		      new_spte) != iter->old_spte)
520 		return false;
521 
522 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
523 			      new_spte, iter->level, true);
524 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
525 
526 	return true;
527 }
528 
529 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
530 					   struct tdp_iter *iter)
531 {
532 	/*
533 	 * Freeze the SPTE by setting it to a special,
534 	 * non-present value. This will stop other threads from
535 	 * immediately installing a present entry in its place
536 	 * before the TLBs are flushed.
537 	 */
538 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
539 		return false;
540 
541 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
542 					   KVM_PAGES_PER_HPAGE(iter->level));
543 
544 	/*
545 	 * No other thread can overwrite the removed SPTE as they
546 	 * must either wait on the MMU lock or use
547 	 * tdp_mmu_set_spte_atomic which will not overwrite the
548 	 * special removed SPTE value. No bookkeeping is needed
549 	 * here since the SPTE is going from non-present
550 	 * to non-present.
551 	 */
552 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
553 
554 	return true;
555 }
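
/*
 * Editor's illustrative sketch (not part of the original file): the retry
 * pattern used throughout this file when tdp_mmu_set_spte_atomic() loses a
 * race.  On failure, iter->old_spte is stale and must be re-read before
 * retrying.  Assumes mmu_lock is held for read and an RCU read-side
 * critical section is active; clearing the writable bit is just an example.
 */
static void __maybe_unused example_set_spte_atomic_retry(struct kvm *kvm,
							  struct tdp_iter *iter)
{
	u64 new_spte;

retry:
	if (!is_shadow_present_pte(iter->old_spte) ||
	    !is_writable_pte(iter->old_spte))
		return;

	new_spte = iter->old_spte & ~PT_WRITABLE_MASK;

	if (!tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) {
		/* The cmpxchg failed; re-read the SPTE and try again. */
		iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
		goto retry;
	}
}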
556 
557 
558 /*
559  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
560  * @kvm: kvm instance
561  * @iter: a tdp_iter instance currently on the SPTE that should be set
562  * @new_spte: The value the SPTE should be set to
563  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
564  *		      of the page. Should be set unless handling an MMU
565  *		      notifier for access tracking. Leaving record_acc_track
566  *		      unset in that case prevents page accesses from being
567  *		      double counted.
568  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
569  *		      appropriate for the change being made. Should be set
570  *		      unless performing certain dirty logging operations.
571  *		      Leaving record_dirty_log unset in that case prevents page
572  *		      writes from being double counted.
573  */
574 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
575 				      u64 new_spte, bool record_acc_track,
576 				      bool record_dirty_log)
577 {
578 	lockdep_assert_held_write(&kvm->mmu_lock);
579 
580 	/*
581 	 * No thread should be using this function to set SPTEs to the
582 	 * temporary removed SPTE value.
583 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
584 	 * should be used. If operating under the MMU lock in write mode, the
585 	 * use of the removed SPTE should not be necessary.
586 	 */
587 	WARN_ON(is_removed_spte(iter->old_spte));
588 
589 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
590 
591 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
592 			      new_spte, iter->level, false);
593 	if (record_acc_track)
594 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
595 					      iter->level);
596 	if (record_dirty_log)
597 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
598 					      iter->old_spte, new_spte,
599 					      iter->level);
600 }
601 
602 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
603 				    u64 new_spte)
604 {
605 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
606 }
607 
608 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
609 						 struct tdp_iter *iter,
610 						 u64 new_spte)
611 {
612 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
613 }
614 
615 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
616 						 struct tdp_iter *iter,
617 						 u64 new_spte)
618 {
619 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
620 }
621 
622 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
623 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
624 
625 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
626 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
627 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
628 		    !is_last_spte(_iter.old_spte, _iter.level))		\
629 			continue;					\
630 		else
631 
632 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
633 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
634 			 _mmu->shadow_root_level, _start, _end)
635 
636 /*
637  * Yield if the MMU lock is contended or this thread needs to return control
638  * to the scheduler.
639  *
640  * If this function should yield and flush is set, it will perform a remote
641  * TLB flush before yielding.
642  *
643  * If this function yields, it will also reset the tdp_iter's walk over the
644  * paging structure and the calling function should skip to the next
645  * iteration to allow the iterator to continue its traversal from the
646  * paging structure root.
647  *
648  * Return true if this function yielded and the iterator's traversal was reset.
649  * Return false if a yield was not needed.
650  */
651 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
652 					     struct tdp_iter *iter, bool flush,
653 					     bool shared)
654 {
655 	/* Ensure forward progress has been made before yielding. */
656 	if (iter->next_last_level_gfn == iter->yielded_gfn)
657 		return false;
658 
659 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
660 		rcu_read_unlock();
661 
662 		if (flush)
663 			kvm_flush_remote_tlbs(kvm);
664 
665 		if (shared)
666 			cond_resched_rwlock_read(&kvm->mmu_lock);
667 		else
668 			cond_resched_rwlock_write(&kvm->mmu_lock);
669 
670 		rcu_read_lock();
671 
672 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
673 
674 		tdp_iter_restart(iter);
675 
676 		return true;
677 	}
678 
679 	return false;
680 }
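
/*
 * Editor's illustrative sketch (not part of the original file): the caller
 * pattern implied by the comment above.  If the walk yielded, the iterator
 * was restarted, so skip to the next iteration; the yield also performed any
 * pending TLB flush, so clear the local flush state.  This mirrors the real
 * callers below (e.g. zap_gfn_range()); assumes mmu_lock is held for write
 * and the empty loop body is a placeholder.
 */
static bool __maybe_unused example_yielding_walk(struct kvm *kvm,
						 struct kvm_mmu_page *root,
						 gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	bool flush = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		/* ... examine iter.old_spte, possibly set flush = true ... */
	}

	rcu_read_unlock();

	return flush;
}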
681 
682 /*
683  * Tears down the mappings for the range of gfns, [start, end), and frees the
684  * non-root pages mapping GFNs strictly within that range. Returns true if
685  * SPTEs have been cleared and a TLB flush is needed before releasing the
686  * MMU lock.
687  *
688  * If can_yield is true, will release the MMU lock and reschedule if the
689  * scheduler needs the CPU or there is contention on the MMU lock. If this
690  * function cannot yield, it will not release the MMU lock or reschedule and
691  * the caller must ensure it does not supply too large a GFN range, or the
692  * operation can cause a soft lockup.
693  *
694  * If shared is true, this thread holds the MMU lock in read mode and must
695  * account for the possibility that other threads are modifying the paging
696  * structures concurrently. If shared is false, this thread should hold the
697  * MMU lock in write mode.
698  */
699 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
700 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
701 			  bool shared)
702 {
703 	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
704 	bool zap_all = (start == 0 && end >= max_gfn_host);
705 	struct tdp_iter iter;
706 
707 	/*
708 	 * No need to try to step down in the iterator when zapping all SPTEs,
709 	 * zapping the top-level non-leaf SPTEs will recurse on their children.
710 	 */
711 	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
712 
713 	/*
714 	 * Bound the walk at host.MAXPHYADDR; guest accesses beyond that will
715 	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
716 	 * and so KVM will never install a SPTE for such addresses.
717 	 */
718 	end = min(end, max_gfn_host);
719 
720 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
721 
722 	rcu_read_lock();
723 
724 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
725 				   min_level, start, end) {
726 retry:
727 		if (can_yield &&
728 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
729 			flush = false;
730 			continue;
731 		}
732 
733 		if (!is_shadow_present_pte(iter.old_spte))
734 			continue;
735 
736 		/*
737 		 * If this is a non-last-level SPTE that covers a larger range
738 		 * than should be zapped, continue, and zap the mappings at a
739 		 * lower level, except when zapping all SPTEs.
740 		 */
741 		if (!zap_all &&
742 		    (iter.gfn < start ||
743 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
744 		    !is_last_spte(iter.old_spte, iter.level))
745 			continue;
746 
747 		if (!shared) {
748 			tdp_mmu_set_spte(kvm, &iter, 0);
749 			flush = true;
750 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
751 			/*
752 			 * The iter must explicitly re-read the SPTE because
753 			 * the atomic cmpxchg failed.
754 			 */
755 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
756 			goto retry;
757 		}
758 	}
759 
760 	rcu_read_unlock();
761 	return flush;
762 }
763 
764 /*
765  * Tears down the mappings for the range of gfns, [start, end), and frees the
766  * non-root pages mapping GFNs strictly within that range. Returns true if
767  * SPTEs have been cleared and a TLB flush is needed before releasing the
768  * MMU lock.
769  */
770 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
771 				 gfn_t end, bool can_yield, bool flush)
772 {
773 	struct kvm_mmu_page *root;
774 
775 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
776 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
777 				      false);
778 
779 	return flush;
780 }
781 
782 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
783 {
784 	bool flush = false;
785 	int i;
786 
787 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
788 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
789 
790 	if (flush)
791 		kvm_flush_remote_tlbs(kvm);
792 }
793 
794 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
795 						  struct kvm_mmu_page *prev_root)
796 {
797 	struct kvm_mmu_page *next_root;
798 
799 	if (prev_root)
800 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
801 						  &prev_root->link,
802 						  typeof(*prev_root), link);
803 	else
804 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
805 						   typeof(*next_root), link);
806 
807 	while (next_root && !(next_root->role.invalid &&
808 			      refcount_read(&next_root->tdp_mmu_root_count)))
809 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
810 						  &next_root->link,
811 						  typeof(*next_root), link);
812 
813 	return next_root;
814 }
815 
816 /*
817  * Since kvm_tdp_mmu_invalidate_all_roots() has acquired a reference to each
818  * invalidated root, they will not be freed until this function drops the
819  * reference. Before dropping that reference, tear down the paging
820  * structure so that whichever thread does drop the last reference
821  * only has to do a trivial amount of work. Since the roots are invalid,
822  * no new SPTEs should be created under them.
823  */
824 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
825 {
826 	struct kvm_mmu_page *next_root;
827 	struct kvm_mmu_page *root;
828 	bool flush = false;
829 
830 	lockdep_assert_held_read(&kvm->mmu_lock);
831 
832 	rcu_read_lock();
833 
834 	root = next_invalidated_root(kvm, NULL);
835 
836 	while (root) {
837 		next_root = next_invalidated_root(kvm, root);
838 
839 		rcu_read_unlock();
840 
841 		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
842 
843 		/*
844 		 * Put the reference acquired in
845 		 * kvm_tdp_mmu_invalidate_all_roots().
846 		 */
847 		kvm_tdp_mmu_put_root(kvm, root, true);
848 
849 		root = next_root;
850 
851 		rcu_read_lock();
852 	}
853 
854 	rcu_read_unlock();
855 
856 	if (flush)
857 		kvm_flush_remote_tlbs(kvm);
858 }
859 
860 /*
861  * Mark each TDP MMU root as invalid so that other threads
862  * will drop their references and allow the root count to
863  * go to 0.
864  *
865  * Also take a reference on all roots so that this thread
866  * can do the bulk of the work required to free the roots
867  * once they are invalidated. Without this reference, a
868  * vCPU thread might drop the last reference to a root and
869  * get stuck with tearing down the entire paging structure.
870  *
871  * Roots which have a zero refcount should be skipped as
872  * they're already being torn down.
873  * Already invalid roots should be referenced again so that
874  * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots
875  * is done with them.
876  *
877  * This has essentially the same effect for the TDP MMU
878  * as updating mmu_valid_gen does for the shadow MMU.
879  */
880 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
881 {
882 	struct kvm_mmu_page *root;
883 
884 	lockdep_assert_held_write(&kvm->mmu_lock);
885 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
886 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
887 			root->role.invalid = true;
888 }
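
/*
 * Editor's illustrative sketch (not part of the original file): how the two
 * halves of the "fast zap" flow described above fit together.  This is
 * simplified from the real caller, kvm_mmu_zap_all_fast() in mmu.c, which
 * does additional shadow-MMU work between the two steps.
 */
static void __maybe_unused example_fast_zap_flow(struct kvm *kvm)
{
	/* Mark every root invalid and take a reference on each one. */
	write_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	write_unlock(&kvm->mmu_lock);

	/*
	 * Tear down the invalidated roots and drop the references taken
	 * above.  Holding mmu_lock for read lets vCPUs make progress while
	 * the old roots are being torn down.
	 */
	read_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);
	read_unlock(&kvm->mmu_lock);
}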
889 
890 /*
891  * Installs a last-level SPTE to handle a TDP page fault.
892  * (NPT/EPT violation/misconfiguration)
893  */
894 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
895 					  struct kvm_page_fault *fault,
896 					  struct tdp_iter *iter)
897 {
898 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
899 	u64 new_spte;
900 	int ret = RET_PF_FIXED;
901 	bool wrprot = false;
902 
903 	WARN_ON(sp->role.level != fault->goal_level);
904 	if (unlikely(!fault->slot))
905 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
906 	else
907 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
908 					 fault->pfn, iter->old_spte, fault->prefetch, true,
909 					 fault->map_writable, &new_spte);
910 
911 	if (new_spte == iter->old_spte)
912 		ret = RET_PF_SPURIOUS;
913 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
914 		return RET_PF_RETRY;
915 
916 	/*
917 	 * If the page fault was caused by a write but the page is write
918 	 * protected, emulation is needed. If the emulation was skipped,
919 	 * the vCPU would have the same fault again.
920 	 */
921 	if (wrprot) {
922 		if (fault->write)
923 			ret = RET_PF_EMULATE;
924 	}
925 
926 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
927 	if (unlikely(is_mmio_spte(new_spte))) {
928 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
929 				     new_spte);
930 		ret = RET_PF_EMULATE;
931 	} else {
932 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
933 				       rcu_dereference(iter->sptep));
934 	}
935 
936 	/*
937 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
938 	 * consistent with legacy MMU behavior.
939 	 */
940 	if (ret != RET_PF_SPURIOUS)
941 		vcpu->stat.pf_fixed++;
942 
943 	return ret;
944 }
945 
946 /*
947  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
948  * page tables and SPTEs to translate the faulting guest physical address.
949  */
950 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
951 {
952 	struct kvm_mmu *mmu = vcpu->arch.mmu;
953 	struct tdp_iter iter;
954 	struct kvm_mmu_page *sp;
955 	u64 *child_pt;
956 	u64 new_spte;
957 	int ret;
958 
959 	kvm_mmu_hugepage_adjust(vcpu, fault);
960 
961 	trace_kvm_mmu_spte_requested(fault);
962 
963 	rcu_read_lock();
964 
965 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
966 		if (fault->nx_huge_page_workaround_enabled)
967 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
968 
969 		if (iter.level == fault->goal_level)
970 			break;
971 
972 		/*
973 		 * If there is an SPTE mapping a large page at a higher level
974 		 * than the target, that SPTE must be cleared and replaced
975 		 * with a non-leaf SPTE.
976 		 */
977 		if (is_shadow_present_pte(iter.old_spte) &&
978 		    is_large_pte(iter.old_spte)) {
979 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
980 				break;
981 
982 			/*
983 			 * The iter must explicitly re-read the spte here
984 			 * because the new value informs the !present
985 			 * path below.
986 			 */
987 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
988 		}
989 
990 		if (!is_shadow_present_pte(iter.old_spte)) {
991 			/*
992 			 * If SPTE has been frozen by another thread, just
993 			 * give up and retry, avoiding unnecessary page table
994 			 * allocation and free.
995 			 */
996 			if (is_removed_spte(iter.old_spte))
997 				break;
998 
999 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1000 			child_pt = sp->spt;
1001 
1002 			new_spte = make_nonleaf_spte(child_pt,
1003 						     !shadow_accessed_mask);
1004 
1005 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
1006 				tdp_mmu_link_page(vcpu->kvm, sp,
1007 						  fault->huge_page_disallowed &&
1008 						  fault->req_level >= iter.level);
1009 
1010 				trace_kvm_mmu_get_page(sp, true);
1011 			} else {
1012 				tdp_mmu_free_sp(sp);
1013 				break;
1014 			}
1015 		}
1016 	}
1017 
1018 	if (iter.level != fault->goal_level) {
1019 		rcu_read_unlock();
1020 		return RET_PF_RETRY;
1021 	}
1022 
1023 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1024 	rcu_read_unlock();
1025 
1026 	return ret;
1027 }
1028 
1029 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1030 				 bool flush)
1031 {
1032 	struct kvm_mmu_page *root;
1033 
1034 	for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
1035 		flush = zap_gfn_range(kvm, root, range->start, range->end,
1036 				      range->may_block, flush, false);
1037 
1038 	return flush;
1039 }
1040 
1041 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1042 			      struct kvm_gfn_range *range);
1043 
1044 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1045 						   struct kvm_gfn_range *range,
1046 						   tdp_handler_t handler)
1047 {
1048 	struct kvm_mmu_page *root;
1049 	struct tdp_iter iter;
1050 	bool ret = false;
1051 
1052 	rcu_read_lock();
1053 
1054 	/*
1055 	 * Don't support rescheduling; none of the MMU notifiers that funnel
1056 	 * into this helper allow blocking, so it would be dead, wasteful code.
1057 	 */
1058 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1059 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1060 			ret |= handler(kvm, &iter, range);
1061 	}
1062 
1063 	rcu_read_unlock();
1064 
1065 	return ret;
1066 }
1067 
1068 /*
1069  * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
1070  * true if any of the GFNs in the range have been accessed.
1071  */
1072 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1073 			  struct kvm_gfn_range *range)
1074 {
1075 	u64 new_spte = 0;
1076 
1077 	/* If we have a non-accessed entry we don't need to change the pte. */
1078 	if (!is_accessed_spte(iter->old_spte))
1079 		return false;
1080 
1081 	new_spte = iter->old_spte;
1082 
1083 	if (spte_ad_enabled(new_spte)) {
1084 		new_spte &= ~shadow_accessed_mask;
1085 	} else {
1086 		/*
1087 		 * Capture the dirty status of the page, so that it doesn't get
1088 		 * lost when the SPTE is marked for access tracking.
1089 		 */
1090 		if (is_writable_pte(new_spte))
1091 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1092 
1093 		new_spte = mark_spte_for_access_track(new_spte);
1094 	}
1095 
1096 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1097 
1098 	return true;
1099 }
1100 
1101 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1102 {
1103 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1104 }
1105 
1106 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1107 			 struct kvm_gfn_range *range)
1108 {
1109 	return is_accessed_spte(iter->old_spte);
1110 }
1111 
1112 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1113 {
1114 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1115 }
1116 
1117 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1118 			 struct kvm_gfn_range *range)
1119 {
1120 	u64 new_spte;
1121 
1122 	/* Huge pages aren't expected to be modified without first being zapped. */
1123 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1124 
1125 	if (iter->level != PG_LEVEL_4K ||
1126 	    !is_shadow_present_pte(iter->old_spte))
1127 		return false;
1128 
1129 	/*
1130 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1131 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1132 	 * invariant that the PFN of a present leaf SPTE can never change.
1133 	 * See __handle_changed_spte().
1134 	 */
1135 	tdp_mmu_set_spte(kvm, iter, 0);
1136 
1137 	if (!pte_write(range->pte)) {
1138 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1139 								  pte_pfn(range->pte));
1140 
1141 		tdp_mmu_set_spte(kvm, iter, new_spte);
1142 	}
1143 
1144 	return true;
1145 }
1146 
1147 /*
1148  * Handle the changed_pte MMU notifier for the TDP MMU.
1149  * range->pte holds the new pte_t mapping the HVA specified by the MMU
1150  * notifier.
1151  * Returns non-zero if a flush is needed before releasing the MMU lock.
1152  */
1153 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1154 {
1155 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1156 
1157 	/* FIXME: return 'flush' instead of flushing here. */
1158 	if (flush)
1159 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1160 
1161 	return false;
1162 }
1163 
1164 /*
1165  * Remove write access from all SPTEs at or above min_level that map GFNs
1166  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1167  * be flushed.
1168  */
1169 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1170 			     gfn_t start, gfn_t end, int min_level)
1171 {
1172 	struct tdp_iter iter;
1173 	u64 new_spte;
1174 	bool spte_set = false;
1175 
1176 	rcu_read_lock();
1177 
1178 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1179 
1180 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1181 				   min_level, start, end) {
1182 retry:
1183 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1184 			continue;
1185 
1186 		if (!is_shadow_present_pte(iter.old_spte) ||
1187 		    !is_last_spte(iter.old_spte, iter.level) ||
1188 		    !(iter.old_spte & PT_WRITABLE_MASK))
1189 			continue;
1190 
1191 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1192 
1193 		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
1194 			/*
1195 			 * The iter must explicitly re-read the SPTE because
1196 			 * the atomic cmpxchg failed.
1197 			 */
1198 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1199 			goto retry;
1200 		}
1201 		spte_set = true;
1202 	}
1203 
1204 	rcu_read_unlock();
1205 	return spte_set;
1206 }
1207 
1208 /*
1209  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1210  * only affect leaf SPTEs down to min_level.
1211  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1212  */
1213 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1214 			     const struct kvm_memory_slot *slot, int min_level)
1215 {
1216 	struct kvm_mmu_page *root;
1217 	bool spte_set = false;
1218 
1219 	lockdep_assert_held_read(&kvm->mmu_lock);
1220 
1221 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1222 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1223 			     slot->base_gfn + slot->npages, min_level);
1224 
1225 	return spte_set;
1226 }
1227 
1228 /*
1229  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1230  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1231  * If AD bits are not enabled, this will require clearing the writable bit on
1232  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1233  * be flushed.
1234  */
1235 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1236 			   gfn_t start, gfn_t end)
1237 {
1238 	struct tdp_iter iter;
1239 	u64 new_spte;
1240 	bool spte_set = false;
1241 
1242 	rcu_read_lock();
1243 
1244 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1245 retry:
1246 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1247 			continue;
1248 
1249 		if (spte_ad_need_write_protect(iter.old_spte)) {
1250 			if (is_writable_pte(iter.old_spte))
1251 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1252 			else
1253 				continue;
1254 		} else {
1255 			if (iter.old_spte & shadow_dirty_mask)
1256 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1257 			else
1258 				continue;
1259 		}
1260 
1261 		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
1262 			/*
1263 			 * The iter must explicitly re-read the SPTE because
1264 			 * the atomic cmpxchg failed.
1265 			 */
1266 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1267 			goto retry;
1268 		}
1269 		spte_set = true;
1270 	}
1271 
1272 	rcu_read_unlock();
1273 	return spte_set;
1274 }
1275 
1276 /*
1277  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1278  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1279  * If AD bits are not enabled, this will require clearing the writable bit on
1280  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1281  * be flushed.
1282  */
1283 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1284 				  const struct kvm_memory_slot *slot)
1285 {
1286 	struct kvm_mmu_page *root;
1287 	bool spte_set = false;
1288 
1289 	lockdep_assert_held_read(&kvm->mmu_lock);
1290 
1291 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1292 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1293 				slot->base_gfn + slot->npages);
1294 
1295 	return spte_set;
1296 }
1297 
1298 /*
1299  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1300  * set in mask, starting at gfn. The given memslot is expected to contain all
1301  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1302  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1303  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1304  */
1305 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1306 				  gfn_t gfn, unsigned long mask, bool wrprot)
1307 {
1308 	struct tdp_iter iter;
1309 	u64 new_spte;
1310 
1311 	rcu_read_lock();
1312 
1313 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1314 				    gfn + BITS_PER_LONG) {
1315 		if (!mask)
1316 			break;
1317 
1318 		if (iter.level > PG_LEVEL_4K ||
1319 		    !(mask & (1UL << (iter.gfn - gfn))))
1320 			continue;
1321 
1322 		mask &= ~(1UL << (iter.gfn - gfn));
1323 
1324 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1325 			if (is_writable_pte(iter.old_spte))
1326 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1327 			else
1328 				continue;
1329 		} else {
1330 			if (iter.old_spte & shadow_dirty_mask)
1331 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1332 			else
1333 				continue;
1334 		}
1335 
1336 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1337 	}
1338 
1339 	rcu_read_unlock();
1340 }
1341 
1342 /*
1343  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1344  * set in mask, starting at gfn. The given memslot is expected to contain all
1345  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1346  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1347  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1348  */
1349 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1350 				       struct kvm_memory_slot *slot,
1351 				       gfn_t gfn, unsigned long mask,
1352 				       bool wrprot)
1353 {
1354 	struct kvm_mmu_page *root;
1355 
1356 	lockdep_assert_held_write(&kvm->mmu_lock);
1357 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1358 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1359 }
1360 
1361 /*
1362  * Clear leaf entries which could be replaced by large mappings, for
1363  * GFNs within the slot.
1364  */
1365 static void zap_collapsible_spte_range(struct kvm *kvm,
1366 				       struct kvm_mmu_page *root,
1367 				       const struct kvm_memory_slot *slot)
1368 {
1369 	gfn_t start = slot->base_gfn;
1370 	gfn_t end = start + slot->npages;
1371 	struct tdp_iter iter;
1372 	kvm_pfn_t pfn;
1373 
1374 	rcu_read_lock();
1375 
1376 	tdp_root_for_each_pte(iter, root, start, end) {
1377 retry:
1378 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1379 			continue;
1380 
1381 		if (!is_shadow_present_pte(iter.old_spte) ||
1382 		    !is_last_spte(iter.old_spte, iter.level))
1383 			continue;
1384 
1385 		pfn = spte_to_pfn(iter.old_spte);
1386 		if (kvm_is_reserved_pfn(pfn) ||
1387 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1388 							    pfn, PG_LEVEL_NUM))
1389 			continue;
1390 
1391 		/* Note, a successful atomic zap also does a remote TLB flush. */
1392 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1393 			/*
1394 			 * The iter must explicitly re-read the SPTE because
1395 			 * the atomic cmpxchg failed.
1396 			 */
1397 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1398 			goto retry;
1399 		}
1400 	}
1401 
1402 	rcu_read_unlock();
1403 }
1404 
1405 /*
1406  * Zap leaf entries which could be replaced by large mappings, for
1407  * GFNs within the slot.
1408  */
1409 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1410 				       const struct kvm_memory_slot *slot)
1411 {
1412 	struct kvm_mmu_page *root;
1413 
1414 	lockdep_assert_held_read(&kvm->mmu_lock);
1415 
1416 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1417 		zap_collapsible_spte_range(kvm, root, slot);
1418 }
1419 
1420 /*
1421  * Removes write access on the last level SPTE mapping this GFN and unsets the
1422  * MMU-writable bit to ensure future writes continue to be intercepted.
1423  * Returns true if an SPTE was set and a TLB flush is needed.
1424  */
1425 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1426 			      gfn_t gfn, int min_level)
1427 {
1428 	struct tdp_iter iter;
1429 	u64 new_spte;
1430 	bool spte_set = false;
1431 
1432 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1433 
1434 	rcu_read_lock();
1435 
1436 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1437 				   min_level, gfn, gfn + 1) {
1438 		if (!is_shadow_present_pte(iter.old_spte) ||
1439 		    !is_last_spte(iter.old_spte, iter.level))
1440 			continue;
1441 
1442 		if (!is_writable_pte(iter.old_spte))
1443 			break;
1444 
1445 		new_spte = iter.old_spte &
1446 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1447 
1448 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1449 		spte_set = true;
1450 	}
1451 
1452 	rcu_read_unlock();
1453 
1454 	return spte_set;
1455 }
1456 
1457 /*
1458  * Removes write access on the last level SPTE mapping this GFN and unsets the
1459  * MMU-writable bit to ensure future writes continue to be intercepted.
1460  * Returns true if an SPTE was set and a TLB flush is needed.
1461  */
1462 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1463 				   struct kvm_memory_slot *slot, gfn_t gfn,
1464 				   int min_level)
1465 {
1466 	struct kvm_mmu_page *root;
1467 	bool spte_set = false;
1468 
1469 	lockdep_assert_held_write(&kvm->mmu_lock);
1470 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1471 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1472 
1473 	return spte_set;
1474 }
1475 
1476 /*
1477  * Return the level of the lowest level SPTE added to sptes.
1478  * That SPTE may be non-present.
1479  *
1480  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1481  */
1482 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1483 			 int *root_level)
1484 {
1485 	struct tdp_iter iter;
1486 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1487 	gfn_t gfn = addr >> PAGE_SHIFT;
1488 	int leaf = -1;
1489 
1490 	*root_level = vcpu->arch.mmu->shadow_root_level;
1491 
1492 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1493 		leaf = iter.level;
1494 		sptes[leaf] = iter.old_spte;
1495 	}
1496 
1497 	return leaf;
1498 }
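
/*
 * Editor's illustrative sketch (not part of the original file): the calling
 * convention for kvm_tdp_mmu_get_walk(), bracketed by the lockless-walk
 * begin/end helpers referenced in the comment above (assumed here to be the
 * no-argument helpers from tdp_mmu.h).  Dumping the recorded SPTEs with
 * pr_info() is just an example.
 */
static void __maybe_unused example_dump_lockless_walk(struct kvm_vcpu *vcpu,
						      u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, leaf, level;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	/* A negative return means no SPTE was recorded for this address. */
	if (leaf < 0)
		return;

	for (level = root_level; level >= leaf; level--)
		pr_info("level %d spte 0x%llx\n", level, sptes[level]);
}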
1499 
1500 /*
1501  * Returns the last level spte pointer of the shadow page walk for the given
1502  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1503  * walk could be performed, returns NULL and *spte does not contain valid data.
1504  *
1505  * Contract:
1506  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1507  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1508  *
1509  * WARNING: This function is only intended to be called during fast_page_fault.
1510  */
1511 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1512 					u64 *spte)
1513 {
1514 	struct tdp_iter iter;
1515 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1516 	gfn_t gfn = addr >> PAGE_SHIFT;
1517 	tdp_ptep_t sptep = NULL;
1518 
1519 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1520 		*spte = iter.old_spte;
1521 		sptep = iter.sptep;
1522 	}
1523 
1524 	/*
1525 	 * Perform the rcu_dereference to get the raw spte pointer value since
1526 	 * we are passing it up to fast_page_fault, which is shared with the
1527 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1528 	 * annotation.
1529 	 *
1530 	 * This is safe since fast_page_fault obeys the contracts of this
1531 	 * function as well as all TDP MMU contracts around modifying SPTEs
1532 	 * outside of mmu_lock.
1533 	 */
1534 	return rcu_dereference(sptep);
1535 }
1536