xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision bbaa836b)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
58 			  bool shared);
59 
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62 	free_page((unsigned long)sp->spt);
63 	kmem_cache_free(mmu_page_header_cache, sp);
64 }
65 
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
70  * Because TDP MMU page table memory is only accessed within RCU read-side
71  * critical sections, freeing it after a grace period ensures that no
72  * lockless walker can still be using the memory once it is freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 					       rcu_head);
78 
79 	tdp_mmu_free_sp(sp);
80 }
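/*
 * A sketch of the read side this pairs with: every TDP MMU page table walk in
 * this file brackets its SPTE accesses with rcu_read_lock()/rcu_read_unlock(),
 * e.g.
 *
 *	rcu_read_lock();
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1)
 *		... read iter.old_spte / iter.sptep ...
 *	rcu_read_unlock();
 */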
81 
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 			  bool shared)
84 {
85 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86 
87 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 		return;
89 
90 	WARN_ON(!root->tdp_mmu_page);
91 
92 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 	list_del_rcu(&root->link);
94 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95 
96 	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97 
98 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100 
101 /*
102  * Finds the next valid root after prev_root (or the first valid root if
103  * prev_root is NULL), takes a reference on it, and returns that next root.
104  * If prev_root is not NULL, this thread should have already taken a
105  * reference on it, and that reference will be dropped. If no valid root
106  * is found, this function will return NULL.
107  */
108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
109 					      struct kvm_mmu_page *prev_root,
110 					      bool shared)
111 {
112 	struct kvm_mmu_page *next_root;
113 
114 	rcu_read_lock();
115 
116 	if (prev_root)
117 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
118 						  &prev_root->link,
119 						  typeof(*prev_root), link);
120 	else
121 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122 						   typeof(*next_root), link);
123 
124 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
125 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
126 				&next_root->link, typeof(*next_root), link);
127 
128 	rcu_read_unlock();
129 
130 	if (prev_root)
131 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
132 
133 	return next_root;
134 }
135 
136 /*
137  * Note: this iterator gets and puts references to the roots it iterates over.
138  * This makes it safe to release the MMU lock and yield within the loop, but
139  * if exiting the loop early, the caller must drop the reference to the most
140  * recent root. (Unless keeping a live reference is desirable.)
141  *
142  * If shared is set, this function is operating under the MMU lock in read
143  * mode. In the unlikely event that this thread must free a root, the lock
144  * will be temporarily dropped and reacquired in write mode.
145  */
146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
147 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
148 	     _root;							\
149 	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
150 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
151 		} else
152 
153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
154 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
155 				lockdep_is_held_type(&_kvm->mmu_lock, 0) ||	\
156 				lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock))	\
157 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
158 		} else
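/*
 * A sketch of typical usage of the root iterators above: derive the address
 * space id from the memslot or root being operated on, and only break out of
 * the yield-safe variant early if the reference on the current root is also
 * dropped, e.g.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush,
 *				      shared);
 */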
159 
160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
161 						   int level)
162 {
163 	union kvm_mmu_page_role role;
164 
165 	role = vcpu->arch.mmu->mmu_role.base;
166 	role.level = level;
167 	role.direct = true;
168 	role.gpte_is_8_bytes = true;
169 	role.access = ACC_ALL;
170 	role.ad_disabled = !shadow_accessed_mask;
171 
172 	return role;
173 }
174 
175 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
176 					       int level)
177 {
178 	struct kvm_mmu_page *sp;
179 
180 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
181 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
182 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
183 
184 	sp->role.word = page_role_for_level(vcpu, level).word;
185 	sp->gfn = gfn;
186 	sp->tdp_mmu_page = true;
187 
188 	trace_kvm_mmu_get_page(sp, true);
189 
190 	return sp;
191 }
192 
193 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
194 {
195 	union kvm_mmu_page_role role;
196 	struct kvm *kvm = vcpu->kvm;
197 	struct kvm_mmu_page *root;
198 
199 	lockdep_assert_held_write(&kvm->mmu_lock);
200 
201 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
202 
203 	/* Check for an existing root before allocating a new one. */
204 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
205 		if (root->role.word == role.word &&
206 		    kvm_tdp_mmu_get_root(kvm, root))
207 			goto out;
208 	}
209 
210 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
211 	refcount_set(&root->tdp_mmu_root_count, 1);
212 
213 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
214 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
215 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
216 
217 out:
218 	return __pa(root->spt);
219 }
220 
221 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
222 				u64 old_spte, u64 new_spte, int level,
223 				bool shared);
224 
225 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
226 {
227 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
228 		return;
229 
230 	if (is_accessed_spte(old_spte) &&
231 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
232 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
233 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
234 }
235 
236 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
237 					  u64 old_spte, u64 new_spte, int level)
238 {
239 	bool pfn_changed;
240 	struct kvm_memory_slot *slot;
241 
242 	if (level > PG_LEVEL_4K)
243 		return;
244 
245 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
246 
247 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
248 	    is_writable_pte(new_spte)) {
249 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
250 		mark_page_dirty_in_slot(kvm, slot, gfn);
251 	}
252 }
253 
254 /**
255  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
256  *
257  * @kvm: kvm instance
258  * @sp: the new page
259  * @account_nx: This page replaces a NX large page and should be marked for
260  *		eventual reclaim.
261  */
262 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
263 			      bool account_nx)
264 {
265 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
266 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
267 	if (account_nx)
268 		account_huge_nx_page(kvm, sp);
269 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
270 }
271 
272 /**
273  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
274  *
275  * @kvm: kvm instance
276  * @sp: the page to be removed
277  * @shared: This operation may not be running under the exclusive use of
278  *	    the MMU lock and the operation must synchronize with other
279  *	    threads that might be adding or removing pages.
280  */
281 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
282 				bool shared)
283 {
284 	if (shared)
285 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
286 	else
287 		lockdep_assert_held_write(&kvm->mmu_lock);
288 
289 	list_del(&sp->link);
290 	if (sp->lpage_disallowed)
291 		unaccount_huge_nx_page(kvm, sp);
292 
293 	if (shared)
294 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
295 }
296 
297 /**
298  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
299  *
300  * @kvm: kvm instance
301  * @pt: the page removed from the paging structure
302  * @shared: This operation may not be running under the exclusive use
303  *	    of the MMU lock and the operation must synchronize with other
304  *	    threads that might be modifying SPTEs.
305  *
306  * Given a page table that has been removed from the TDP paging structure,
307  * iterates through the page table to clear SPTEs and free child page tables.
308  *
309  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
310  * protection. Since this thread removed it from the paging structure,
311  * this thread will be responsible for ensuring the page is freed. Hence the
312  * early rcu_dereferences in the function.
313  */
314 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
315 					bool shared)
316 {
317 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
318 	int level = sp->role.level;
319 	gfn_t base_gfn = sp->gfn;
320 	u64 old_child_spte;
321 	u64 *sptep;
322 	gfn_t gfn;
323 	int i;
324 
325 	trace_kvm_mmu_prepare_zap_page(sp);
326 
327 	tdp_mmu_unlink_page(kvm, sp, shared);
328 
329 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
330 		sptep = rcu_dereference(pt) + i;
331 		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
332 
333 		if (shared) {
334 			/*
335 			 * Set the SPTE to a nonpresent value that other
336 			 * threads will not overwrite. If the SPTE was
337 			 * already marked as removed then another thread
338 			 * handling a page fault could overwrite it, so
339 			 * keep retrying the exchange until the value read
340 			 * back is something other than the removed SPTE.
341 			 */
342 			for (;;) {
343 				old_child_spte = xchg(sptep, REMOVED_SPTE);
344 				if (!is_removed_spte(old_child_spte))
345 					break;
346 				cpu_relax();
347 			}
348 		} else {
349 			/*
350 			 * If the SPTE is not MMU-present, there is no backing
351 			 * page associated with the SPTE and so no side effects
352 			 * that need to be recorded, and exclusive ownership of
353 			 * mmu_lock ensures the SPTE can't be made present.
354 			 * Note, zapping MMIO SPTEs is also unnecessary as they
355 			 * are guarded by the memslots generation, not by being
356 			 * unreachable.
357 			 */
358 			old_child_spte = READ_ONCE(*sptep);
359 			if (!is_shadow_present_pte(old_child_spte))
360 				continue;
361 
362 			/*
363 			 * Marking the SPTE as a removed SPTE is not
364 			 * strictly necessary here as the MMU lock will
365 			 * stop other threads from concurrently modifying
366 			 * this SPTE. Using the removed SPTE value keeps
367 			 * the two branches consistent and simplifies
368 			 * the function.
369 			 */
370 			WRITE_ONCE(*sptep, REMOVED_SPTE);
371 		}
372 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
373 				    old_child_spte, REMOVED_SPTE, level,
374 				    shared);
375 	}
376 
377 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
378 					   KVM_PAGES_PER_HPAGE(level + 1));
379 
380 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
381 }
382 
383 /**
384  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
385  * @kvm: kvm instance
386  * @as_id: the address space of the paging structure the SPTE was a part of
387  * @gfn: the base GFN that was mapped by the SPTE
388  * @old_spte: The value of the SPTE before the change
389  * @new_spte: The value of the SPTE after the change
390  * @level: the level of the PT the SPTE is part of in the paging structure
391  * @shared: This operation may not be running under the exclusive use of
392  *	    the MMU lock and the operation must synchronize with other
393  *	    threads that might be modifying SPTEs.
394  *
395  * Handle bookkeeping that might result from the modification of a SPTE.
396  * This function must be called for all TDP SPTE modifications.
397  */
398 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
399 				  u64 old_spte, u64 new_spte, int level,
400 				  bool shared)
401 {
402 	bool was_present = is_shadow_present_pte(old_spte);
403 	bool is_present = is_shadow_present_pte(new_spte);
404 	bool was_leaf = was_present && is_last_spte(old_spte, level);
405 	bool is_leaf = is_present && is_last_spte(new_spte, level);
406 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
407 
408 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
409 	WARN_ON(level < PG_LEVEL_4K);
410 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
411 
412 	/*
413 	 * If this warning were to trigger it would indicate that there was a
414 	 * missing MMU notifier or a race with some notifier handler.
415 	 * A present, leaf SPTE should never be directly replaced with another
416 	 * present leaf SPTE pointing to a different PFN. A notifier handler
417 	 * should be zapping the SPTE before the main MM's page table is
418 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
419 	 * thread before replacement.
420 	 */
421 	if (was_leaf && is_leaf && pfn_changed) {
422 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
423 		       "SPTE with another present leaf SPTE mapping a\n"
424 		       "different PFN!\n"
425 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
426 		       as_id, gfn, old_spte, new_spte, level);
427 
428 		/*
429 		 * Crash the host to prevent error propagation and guest data
430 		 * corruption.
431 		 */
432 		BUG();
433 	}
434 
435 	if (old_spte == new_spte)
436 		return;
437 
438 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
439 
440 	/*
441 	 * The only times a SPTE should be changed from a non-present to
442 	 * non-present state is when an MMIO entry is installed/modified/
443 	 * removed. In that case, there is nothing to do here.
444 	 */
445 	if (!was_present && !is_present) {
446 		/*
447 		 * If this change does not involve a MMIO SPTE or removed SPTE,
448 		 * it is unexpected. Log the change, though it should not
449 		 * impact the guest since both the former and current SPTEs
450 		 * are nonpresent.
451 		 */
452 		if (WARN_ON(!is_mmio_spte(old_spte) &&
453 			    !is_mmio_spte(new_spte) &&
454 			    !is_removed_spte(new_spte)))
455 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
456 			       "should not be replaced with another,\n"
457 			       "different nonpresent SPTE, unless one or both\n"
458 			       "are MMIO SPTEs, or the new SPTE is\n"
459 			       "a temporary removed SPTE.\n"
460 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
461 			       as_id, gfn, old_spte, new_spte, level);
462 		return;
463 	}
464 
465 	if (is_leaf != was_leaf)
466 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
467 
468 	if (was_leaf && is_dirty_spte(old_spte) &&
469 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
470 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
471 
472 	/*
473 	 * Recursively handle child PTs if the change removed a subtree from
474 	 * the paging structure.
475 	 */
476 	if (was_present && !was_leaf && (pfn_changed || !is_present))
477 		handle_removed_tdp_mmu_page(kvm,
478 				spte_to_child_pt(old_spte, level), shared);
479 }
480 
481 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
482 				u64 old_spte, u64 new_spte, int level,
483 				bool shared)
484 {
485 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
486 			      shared);
487 	handle_changed_spte_acc_track(old_spte, new_spte, level);
488 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
489 				      new_spte, level);
490 }
491 
492 /*
493  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
494  * and handle the associated bookkeeping.  Do not mark the page dirty
495  * in KVM's dirty bitmaps.
496  *
497  * @kvm: kvm instance
498  * @iter: a tdp_iter instance currently on the SPTE that should be set
499  * @new_spte: The value the SPTE should be set to
500  * Returns: true if the SPTE was set, false if it was not. If false is returned,
501  *	    this function will have no side-effects.
502  */
503 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
504 					   struct tdp_iter *iter,
505 					   u64 new_spte)
506 {
507 	lockdep_assert_held_read(&kvm->mmu_lock);
508 
509 	/*
510 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
511 	 * may modify it.
512 	 */
513 	if (is_removed_spte(iter->old_spte))
514 		return false;
515 
516 	/*
517 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
518 	 * does not hold the mmu_lock.
519 	 */
520 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
521 		      new_spte) != iter->old_spte)
522 		return false;
523 
524 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
525 			      new_spte, iter->level, true);
526 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
527 
528 	return true;
529 }
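/*
 * A sketch of the retry idiom used by callers that tolerate losing the race:
 * on failure, re-read the SPTE through the iterator and try again, e.g.
 *
 *	if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
 *		iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 *		goto retry;
 *	}
 */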
530 
531 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
532 					   struct tdp_iter *iter)
533 {
534 	/*
535 	 * Freeze the SPTE by setting it to a special,
536 	 * non-present value. This will stop other threads from
537 	 * immediately installing a present entry in its place
538 	 * before the TLBs are flushed.
539 	 */
540 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
541 		return false;
542 
543 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
544 					   KVM_PAGES_PER_HPAGE(iter->level));
545 
546 	/*
547 	 * No other thread can overwrite the removed SPTE as they
548 	 * must either wait on the MMU lock or use
549 	 * tdp_mmu_set_spte_atomic which will not overwrite the
550 	 * special removed SPTE value. No bookkeeping is needed
551 	 * here since the SPTE is going from non-present
552 	 * to non-present.
553 	 */
554 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
555 
556 	return true;
557 }
558 
559 
560 /*
561  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
562  * @kvm: kvm instance
563  * @iter: a tdp_iter instance currently on the SPTE that should be set
564  * @new_spte: The value the SPTE should be set to
565  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
566  *		      of the page. Should be set unless handling an MMU
567  *		      notifier for access tracking. Leaving record_acc_track
568  *		      unset in that case prevents page accesses from being
569  *		      double counted.
570  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
571  *		      appropriate for the change being made. Should be set
572  *		      unless performing certain dirty logging operations.
573  *		      Leaving record_dirty_log unset in that case prevents page
574  *		      writes from being double counted.
575  */
576 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
577 				      u64 new_spte, bool record_acc_track,
578 				      bool record_dirty_log)
579 {
580 	lockdep_assert_held_write(&kvm->mmu_lock);
581 
582 	/*
583 	 * No thread should be using this function to set SPTEs to the
584 	 * temporary removed SPTE value.
585 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
586 	 * should be used. If operating under the MMU lock in write mode, the
587 	 * use of the removed SPTE should not be necessary.
588 	 */
589 	WARN_ON(is_removed_spte(iter->old_spte));
590 
591 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
592 
593 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
594 			      new_spte, iter->level, false);
595 	if (record_acc_track)
596 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
597 					      iter->level);
598 	if (record_dirty_log)
599 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
600 					      iter->old_spte, new_spte,
601 					      iter->level);
602 }
603 
604 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
605 				    u64 new_spte)
606 {
607 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
608 }
609 
610 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
611 						 struct tdp_iter *iter,
612 						 u64 new_spte)
613 {
614 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
615 }
616 
617 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
618 						 struct tdp_iter *iter,
619 						 u64 new_spte)
620 {
621 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
622 }
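/*
 * Note on the variants above: tdp_mmu_set_spte_no_acc_track() is used when
 * aging SPTEs on behalf of the MMU notifiers (see age_gfn_range()), and
 * tdp_mmu_set_spte_no_dirty_log() is used when clearing dirty state (see
 * clear_dirty_pt_masked()), so that accesses and writes are not double
 * counted.
 */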
623 
624 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
625 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
626 
627 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
628 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
629 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
630 		    !is_last_spte(_iter.old_spte, _iter.level))		\
631 			continue;					\
632 		else
633 
634 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
635 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
636 			 _mmu->shadow_root_level, _start, _end)
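/*
 * A sketch of typical usage of the iterators above, e.g. visiting every leaf
 * SPTE that maps a memslot:
 *
 *	tdp_root_for_each_leaf_pte(iter, root, slot->base_gfn,
 *				   slot->base_gfn + slot->npages)
 *		... act on iter.old_spte / iter.sptep ...
 */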
637 
638 /*
639  * Yield if the MMU lock is contended or this thread needs to return control
640  * to the scheduler.
641  *
642  * If this function should yield and flush is set, it will perform a remote
643  * TLB flush before yielding.
644  *
645  * If this function yields, it will also reset the tdp_iter's walk over the
646  * paging structure and the calling function should skip to the next
647  * iteration to allow the iterator to continue its traversal from the
648  * paging structure root.
649  *
650  * Return true if this function yielded and the iterator's traversal was reset.
651  * Return false if a yield was not needed.
652  */
653 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
654 					     struct tdp_iter *iter, bool flush,
655 					     bool shared)
656 {
657 	/* Ensure forward progress has been made before yielding. */
658 	if (iter->next_last_level_gfn == iter->yielded_gfn)
659 		return false;
660 
661 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
662 		rcu_read_unlock();
663 
664 		if (flush)
665 			kvm_flush_remote_tlbs(kvm);
666 
667 		if (shared)
668 			cond_resched_rwlock_read(&kvm->mmu_lock);
669 		else
670 			cond_resched_rwlock_write(&kvm->mmu_lock);
671 
672 		rcu_read_lock();
673 
674 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
675 
676 		tdp_iter_restart(iter);
677 
678 		return true;
679 	}
680 
681 	return false;
682 }
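/*
 * A sketch of the expected caller pattern: when this yields, restart the loop
 * iteration so the iterator resumes from the paging structure root, and clear
 * any pending flush that was performed on the caller's behalf, e.g.
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *		flush = false;
 *		continue;
 *	}
 */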
683 
684 /*
685  * Tears down the mappings for the range of gfns, [start, end), and frees the
686  * non-root pages mapping GFNs strictly within that range. Returns true if
687  * SPTEs have been cleared and a TLB flush is needed before releasing the
688  * MMU lock.
689  *
690  * If can_yield is true, will release the MMU lock and reschedule if the
691  * scheduler needs the CPU or there is contention on the MMU lock. If this
692  * function cannot yield, it will not release the MMU lock or reschedule and
693  * the caller must ensure it does not supply too large a GFN range, or the
694  * operation can cause a soft lockup.
695  *
696  * If shared is true, this thread holds the MMU lock in read mode and must
697  * account for the possibility that other threads are modifying the paging
698  * structures concurrently. If shared is false, this thread should hold the
699  * MMU lock in write mode.
700  */
701 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
702 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
703 			  bool shared)
704 {
705 	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
706 	bool zap_all = (start == 0 && end >= max_gfn_host);
707 	struct tdp_iter iter;
708 
709 	/*
710 	 * No need to try to step down in the iterator when zapping all SPTEs,
711 	 * zapping the top-level non-leaf SPTEs will recurse on their children.
712 	 */
713 	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
714 
715 	/*
716 	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
717 	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
718 	 * and so KVM will never install a SPTE for such addresses.
719 	 */
720 	end = min(end, max_gfn_host);
721 
722 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
723 
724 	rcu_read_lock();
725 
726 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
727 				   min_level, start, end) {
728 retry:
729 		if (can_yield &&
730 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
731 			flush = false;
732 			continue;
733 		}
734 
735 		if (!is_shadow_present_pte(iter.old_spte))
736 			continue;
737 
738 		/*
739 		 * If this is a non-last-level SPTE that covers a larger range
740 		 * than should be zapped, continue, and zap the mappings at a
741 		 * lower level, except when zapping all SPTEs.
742 		 */
743 		if (!zap_all &&
744 		    (iter.gfn < start ||
745 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
746 		    !is_last_spte(iter.old_spte, iter.level))
747 			continue;
748 
749 		if (!shared) {
750 			tdp_mmu_set_spte(kvm, &iter, 0);
751 			flush = true;
752 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
753 			/*
754 			 * The iter must explicitly re-read the SPTE because
755 			 * the atomic cmpxchg failed.
756 			 */
757 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
758 			goto retry;
759 		}
760 	}
761 
762 	rcu_read_unlock();
763 	return flush;
764 }
765 
766 /*
767  * Tears down the mappings for the range of gfns, [start, end), and frees the
768  * non-root pages mapping GFNs strictly within that range. Returns true if
769  * SPTEs have been cleared and a TLB flush is needed before releasing the
770  * MMU lock.
771  */
772 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
773 				 gfn_t end, bool can_yield, bool flush)
774 {
775 	struct kvm_mmu_page *root;
776 
777 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
778 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
779 				      false);
780 
781 	return flush;
782 }
783 
784 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
785 {
786 	bool flush = false;
787 	int i;
788 
789 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
790 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
791 
792 	if (flush)
793 		kvm_flush_remote_tlbs(kvm);
794 }
795 
796 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
797 						  struct kvm_mmu_page *prev_root)
798 {
799 	struct kvm_mmu_page *next_root;
800 
801 	if (prev_root)
802 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
803 						  &prev_root->link,
804 						  typeof(*prev_root), link);
805 	else
806 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
807 						   typeof(*next_root), link);
808 
809 	while (next_root && !(next_root->role.invalid &&
810 			      refcount_read(&next_root->tdp_mmu_root_count)))
811 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
812 						  &next_root->link,
813 						  typeof(*next_root), link);
814 
815 	return next_root;
816 }
817 
818 /*
819  * Since kvm_tdp_mmu_invalidate_all_roots() has acquired a reference to each
820  * invalidated root, they will not be freed until this function drops the
821  * reference. Before dropping that reference, tear down the paging
822  * structure so that whichever thread does drop the last reference
823  * only has to do a trivial amount of work. Since the roots are invalid,
824  * no new SPTEs should be created under them.
825  */
826 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
827 {
828 	struct kvm_mmu_page *next_root;
829 	struct kvm_mmu_page *root;
830 	bool flush = false;
831 
832 	lockdep_assert_held_read(&kvm->mmu_lock);
833 
834 	rcu_read_lock();
835 
836 	root = next_invalidated_root(kvm, NULL);
837 
838 	while (root) {
839 		next_root = next_invalidated_root(kvm, root);
840 
841 		rcu_read_unlock();
842 
843 		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
844 
845 		/*
846 		 * Put the reference acquired in
847 		 * kvm_tdp_mmu_invalidate_all_roots().
848 		 */
849 		kvm_tdp_mmu_put_root(kvm, root, true);
850 
851 		root = next_root;
852 
853 		rcu_read_lock();
854 	}
855 
856 	rcu_read_unlock();
857 
858 	if (flush)
859 		kvm_flush_remote_tlbs(kvm);
860 }
861 
862 /*
863  * Mark each TDP MMU root as invalid so that other threads
864  * will drop their references and allow the root count to
865  * go to 0.
866  *
867  * Also take a reference on all roots so that this thread
868  * can do the bulk of the work required to free the roots
869  * once they are invalidated. Without this reference, a
870  * vCPU thread might drop the last reference to a root and
871  * get stuck with tearing down the entire paging structure.
872  *
873  * Roots which have a zero refcount should be skipped as
874  * they're already being torn down.
875  * Already invalid roots should be referenced again so that
876  * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots() is
877  * done with them.
878  *
879  * This has essentially the same effect for the TDP MMU
880  * as updating mmu_valid_gen does for the shadow MMU.
881  */
882 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
883 {
884 	struct kvm_mmu_page *root;
885 
886 	lockdep_assert_held_write(&kvm->mmu_lock);
887 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
888 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
889 			root->role.invalid = true;
890 }
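/*
 * A sketch of how the two functions above are paired by the fast-zap path
 * (see kvm_mmu_zap_all_fast() in mmu.c): roots are invalidated under the
 * write lock, then zapped while holding only the read lock:
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);
 *	write_unlock(&kvm->mmu_lock);
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 *	read_unlock(&kvm->mmu_lock);
 */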
891 
892 /*
893  * Installs a last-level SPTE to handle a TDP page fault.
894  * (NPT/EPT violation/misconfiguration)
895  */
896 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
897 					  struct kvm_page_fault *fault,
898 					  struct tdp_iter *iter)
899 {
900 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
901 	u64 new_spte;
902 	int ret = RET_PF_FIXED;
903 	bool wrprot = false;
904 
905 	WARN_ON(sp->role.level != fault->goal_level);
906 	if (unlikely(!fault->slot))
907 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
908 	else
909 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
910 					 fault->pfn, iter->old_spte, fault->prefetch, true,
911 					 fault->map_writable, &new_spte);
912 
913 	if (new_spte == iter->old_spte)
914 		ret = RET_PF_SPURIOUS;
915 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
916 		return RET_PF_RETRY;
917 
918 	/*
919 	 * If the page fault was caused by a write but the page is write
920 	 * protected, emulation is needed. If the emulation was skipped,
921 	 * the vCPU would have the same fault again.
922 	 */
923 	if (wrprot) {
924 		if (fault->write)
925 			ret = RET_PF_EMULATE;
926 	}
927 
928 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
929 	if (unlikely(is_mmio_spte(new_spte))) {
930 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
931 				     new_spte);
932 		ret = RET_PF_EMULATE;
933 	} else {
934 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
935 				       rcu_dereference(iter->sptep));
936 	}
937 
938 	/*
939 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
940 	 * consistent with legacy MMU behavior.
941 	 */
942 	if (ret != RET_PF_SPURIOUS)
943 		vcpu->stat.pf_fixed++;
944 
945 	return ret;
946 }
947 
948 /*
949  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
950  * page tables and SPTEs to translate the faulting guest physical address.
951  */
952 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
953 {
954 	struct kvm_mmu *mmu = vcpu->arch.mmu;
955 	struct tdp_iter iter;
956 	struct kvm_mmu_page *sp;
957 	u64 *child_pt;
958 	u64 new_spte;
959 	int ret;
960 
961 	kvm_mmu_hugepage_adjust(vcpu, fault);
962 
963 	trace_kvm_mmu_spte_requested(fault);
964 
965 	rcu_read_lock();
966 
967 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
968 		if (fault->nx_huge_page_workaround_enabled)
969 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
970 
971 		if (iter.level == fault->goal_level)
972 			break;
973 
974 		/*
975 		 * If there is an SPTE mapping a large page at a higher level
976 		 * than the target, that SPTE must be cleared and replaced
977 		 * with a non-leaf SPTE.
978 		 */
979 		if (is_shadow_present_pte(iter.old_spte) &&
980 		    is_large_pte(iter.old_spte)) {
981 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
982 				break;
983 
984 			/*
985 			 * The iter must explicitly re-read the spte here
986 			 * because the new value informs the !present
987 			 * path below.
988 			 */
989 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
990 		}
991 
992 		if (!is_shadow_present_pte(iter.old_spte)) {
993 			/*
994 			 * If SPTE has been frozen by another thread, just
995 			 * give up and retry, avoiding unnecessary page table
996 			 * allocation and free.
997 			 */
998 			if (is_removed_spte(iter.old_spte))
999 				break;
1000 
1001 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1002 			child_pt = sp->spt;
1003 
1004 			new_spte = make_nonleaf_spte(child_pt,
1005 						     !shadow_accessed_mask);
1006 
1007 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
1008 				tdp_mmu_link_page(vcpu->kvm, sp,
1009 						  fault->huge_page_disallowed &&
1010 						  fault->req_level >= iter.level);
1011 
1012 				trace_kvm_mmu_get_page(sp, true);
1013 			} else {
1014 				tdp_mmu_free_sp(sp);
1015 				break;
1016 			}
1017 		}
1018 	}
1019 
1020 	if (iter.level != fault->goal_level) {
1021 		rcu_read_unlock();
1022 		return RET_PF_RETRY;
1023 	}
1024 
1025 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1026 	rcu_read_unlock();
1027 
1028 	return ret;
1029 }
1030 
1031 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1032 				 bool flush)
1033 {
1034 	struct kvm_mmu_page *root;
1035 
1036 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1037 		flush |= zap_gfn_range(kvm, root, range->start, range->end,
1038 				       range->may_block, flush, false);
1039 
1040 	return flush;
1041 }
1042 
1043 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1044 			      struct kvm_gfn_range *range);
1045 
1046 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1047 						   struct kvm_gfn_range *range,
1048 						   tdp_handler_t handler)
1049 {
1050 	struct kvm_mmu_page *root;
1051 	struct tdp_iter iter;
1052 	bool ret = false;
1053 
1054 	rcu_read_lock();
1055 
1056 	/*
1057 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1058 	 * into this helper allow blocking; it'd be dead, wasteful code.
1059 	 */
1060 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1061 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1062 			ret |= handler(kvm, &iter, range);
1063 	}
1064 
1065 	rcu_read_unlock();
1066 
1067 	return ret;
1068 }
1069 
1070 /*
1071  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
1072  * return non-zero if any of the GFNs in the range have been accessed.
1073  */
1074 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1075 			  struct kvm_gfn_range *range)
1076 {
1077 	u64 new_spte = 0;
1078 
1079 	/* If we have a non-accessed entry we don't need to change the pte. */
1080 	if (!is_accessed_spte(iter->old_spte))
1081 		return false;
1082 
1083 	new_spte = iter->old_spte;
1084 
1085 	if (spte_ad_enabled(new_spte)) {
1086 		new_spte &= ~shadow_accessed_mask;
1087 	} else {
1088 		/*
1089 		 * Capture the dirty status of the page, so that it doesn't get
1090 		 * lost when the SPTE is marked for access tracking.
1091 		 */
1092 		if (is_writable_pte(new_spte))
1093 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1094 
1095 		new_spte = mark_spte_for_access_track(new_spte);
1096 	}
1097 
1098 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1099 
1100 	return true;
1101 }
1102 
1103 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1104 {
1105 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1106 }
1107 
1108 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1109 			 struct kvm_gfn_range *range)
1110 {
1111 	return is_accessed_spte(iter->old_spte);
1112 }
1113 
1114 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1115 {
1116 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1117 }
1118 
1119 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1120 			 struct kvm_gfn_range *range)
1121 {
1122 	u64 new_spte;
1123 
1124 	/* Huge pages aren't expected to be modified without first being zapped. */
1125 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1126 
1127 	if (iter->level != PG_LEVEL_4K ||
1128 	    !is_shadow_present_pte(iter->old_spte))
1129 		return false;
1130 
1131 	/*
1132 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1133 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1134 	 * invariant that the PFN of a present leaf SPTE can never change.
1135 	 * See __handle_changed_spte().
1136 	 */
1137 	tdp_mmu_set_spte(kvm, iter, 0);
1138 
1139 	if (!pte_write(range->pte)) {
1140 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1141 								  pte_pfn(range->pte));
1142 
1143 		tdp_mmu_set_spte(kvm, iter, new_spte);
1144 	}
1145 
1146 	return true;
1147 }
1148 
1149 /*
1150  * Handle the changed_pte MMU notifier for the TDP MMU.
1151  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1152  * notifier.
1153  * Returns non-zero if a flush is needed before releasing the MMU lock.
1154  */
1155 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1156 {
1157 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1158 
1159 	/* FIXME: return 'flush' instead of flushing here. */
1160 	if (flush)
1161 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1162 
1163 	return false;
1164 }
1165 
1166 /*
1167  * Remove write access from all SPTEs at or above min_level that map GFNs
1168  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1169  * be flushed.
1170  */
1171 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1172 			     gfn_t start, gfn_t end, int min_level)
1173 {
1174 	struct tdp_iter iter;
1175 	u64 new_spte;
1176 	bool spte_set = false;
1177 
1178 	rcu_read_lock();
1179 
1180 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1181 
1182 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1183 				   min_level, start, end) {
1184 retry:
1185 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1186 			continue;
1187 
1188 		if (!is_shadow_present_pte(iter.old_spte) ||
1189 		    !is_last_spte(iter.old_spte, iter.level) ||
1190 		    !(iter.old_spte & PT_WRITABLE_MASK))
1191 			continue;
1192 
1193 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1194 
1195 		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
1196 			/*
1197 			 * The iter must explicitly re-read the SPTE because
1198 			 * the atomic cmpxchg failed.
1199 			 */
1200 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1201 			goto retry;
1202 		}
1203 		spte_set = true;
1204 	}
1205 
1206 	rcu_read_unlock();
1207 	return spte_set;
1208 }
1209 
1210 /*
1211  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1212  * only affect leaf SPTEs down to min_level.
1213  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1214  */
1215 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1216 			     const struct kvm_memory_slot *slot, int min_level)
1217 {
1218 	struct kvm_mmu_page *root;
1219 	bool spte_set = false;
1220 
1221 	lockdep_assert_held_read(&kvm->mmu_lock);
1222 
1223 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1224 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1225 			     slot->base_gfn + slot->npages, min_level);
1226 
1227 	return spte_set;
1228 }
1229 
1230 /*
1231  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1232  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1233  * If AD bits are not enabled, this will require clearing the writable bit on
1234  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1235  * be flushed.
1236  */
1237 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1238 			   gfn_t start, gfn_t end)
1239 {
1240 	struct tdp_iter iter;
1241 	u64 new_spte;
1242 	bool spte_set = false;
1243 
1244 	rcu_read_lock();
1245 
1246 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1247 retry:
1248 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1249 			continue;
1250 
1251 		if (spte_ad_need_write_protect(iter.old_spte)) {
1252 			if (is_writable_pte(iter.old_spte))
1253 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1254 			else
1255 				continue;
1256 		} else {
1257 			if (iter.old_spte & shadow_dirty_mask)
1258 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1259 			else
1260 				continue;
1261 		}
1262 
1263 		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
1264 			/*
1265 			 * The iter must explicitly re-read the SPTE because
1266 			 * the atomic cmpxchg failed.
1267 			 */
1268 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1269 			goto retry;
1270 		}
1271 		spte_set = true;
1272 	}
1273 
1274 	rcu_read_unlock();
1275 	return spte_set;
1276 }
1277 
1278 /*
1279  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1280  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1281  * If AD bits are not enabled, this will require clearing the writable bit on
1282  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1283  * be flushed.
1284  */
1285 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1286 				  const struct kvm_memory_slot *slot)
1287 {
1288 	struct kvm_mmu_page *root;
1289 	bool spte_set = false;
1290 
1291 	lockdep_assert_held_read(&kvm->mmu_lock);
1292 
1293 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1294 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1295 				slot->base_gfn + slot->npages);
1296 
1297 	return spte_set;
1298 }
1299 
1300 /*
1301  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1302  * set in mask, starting at gfn. The given memslot is expected to contain all
1303  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1304  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1305  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1306  */
1307 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1308 				  gfn_t gfn, unsigned long mask, bool wrprot)
1309 {
1310 	struct tdp_iter iter;
1311 	u64 new_spte;
1312 
1313 	rcu_read_lock();
1314 
1315 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1316 				    gfn + BITS_PER_LONG) {
1317 		if (!mask)
1318 			break;
1319 
1320 		if (iter.level > PG_LEVEL_4K ||
1321 		    !(mask & (1UL << (iter.gfn - gfn))))
1322 			continue;
1323 
1324 		mask &= ~(1UL << (iter.gfn - gfn));
1325 
1326 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1327 			if (is_writable_pte(iter.old_spte))
1328 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1329 			else
1330 				continue;
1331 		} else {
1332 			if (iter.old_spte & shadow_dirty_mask)
1333 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1334 			else
1335 				continue;
1336 		}
1337 
1338 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1339 	}
1340 
1341 	rcu_read_unlock();
1342 }
1343 
1344 /*
1345  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1346  * set in mask, starting at gfn. The given memslot is expected to contain all
1347  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1348  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1349  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1350  */
1351 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1352 				       struct kvm_memory_slot *slot,
1353 				       gfn_t gfn, unsigned long mask,
1354 				       bool wrprot)
1355 {
1356 	struct kvm_mmu_page *root;
1357 
1358 	lockdep_assert_held_write(&kvm->mmu_lock);
1359 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1360 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1361 }
1362 
1363 /*
1364  * Clear leaf entries which could be replaced by large mappings, for
1365  * GFNs within the slot.
1366  */
1367 static bool zap_collapsible_spte_range(struct kvm *kvm,
1368 				       struct kvm_mmu_page *root,
1369 				       const struct kvm_memory_slot *slot,
1370 				       bool flush)
1371 {
1372 	gfn_t start = slot->base_gfn;
1373 	gfn_t end = start + slot->npages;
1374 	struct tdp_iter iter;
1375 	kvm_pfn_t pfn;
1376 
1377 	rcu_read_lock();
1378 
1379 	tdp_root_for_each_pte(iter, root, start, end) {
1380 retry:
1381 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1382 			flush = false;
1383 			continue;
1384 		}
1385 
1386 		if (!is_shadow_present_pte(iter.old_spte) ||
1387 		    !is_last_spte(iter.old_spte, iter.level))
1388 			continue;
1389 
1390 		pfn = spte_to_pfn(iter.old_spte);
1391 		if (kvm_is_reserved_pfn(pfn) ||
1392 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1393 							    pfn, PG_LEVEL_NUM))
1394 			continue;
1395 
1396 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1397 			/*
1398 			 * The iter must explicitly re-read the SPTE because
1399 			 * the atomic cmpxchg failed.
1400 			 */
1401 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1402 			goto retry;
1403 		}
1404 		flush = true;
1405 	}
1406 
1407 	rcu_read_unlock();
1408 
1409 	return flush;
1410 }
1411 
1412 /*
1413  * Clear non-leaf entries (and free associated page tables) which could
1414  * be replaced by large mappings, for GFNs within the slot.
1415  */
1416 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1417 				       const struct kvm_memory_slot *slot,
1418 				       bool flush)
1419 {
1420 	struct kvm_mmu_page *root;
1421 
1422 	lockdep_assert_held_read(&kvm->mmu_lock);
1423 
1424 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1425 		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1426 
1427 	return flush;
1428 }
1429 
1430 /*
1431  * Removes write access on the last level SPTE mapping this GFN and unsets the
1432  * MMU-writable bit to ensure future writes continue to be intercepted.
1433  * Returns true if an SPTE was set and a TLB flush is needed.
1434  */
1435 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1436 			      gfn_t gfn, int min_level)
1437 {
1438 	struct tdp_iter iter;
1439 	u64 new_spte;
1440 	bool spte_set = false;
1441 
1442 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1443 
1444 	rcu_read_lock();
1445 
1446 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1447 				   min_level, gfn, gfn + 1) {
1448 		if (!is_shadow_present_pte(iter.old_spte) ||
1449 		    !is_last_spte(iter.old_spte, iter.level))
1450 			continue;
1451 
1452 		if (!is_writable_pte(iter.old_spte))
1453 			break;
1454 
1455 		new_spte = iter.old_spte &
1456 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1457 
1458 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1459 		spte_set = true;
1460 	}
1461 
1462 	rcu_read_unlock();
1463 
1464 	return spte_set;
1465 }
1466 
1467 /*
1468  * Removes write access on the last level SPTE mapping this GFN and unsets the
1469  * MMU-writable bit to ensure future writes continue to be intercepted.
1470  * Returns true if an SPTE was set and a TLB flush is needed.
1471  */
1472 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1473 				   struct kvm_memory_slot *slot, gfn_t gfn,
1474 				   int min_level)
1475 {
1476 	struct kvm_mmu_page *root;
1477 	bool spte_set = false;
1478 
1479 	lockdep_assert_held_write(&kvm->mmu_lock);
1480 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1481 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1482 
1483 	return spte_set;
1484 }
1485 
1486 /*
1487  * Return the level of the lowest level SPTE added to sptes.
1488  * That SPTE may be non-present.
1489  *
1490  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1491  */
1492 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1493 			 int *root_level)
1494 {
1495 	struct tdp_iter iter;
1496 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1497 	gfn_t gfn = addr >> PAGE_SHIFT;
1498 	int leaf = -1;
1499 
1500 	*root_level = vcpu->arch.mmu->shadow_root_level;
1501 
1502 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1503 		leaf = iter.level;
1504 		sptes[leaf] = iter.old_spte;
1505 	}
1506 
1507 	return leaf;
1508 }
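/*
 * A sketch of the expected calling convention (e.g. the lockless MMIO SPTE
 * walk in mmu.c): the caller provides an sptes[] array with room for every
 * level, i.e. PT64_ROOT_MAX_LEVEL + 1 entries, and brackets the call with
 * kvm_tdp_mmu_walk_lockless_{begin,end}.
 */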
1509 
1510 /*
1511  * Returns the last level spte pointer of the shadow page walk for the given
1512  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1513  * walk could be performed, returns NULL and *spte does not contain valid data.
1514  *
1515  * Contract:
1516  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1517  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1518  *
1519  * WARNING: This function is only intended to be called during fast_page_fault.
1520  */
1521 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1522 					u64 *spte)
1523 {
1524 	struct tdp_iter iter;
1525 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1526 	gfn_t gfn = addr >> PAGE_SHIFT;
1527 	tdp_ptep_t sptep = NULL;
1528 
1529 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1530 		*spte = iter.old_spte;
1531 		sptep = iter.sptep;
1532 	}
1533 
1534 	/*
1535 	 * Perform the rcu_dereference to get the raw spte pointer value since
1536 	 * we are passing it up to fast_page_fault, which is shared with the
1537 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1538 	 * annotation.
1539 	 *
1540 	 * This is safe since fast_page_fault obeys the contracts of this
1541 	 * function as well as all TDP MMU contracts around modifying SPTEs
1542 	 * outside of mmu_lock.
1543 	 */
1544 	return rcu_dereference(sptep);
1545 }
1546