xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 16c8d76a)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled.  Returns 1 if in use, 0 if not, -ENOMEM on failure. */
17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	struct workqueue_struct *wq;
20 
21 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 		return 0;
23 
24 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 	if (!wq)
26 		return -ENOMEM;
27 
28 	/* This should not be changed for the lifetime of the VM. */
29 	kvm->arch.tdp_mmu_enabled = true;
30 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 	kvm->arch.tdp_mmu_zap_wq = wq;
34 	return 1;
35 }
36 
37 /* Arbitrarily returns true so that this may be used in if statements. */
38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 							     bool shared)
40 {
41 	if (shared)
42 		lockdep_assert_held_read(&kvm->mmu_lock);
43 	else
44 		lockdep_assert_held_write(&kvm->mmu_lock);
45 
46 	return true;
47 }
48 
49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
50 {
51 	if (!kvm->arch.tdp_mmu_enabled)
52 		return;
53 
54 	/* Also waits for any queued work items.  */
55 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56 
57 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
58 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
59 
60 	/*
61 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
63 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
64 	 */
65 	rcu_barrier();
66 }
67 
68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
69 {
70 	free_page((unsigned long)sp->spt);
71 	kmem_cache_free(mmu_page_header_cache, sp);
72 }
73 
74 /*
75  * This is called through call_rcu in order to free TDP page table memory
76  * safely with respect to other kernel threads that may be operating on
77  * the memory.
78  * Because TDP MMU page table memory is only accessed inside an RCU read
79  * critical section, and is only freed after a grace period has elapsed,
80  * lockless walkers will never use the memory after it has been freed.
81  */
82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
83 {
84 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 					       rcu_head);
86 
87 	tdp_mmu_free_sp(sp);
88 }
89 
90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
91 			     bool shared);
92 
93 static void tdp_mmu_zap_root_work(struct work_struct *work)
94 {
95 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 						 tdp_mmu_async_work);
97 	struct kvm *kvm = root->tdp_mmu_async_data;
98 
99 	read_lock(&kvm->mmu_lock);
100 
101 	/*
102 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 	 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
104 	 * to a different pCPU.  Note, the local TLB flush on reuse also
105 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 	 * intermediate paging structures, that may be zapped, as such entries
107 	 * are associated with the ASID on both VMX and SVM.
108 	 */
109 	tdp_mmu_zap_root(kvm, root, true);
110 
111 	/*
112 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 	 * avoiding an infinite loop.  By design, the root is reachable while
114 	 * it's being asynchronously zapped, thus a different task can put its
115 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 	 * asynchronously zapped root is unavoidable.
117 	 */
118 	kvm_tdp_mmu_put_root(kvm, root, true);
119 
120 	read_unlock(&kvm->mmu_lock);
121 }
122 
123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124 {
125 	root->tdp_mmu_async_data = kvm;
126 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
128 }
129 
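/*
 * Atomically flag the root as invalid and return its previous "invalid"
 * state, i.e. return false iff this call performed the valid->invalid
 * transition.  kvm_tdp_mmu_put_root() relies on the return value to decide
 * whether the root still needs to be handed to the zap worker.
 */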
130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
131 {
132 	union kvm_mmu_page_role role = page->role;
133 	role.invalid = true;
134 
135 	/* No need to use cmpxchg, only the invalid bit can change.  */
136 	role.word = xchg(&page->role.word, role.word);
137 	return role.invalid;
138 }
139 
140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
141 			  bool shared)
142 {
143 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
144 
145 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
146 		return;
147 
148 	WARN_ON(!root->tdp_mmu_page);
149 
150 	/*
151 	 * The root now has refcount=0.  It is valid, but readers already
152 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 	 * rejects it.  This remains true for the rest of the execution
154 	 * of this function, because readers visit valid roots only
155 	 * (except for tdp_mmu_zap_root_work(), which however
156 	 * does not acquire any reference itself).
157 	 *
158 	 * Even though there are flows that need to visit all roots for
159 	 * correctness, they all take mmu_lock for write, so they cannot yet
160 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 	 * since the root still has refcount=0.
162 	 *
163 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 	 * So the root temporarily gets an extra reference, going to refcount=1
166 	 * while staying invalid.  Readers still cannot acquire any reference;
167 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
168 	 * they might take an extra reference if they themselves yield.
169 	 * Therefore, when the reference is given back by the worker,
170 	 * there is no guarantee that the refcount is still 1.  If not, whoever
171 	 * puts the last reference will free the page, but they will not have to
172 	 * zap the root because a root cannot go from invalid to valid.
173 	 */
174 	if (!kvm_tdp_root_mark_invalid(root)) {
175 		refcount_set(&root->tdp_mmu_root_count, 1);
176 
177 		/*
178 		 * Zapping the root in a worker is not just "nice to have";
179 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
181 		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 		 * might return with some roots not zapped yet.
183 		 */
184 		tdp_mmu_schedule_zap_root(kvm, root);
185 		return;
186 	}
187 
188 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 	list_del_rcu(&root->link);
190 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
191 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
192 }
193 
194 /*
195  * Returns the next root after @prev_root (or the first root if @prev_root is
196  * NULL).  A reference to the returned root is acquired, and the reference to
197  * @prev_root is released (the caller obviously must hold a reference to
198  * @prev_root if it's non-NULL).
199  *
200  * If @only_valid is true, invalid roots are skipped.
201  *
202  * Returns NULL if the end of tdp_mmu_roots was reached.
203  */
204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
205 					      struct kvm_mmu_page *prev_root,
206 					      bool shared, bool only_valid)
207 {
208 	struct kvm_mmu_page *next_root;
209 
210 	rcu_read_lock();
211 
212 	if (prev_root)
213 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 						  &prev_root->link,
215 						  typeof(*prev_root), link);
216 	else
217 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 						   typeof(*next_root), link);
219 
220 	while (next_root) {
221 		if ((!only_valid || !next_root->role.invalid) &&
222 		    kvm_tdp_mmu_get_root(next_root))
223 			break;
224 
225 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 				&next_root->link, typeof(*next_root), link);
227 	}
228 
229 	rcu_read_unlock();
230 
231 	if (prev_root)
232 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
233 
234 	return next_root;
235 }
236 
237 /*
238  * Note: this iterator gets and puts references to the roots it iterates over.
239  * This makes it safe to release the MMU lock and yield within the loop, but
240  * if exiting the loop early, the caller must drop the reference to the most
241  * recent root. (Unless keeping a live reference is desirable.)
242  *
243  * If shared is set, this function is operating under the MMU lock in read
244  * mode. In the unlikely event that this thread must free a root, the lock
245  * will be temporarily dropped and reacquired in write mode.
246  */
247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
248 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
249 	     _root;								\
250 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
251 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
252 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
253 		} else
254 
255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
256 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
257 
258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
259 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
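
/*
 * Example usage, mirroring kvm_tdp_mmu_zap_leafs() below: walk every root in
 * the address space, allowing the walk to yield and drop mmu_lock safely:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
 */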
260 
261 /*
262  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
263  * the implication being that any flow that holds mmu_lock for read is
264  * inherently yield-friendly and should use the yield-safe variant above.
265  * Holding mmu_lock for write obviates the need for RCU protection as the list
266  * is guaranteed to be stable.
267  */
268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
269 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
270 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
271 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
272 		} else
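
/*
 * Example usage, mirroring kvm_tdp_mmu_get_vcpu_root_hpa() below: check for
 * an existing root with a matching role while holding mmu_lock for write:
 *
 *	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
 *		if (root->role.word == role.word &&
 *		    kvm_tdp_mmu_get_root(root))
 *			goto out;
 *	}
 */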
273 
274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
275 {
276 	struct kvm_mmu_page *sp;
277 
278 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
280 
281 	return sp;
282 }
283 
284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 			    gfn_t gfn, union kvm_mmu_page_role role)
286 {
287 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
288 
289 	sp->role = role;
290 	sp->gfn = gfn;
291 	sp->ptep = sptep;
292 	sp->tdp_mmu_page = true;
293 
294 	trace_kvm_mmu_get_page(sp, true);
295 }
296 
297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 				  struct tdp_iter *iter)
299 {
300 	struct kvm_mmu_page *parent_sp;
301 	union kvm_mmu_page_role role;
302 
303 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
304 
305 	role = parent_sp->role;
306 	role.level--;
307 
308 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
309 }
310 
311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
312 {
313 	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
314 	struct kvm *kvm = vcpu->kvm;
315 	struct kvm_mmu_page *root;
316 
317 	lockdep_assert_held_write(&kvm->mmu_lock);
318 
319 	/*
320 	 * Check for an existing root before allocating a new one.  Note, the
321 	 * role check prevents consuming an invalid root.
322 	 */
323 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
324 		if (root->role.word == role.word &&
325 		    kvm_tdp_mmu_get_root(root))
326 			goto out;
327 	}
328 
329 	root = tdp_mmu_alloc_sp(vcpu);
330 	tdp_mmu_init_sp(root, NULL, 0, role);
331 
332 	refcount_set(&root->tdp_mmu_root_count, 1);
333 
334 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
337 
338 out:
339 	return __pa(root->spt);
340 }
341 
342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 				u64 old_spte, u64 new_spte, int level,
344 				bool shared);
345 
346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
347 {
348 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
349 		return;
350 
351 	if (is_accessed_spte(old_spte) &&
352 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
354 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
355 }
356 
357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 					  u64 old_spte, u64 new_spte, int level)
359 {
360 	bool pfn_changed;
361 	struct kvm_memory_slot *slot;
362 
363 	if (level > PG_LEVEL_4K)
364 		return;
365 
366 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
367 
368 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 	    is_writable_pte(new_spte)) {
370 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
371 		mark_page_dirty_in_slot(kvm, slot, gfn);
372 	}
373 }
374 
375 /**
376  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
377  *
378  * @kvm: kvm instance
379  * @sp: the page to be removed
380  * @shared: This operation may not be running under the exclusive use of
381  *	    the MMU lock and the operation must synchronize with other
382  *	    threads that might be adding or removing pages.
383  */
384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 			      bool shared)
386 {
387 	if (shared)
388 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 	else
390 		lockdep_assert_held_write(&kvm->mmu_lock);
391 
392 	list_del(&sp->link);
393 	if (sp->lpage_disallowed)
394 		unaccount_huge_nx_page(kvm, sp);
395 
396 	if (shared)
397 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
398 }
399 
400 /**
401  * handle_removed_pt() - handle a page table removed from the TDP structure
402  *
403  * @kvm: kvm instance
404  * @pt: the page removed from the paging structure
405  * @shared: This operation may not be running under the exclusive use
406  *	    of the MMU lock and the operation must synchronize with other
407  *	    threads that might be modifying SPTEs.
408  *
409  * Given a page table that has been removed from the TDP paging structure,
410  * iterates through the page table to clear SPTEs and free child page tables.
411  *
412  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413  * protection. Since this thread removed it from the paging structure,
414  * this thread will be responsible for ensuring the page is freed. Hence the
415  * early rcu_dereferences in the function.
416  */
417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
418 {
419 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
420 	int level = sp->role.level;
421 	gfn_t base_gfn = sp->gfn;
422 	int i;
423 
424 	trace_kvm_mmu_prepare_zap_page(sp);
425 
426 	tdp_mmu_unlink_sp(kvm, sp, shared);
427 
428 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
429 		u64 *sptep = rcu_dereference(pt) + i;
430 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
431 		u64 old_child_spte;
432 
433 		if (shared) {
434 			/*
435 			 * Set the SPTE to a nonpresent value that other
436 			 * threads will not overwrite. If the SPTE was
437 			 * already marked as removed then another thread
438 			 * handling a page fault could overwrite it, so
439 			 * keep retrying the exchange until this thread
440 			 * observes and replaces a non-removed value.
441 			 */
442 			for (;;) {
443 				old_child_spte = xchg(sptep, REMOVED_SPTE);
444 				if (!is_removed_spte(old_child_spte))
445 					break;
446 				cpu_relax();
447 			}
448 		} else {
449 			/*
450 			 * If the SPTE is not MMU-present, there is no backing
451 			 * page associated with the SPTE and so no side effects
452 			 * that need to be recorded, and exclusive ownership of
453 			 * mmu_lock ensures the SPTE can't be made present.
454 			 * Note, zapping MMIO SPTEs is also unnecessary as they
455 			 * are guarded by the memslots generation, not by being
456 			 * unreachable.
457 			 */
458 			old_child_spte = READ_ONCE(*sptep);
459 			if (!is_shadow_present_pte(old_child_spte))
460 				continue;
461 
462 			/*
463 			 * Marking the SPTE as a removed SPTE is not
464 			 * strictly necessary here as the MMU lock will
465 			 * stop other threads from concurrently modifying
466 			 * this SPTE. Using the removed SPTE value keeps
467 			 * the two branches consistent and simplifies
468 			 * the function.
469 			 */
470 			WRITE_ONCE(*sptep, REMOVED_SPTE);
471 		}
472 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
473 				    old_child_spte, REMOVED_SPTE, level,
474 				    shared);
475 	}
476 
477 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
478 }
479 
480 /**
481  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
482  * @kvm: kvm instance
483  * @as_id: the address space of the paging structure the SPTE was a part of
484  * @gfn: the base GFN that was mapped by the SPTE
485  * @old_spte: The value of the SPTE before the change
486  * @new_spte: The value of the SPTE after the change
487  * @level: the level of the PT the SPTE is part of in the paging structure
488  * @shared: This operation may not be running under the exclusive use of
489  *	    the MMU lock and the operation must synchronize with other
490  *	    threads that might be modifying SPTEs.
491  *
492  * Handle bookkeeping that might result from the modification of a SPTE.
493  * This function must be called for all TDP SPTE modifications.
494  */
495 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
496 				  u64 old_spte, u64 new_spte, int level,
497 				  bool shared)
498 {
499 	bool was_present = is_shadow_present_pte(old_spte);
500 	bool is_present = is_shadow_present_pte(new_spte);
501 	bool was_leaf = was_present && is_last_spte(old_spte, level);
502 	bool is_leaf = is_present && is_last_spte(new_spte, level);
503 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
504 
505 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
506 	WARN_ON(level < PG_LEVEL_4K);
507 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
508 
509 	/*
510 	 * If this warning were to trigger it would indicate that there was a
511 	 * missing MMU notifier or a race with some notifier handler.
512 	 * A present, leaf SPTE should never be directly replaced with another
513 	 * present leaf SPTE pointing to a different PFN. A notifier handler
514 	 * should be zapping the SPTE before the main MM's page table is
515 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
516 	 * thread before replacement.
517 	 */
518 	if (was_leaf && is_leaf && pfn_changed) {
519 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
520 		       "SPTE with another present leaf SPTE mapping a\n"
521 		       "different PFN!\n"
522 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
523 		       as_id, gfn, old_spte, new_spte, level);
524 
525 		/*
526 		 * Crash the host to prevent error propagation and guest data
527 		 * corruption.
528 		 */
529 		BUG();
530 	}
531 
532 	if (old_spte == new_spte)
533 		return;
534 
535 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
536 
537 	if (is_leaf)
538 		check_spte_writable_invariants(new_spte);
539 
540 	/*
541 	 * The only time an SPTE should be changed from one non-present state
542 	 * to another non-present state is when an MMIO entry is installed,
543 	 * modified, or removed. In that case, there is nothing to do here.
544 	 */
545 	if (!was_present && !is_present) {
546 		/*
547 		 * If this change does not involve a MMIO SPTE or removed SPTE,
548 		 * it is unexpected. Log the change, though it should not
549 		 * impact the guest since both the former and current SPTEs
550 		 * are nonpresent.
551 		 */
552 		if (WARN_ON(!is_mmio_spte(old_spte) &&
553 			    !is_mmio_spte(new_spte) &&
554 			    !is_removed_spte(new_spte)))
555 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
556 			       "should not be replaced with another,\n"
557 			       "different nonpresent SPTE, unless one or both\n"
558 			       "are MMIO SPTEs, or the new SPTE is\n"
559 			       "a temporary removed SPTE.\n"
560 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
561 			       as_id, gfn, old_spte, new_spte, level);
562 		return;
563 	}
564 
565 	if (is_leaf != was_leaf)
566 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
567 
568 	if (was_leaf && is_dirty_spte(old_spte) &&
569 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
570 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
571 
572 	/*
573 	 * Recursively handle child PTs if the change removed a subtree from
574 	 * the paging structure.  Note the WARN on the PFN changing without the
575 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
576 	 * pages are kernel allocations and should never be migrated.
577 	 */
578 	if (was_present && !was_leaf &&
579 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
580 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
581 }
582 
583 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
584 				u64 old_spte, u64 new_spte, int level,
585 				bool shared)
586 {
587 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
588 			      shared);
589 	handle_changed_spte_acc_track(old_spte, new_spte, level);
590 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
591 				      new_spte, level);
592 }
593 
594 /*
595  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
596  * and handle the associated bookkeeping.  Do not mark the page dirty
597  * in KVM's dirty bitmaps.
598  *
599  * If setting the SPTE fails because it has changed, iter->old_spte will be
600  * refreshed to the current value of the spte.
601  *
602  * @kvm: kvm instance
603  * @iter: a tdp_iter instance currently on the SPTE that should be set
604  * @new_spte: The value the SPTE should be set to
605  * Return:
606  * * 0      - If the SPTE was set.
607  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
608  *            no side-effects other than setting iter->old_spte to the last
609  *            known value of the spte.
610  */
611 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
612 					  struct tdp_iter *iter,
613 					  u64 new_spte)
614 {
615 	u64 *sptep = rcu_dereference(iter->sptep);
616 	u64 old_spte;
617 
618 	/*
619 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
620 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
621 	 * and pre-checking before inserting a new SPTE is advantageous as it
622 	 * avoids unnecessary work.
623 	 */
624 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
625 
626 	lockdep_assert_held_read(&kvm->mmu_lock);
627 
628 	/*
629 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
630 	 * does not hold the mmu_lock.
631 	 */
632 	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
633 	if (old_spte != iter->old_spte) {
634 		/*
635 		 * The page table entry was modified by a different logical
636 		 * CPU. Refresh iter->old_spte with the current value so the
637 		 * caller operates on fresh data, e.g. if it retries
638 		 * tdp_mmu_set_spte_atomic().
639 		 */
640 		iter->old_spte = old_spte;
641 		return -EBUSY;
642 	}
643 
644 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
645 			      new_spte, iter->level, true);
646 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
647 
648 	return 0;
649 }
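
/*
 * On -EBUSY, iter->old_spte has already been refreshed, so the typical usage
 * (see wrprot_gfn_range() and clear_dirty_gfn_range() below) recomputes the
 * new value and retries:
 *
 *	retry:
 *		...
 *		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 */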
650 
651 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
652 					  struct tdp_iter *iter)
653 {
654 	int ret;
655 
656 	/*
657 	 * Freeze the SPTE by setting it to a special,
658 	 * non-present value. This will stop other threads from
659 	 * immediately installing a present entry in its place
660 	 * before the TLBs are flushed.
661 	 */
662 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
663 	if (ret)
664 		return ret;
665 
666 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
667 					   KVM_PAGES_PER_HPAGE(iter->level));
668 
669 	/*
670 	 * No other thread can overwrite the removed SPTE as they
671 	 * must either wait on the MMU lock or use
672 	 * tdp_mmu_set_spte_atomic which will not overwrite the
673 	 * special removed SPTE value. No bookkeeping is needed
674 	 * here since the SPTE is going from non-present
675 	 * to non-present.
676 	 */
677 	kvm_tdp_mmu_write_spte(iter->sptep, 0);
678 
679 	return 0;
680 }
681 
682 
683 /*
684  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
685  * @kvm:	      KVM instance
686  * @as_id:	      Address space ID, i.e. regular vs. SMM
687  * @sptep:	      Pointer to the SPTE
688  * @old_spte:	      The current value of the SPTE
689  * @new_spte:	      The new value that will be set for the SPTE
690  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
691  * @level:	      The level _containing_ the SPTE (its parent PT's level)
692  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
693  *		      of the page. Should be set unless handling an MMU
694  *		      notifier for access tracking. Leaving record_acc_track
695  *		      unset in that case prevents page accesses from being
696  *		      double counted.
697  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
698  *		      appropriate for the change being made. Should be set
699  *		      unless performing certain dirty logging operations.
700  *		      Leaving record_dirty_log unset in that case prevents page
701  *		      writes from being double counted.
702  */
703 static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
704 			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
705 			       bool record_acc_track, bool record_dirty_log)
706 {
707 	lockdep_assert_held_write(&kvm->mmu_lock);
708 
709 	/*
710 	 * No thread should be using this function to set SPTEs to or from the
711 	 * temporary removed SPTE value.
712 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
713 	 * should be used. If operating under the MMU lock in write mode, the
714 	 * use of the removed SPTE should not be necessary.
715 	 */
716 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
717 
718 	kvm_tdp_mmu_write_spte(sptep, new_spte);
719 
720 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
721 
722 	if (record_acc_track)
723 		handle_changed_spte_acc_track(old_spte, new_spte, level);
724 	if (record_dirty_log)
725 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
726 					      new_spte, level);
727 }
728 
729 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
730 				     u64 new_spte, bool record_acc_track,
731 				     bool record_dirty_log)
732 {
733 	WARN_ON_ONCE(iter->yielded);
734 
735 	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
736 			   new_spte, iter->gfn, iter->level,
737 			   record_acc_track, record_dirty_log);
738 }
739 
740 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
741 				    u64 new_spte)
742 {
743 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
744 }
745 
746 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
747 						 struct tdp_iter *iter,
748 						 u64 new_spte)
749 {
750 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
751 }
752 
753 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
754 						 struct tdp_iter *iter,
755 						 u64 new_spte)
756 {
757 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
758 }
759 
760 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
761 	for_each_tdp_pte(_iter, _root, _start, _end)
762 
763 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
764 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
765 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
766 		    !is_last_spte(_iter.old_spte, _iter.level))		\
767 			continue;					\
768 		else
769 
770 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
771 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
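
/*
 * Example usage, mirroring kvm_tdp_mmu_map() below: walk the vCPU's current
 * root down to the faulting GFN, one level at a time:
 *
 *	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
 *		if (iter.level == fault->goal_level)
 *			break;
 *		...
 *	}
 */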
772 
773 /*
774  * Yield if the MMU lock is contended or this thread needs to return control
775  * to the scheduler.
776  *
777  * If this function should yield and flush is set, it will perform a remote
778  * TLB flush before yielding.
779  *
780  * If this function yields, iter->yielded is set and the caller must skip to
781  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
782  * over the paging structures to allow the iterator to continue its traversal
783  * from the paging structure root.
784  *
785  * Returns true if this function yielded.
786  */
787 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
788 							  struct tdp_iter *iter,
789 							  bool flush, bool shared)
790 {
791 	WARN_ON(iter->yielded);
792 
793 	/* Ensure forward progress has been made before yielding. */
794 	if (iter->next_last_level_gfn == iter->yielded_gfn)
795 		return false;
796 
797 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
798 		if (flush)
799 			kvm_flush_remote_tlbs(kvm);
800 
801 		rcu_read_unlock();
802 
803 		if (shared)
804 			cond_resched_rwlock_read(&kvm->mmu_lock);
805 		else
806 			cond_resched_rwlock_write(&kvm->mmu_lock);
807 
808 		rcu_read_lock();
809 
810 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
811 
812 		iter->yielded = true;
813 	}
814 
815 	return iter->yielded;
816 }
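
/*
 * Callers invoke this at the top of each loop iteration and simply restart
 * the iteration if it yielded, e.g. as in tdp_mmu_zap_leafs() below:
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *		flush = false;
 *		continue;
 *	}
 */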
817 
818 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
819 {
820 	/*
821 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
822 	 * a gpa range that would exceed the max gfn, and KVM does not create
823 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
824 	 * the slow emulation path every time.
825 	 */
826 	return kvm_mmu_max_gfn() + 1;
827 }
828 
829 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
830 			       bool shared, int zap_level)
831 {
832 	struct tdp_iter iter;
833 
834 	gfn_t end = tdp_mmu_max_gfn_exclusive();
835 	gfn_t start = 0;
836 
837 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
838 retry:
839 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
840 			continue;
841 
842 		if (!is_shadow_present_pte(iter.old_spte))
843 			continue;
844 
845 		if (iter.level > zap_level)
846 			continue;
847 
848 		if (!shared)
849 			tdp_mmu_set_spte(kvm, &iter, 0);
850 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
851 			goto retry;
852 	}
853 }
854 
855 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
856 			     bool shared)
857 {
858 
859 	/*
860 	 * The root must have an elevated refcount so that it's reachable via
861 	 * mmu_notifier callbacks, which allows this path to yield and drop
862 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
863 	 * must drop all references to relevant pages prior to completing the
864 	 * callback.  Dropping mmu_lock with an unreachable root would result
865 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
866 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
867 	 * dirty accessed bits to the SPTE's associated struct page.
868 	 */
869 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
870 
871 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
872 
873 	rcu_read_lock();
874 
875 	/*
876 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
877 	 * split the zap into two passes.  On the first pass, zap at the 1gb
878 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
879 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
880 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
881 	 *
882 	 * Because zapping a SP recurses on its children, stepping down to
883 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
884 	 */
885 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
886 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
887 
888 	rcu_read_unlock();
889 }
890 
891 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
892 {
893 	u64 old_spte;
894 
895 	/*
896 	 * This helper intentionally doesn't allow zapping a root shadow page,
897 	 * which doesn't have a parent page table and thus no associated entry.
898 	 */
899 	if (WARN_ON_ONCE(!sp->ptep))
900 		return false;
901 
902 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
903 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
904 		return false;
905 
906 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
907 			   sp->gfn, sp->role.level + 1, true, true);
908 
909 	return true;
910 }
911 
912 /*
913  * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
914  * have been cleared and a TLB flush is needed before releasing the MMU lock.
915  *
916  * If can_yield is true, will release the MMU lock and reschedule if the
917  * scheduler needs the CPU or there is contention on the MMU lock. If this
918  * function cannot yield, it will not release the MMU lock or reschedule and
919  * the caller must ensure it does not supply too large a GFN range, or the
920  * operation can cause a soft lockup.
921  */
922 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
923 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
924 {
925 	struct tdp_iter iter;
926 
927 	end = min(end, tdp_mmu_max_gfn_exclusive());
928 
929 	lockdep_assert_held_write(&kvm->mmu_lock);
930 
931 	rcu_read_lock();
932 
933 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
934 		if (can_yield &&
935 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
936 			flush = false;
937 			continue;
938 		}
939 
940 		if (!is_shadow_present_pte(iter.old_spte) ||
941 		    !is_last_spte(iter.old_spte, iter.level))
942 			continue;
943 
944 		tdp_mmu_set_spte(kvm, &iter, 0);
945 		flush = true;
946 	}
947 
948 	rcu_read_unlock();
949 
950 	/*
951 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
952 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
953 	 */
954 	return flush;
955 }
956 
957 /*
958  * Tears down the mappings for the range of gfns, [start, end), and frees the
959  * non-root pages mapping GFNs strictly within that range. Returns true if
960  * SPTEs have been cleared and a TLB flush is needed before releasing the
961  * MMU lock.
962  */
963 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
964 			   bool can_yield, bool flush)
965 {
966 	struct kvm_mmu_page *root;
967 
968 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
969 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
970 
971 	return flush;
972 }
973 
974 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
975 {
976 	struct kvm_mmu_page *root;
977 	int i;
978 
979 	/*
980 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
981 	 * before returning to the caller.  Zap directly even if the root is
982 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
983 	 * all that expensive and mmu_lock is already held, which means the
984 	 * worker has yielded, i.e. flushing the work instead of zapping here
985 	 * isn't guaranteed to be any faster.
986 	 *
987 	 * A TLB flush is unnecessary, as KVM zaps everything if and only if the VM
988 	 * is being destroyed or the userspace VMM has exited.  In both cases,
989 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
990 	 */
991 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
992 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
993 			tdp_mmu_zap_root(kvm, root, false);
994 	}
995 }
996 
997 /*
998  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
999  * zap" completes.
1000  */
1001 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1002 {
1003 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1004 }
1005 
1006 /*
1007  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1008  * is about to be zapped, e.g. in response to a memslots update.  The actual
1009  * zapping is performed asynchronously, so a reference is taken on all roots.
1010  * Using a separate workqueue makes it easy to ensure that the destruction is
1011  * performed before the "fast zap" completes, without keeping a separate list
1012  * of invalidated roots; the list is effectively the list of work items in
1013  * the workqueue.
1014  *
1015  * A reference is taken on each root invalidated here, as the asynchronous worker
1016  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1017  * is held for write, it should be impossible to observe a root with zero refcount,
1018  * i.e. the list of roots cannot be stale.
1019  *
1020  * This has essentially the same effect for the TDP MMU
1021  * as updating mmu_valid_gen does for the shadow MMU.
1022  */
1023 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1024 {
1025 	struct kvm_mmu_page *root;
1026 
1027 	lockdep_assert_held_write(&kvm->mmu_lock);
1028 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1029 		if (!root->role.invalid &&
1030 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1031 			root->role.invalid = true;
1032 			tdp_mmu_schedule_zap_root(kvm, root);
1033 		}
1034 	}
1035 }
1036 
1037 /*
1038  * Installs a last-level SPTE to handle a TDP page fault.
1039  * (NPT/EPT violation/misconfiguration)
1040  */
1041 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1042 					  struct kvm_page_fault *fault,
1043 					  struct tdp_iter *iter)
1044 {
1045 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1046 	u64 new_spte;
1047 	int ret = RET_PF_FIXED;
1048 	bool wrprot = false;
1049 
1050 	WARN_ON(sp->role.level != fault->goal_level);
1051 	if (unlikely(!fault->slot))
1052 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1053 	else
1054 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1055 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1056 					 fault->map_writable, &new_spte);
1057 
1058 	if (new_spte == iter->old_spte)
1059 		ret = RET_PF_SPURIOUS;
1060 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1061 		return RET_PF_RETRY;
1062 	else if (is_shadow_present_pte(iter->old_spte) &&
1063 		 !is_last_spte(iter->old_spte, iter->level))
1064 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1065 						   KVM_PAGES_PER_HPAGE(iter->level + 1));
1066 
1067 	/*
1068 	 * If the page fault was caused by a write but the page is write
1069 	 * protected, emulation is needed. If the emulation was skipped,
1070 	 * the vCPU would have the same fault again.
1071 	 */
1072 	if (wrprot) {
1073 		if (fault->write)
1074 			ret = RET_PF_EMULATE;
1075 	}
1076 
1077 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1078 	if (unlikely(is_mmio_spte(new_spte))) {
1079 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1080 				     new_spte);
1081 		ret = RET_PF_EMULATE;
1082 	} else {
1083 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1084 				       rcu_dereference(iter->sptep));
1085 	}
1086 
1087 	/*
1088 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1089 	 * consistent with legacy MMU behavior.
1090 	 */
1091 	if (ret != RET_PF_SPURIOUS)
1092 		vcpu->stat.pf_fixed++;
1093 
1094 	return ret;
1095 }
1096 
1097 /*
1098  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1099  * provided page table.
1100  *
1101  * @kvm: kvm instance
1102  * @iter: a tdp_iter instance currently on the SPTE that should be set
1103  * @sp: The new TDP page table to install.
1104  * @account_nx: True if this page table is being installed to split a
1105  *              non-executable huge page.
1106  * @shared: This operation is running under the MMU lock in read mode.
1107  *
1108  * Returns: 0 if the new page table was installed. Non-0 if the page table
1109  *          could not be installed (e.g. the atomic compare-exchange failed).
1110  */
1111 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1112 			   struct kvm_mmu_page *sp, bool account_nx,
1113 			   bool shared)
1114 {
1115 	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1116 	int ret = 0;
1117 
1118 	if (shared) {
1119 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1120 		if (ret)
1121 			return ret;
1122 	} else {
1123 		tdp_mmu_set_spte(kvm, iter, spte);
1124 	}
1125 
1126 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1127 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1128 	if (account_nx)
1129 		account_huge_nx_page(kvm, sp);
1130 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1131 
1132 	return 0;
1133 }
1134 
1135 /*
1136  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1137  * page tables and SPTEs to translate the faulting guest physical address.
1138  */
1139 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1140 {
1141 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1142 	struct tdp_iter iter;
1143 	struct kvm_mmu_page *sp;
1144 	int ret;
1145 
1146 	kvm_mmu_hugepage_adjust(vcpu, fault);
1147 
1148 	trace_kvm_mmu_spte_requested(fault);
1149 
1150 	rcu_read_lock();
1151 
1152 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1153 		if (fault->nx_huge_page_workaround_enabled)
1154 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1155 
1156 		if (iter.level == fault->goal_level)
1157 			break;
1158 
1159 		/*
1160 		 * If there is an SPTE mapping a large page at a higher level
1161 		 * than the target, that SPTE must be cleared and replaced
1162 		 * with a non-leaf SPTE.
1163 		 */
1164 		if (is_shadow_present_pte(iter.old_spte) &&
1165 		    is_large_pte(iter.old_spte)) {
1166 			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1167 				break;
1168 
1169 			/*
1170 			 * The iter must explicitly re-read the spte here
1171 			 * because the new value informs the !present
1172 			 * path below.
1173 			 */
1174 			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1175 		}
1176 
1177 		if (!is_shadow_present_pte(iter.old_spte)) {
1178 			bool account_nx = fault->huge_page_disallowed &&
1179 					  fault->req_level >= iter.level;
1180 
1181 			/*
1182 			 * If SPTE has been frozen by another thread, just
1183 			 * give up and retry, avoiding unnecessary page table
1184 			 * allocation and free.
1185 			 */
1186 			if (is_removed_spte(iter.old_spte))
1187 				break;
1188 
1189 			sp = tdp_mmu_alloc_sp(vcpu);
1190 			tdp_mmu_init_child_sp(sp, &iter);
1191 
1192 			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1193 				tdp_mmu_free_sp(sp);
1194 				break;
1195 			}
1196 		}
1197 	}
1198 
1199 	/*
1200 	 * Force the guest to retry the access if the upper level SPTEs aren't
1201 	 * in place, or if the target leaf SPTE is frozen by another CPU.
1202 	 */
1203 	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1204 		rcu_read_unlock();
1205 		return RET_PF_RETRY;
1206 	}
1207 
1208 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1209 	rcu_read_unlock();
1210 
1211 	return ret;
1212 }
1213 
1214 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1215 				 bool flush)
1216 {
1217 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1218 				     range->end, range->may_block, flush);
1219 }
1220 
1221 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1222 			      struct kvm_gfn_range *range);
1223 
1224 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1225 						   struct kvm_gfn_range *range,
1226 						   tdp_handler_t handler)
1227 {
1228 	struct kvm_mmu_page *root;
1229 	struct tdp_iter iter;
1230 	bool ret = false;
1231 
1232 	/*
1233 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1234 	 * into this helper allow blocking; it'd be dead, wasteful code.
1235 	 */
1236 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1237 		rcu_read_lock();
1238 
1239 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1240 			ret |= handler(kvm, &iter, range);
1241 
1242 		rcu_read_unlock();
1243 	}
1244 
1245 	return ret;
1246 }
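
/*
 * Example usage, mirroring kvm_tdp_mmu_age_gfn_range() below: each MMU
 * notifier wrapper simply passes its per-SPTE handler to this helper:
 *
 *	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 */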
1247 
1248 /*
1249  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return
1250  * non-zero if any of the GFNs in the range have been accessed.
1251  */
1252 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1253 			  struct kvm_gfn_range *range)
1254 {
1255 	u64 new_spte = 0;
1256 
1257 	/* If we have a non-accessed entry we don't need to change the pte. */
1258 	if (!is_accessed_spte(iter->old_spte))
1259 		return false;
1260 
1261 	new_spte = iter->old_spte;
1262 
1263 	if (spte_ad_enabled(new_spte)) {
1264 		new_spte &= ~shadow_accessed_mask;
1265 	} else {
1266 		/*
1267 		 * Capture the dirty status of the page, so that it doesn't get
1268 		 * lost when the SPTE is marked for access tracking.
1269 		 */
1270 		if (is_writable_pte(new_spte))
1271 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1272 
1273 		new_spte = mark_spte_for_access_track(new_spte);
1274 	}
1275 
1276 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1277 
1278 	return true;
1279 }
1280 
1281 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1282 {
1283 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1284 }
1285 
1286 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1287 			 struct kvm_gfn_range *range)
1288 {
1289 	return is_accessed_spte(iter->old_spte);
1290 }
1291 
1292 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1293 {
1294 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1295 }
1296 
1297 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1298 			 struct kvm_gfn_range *range)
1299 {
1300 	u64 new_spte;
1301 
1302 	/* Huge pages aren't expected to be modified without first being zapped. */
1303 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1304 
1305 	if (iter->level != PG_LEVEL_4K ||
1306 	    !is_shadow_present_pte(iter->old_spte))
1307 		return false;
1308 
1309 	/*
1310 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1311 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1312 	 * invariant that the PFN of a present leaf SPTE can never change.
1313 	 * See __handle_changed_spte().
1314 	 */
1315 	tdp_mmu_set_spte(kvm, iter, 0);
1316 
1317 	if (!pte_write(range->pte)) {
1318 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1319 								  pte_pfn(range->pte));
1320 
1321 		tdp_mmu_set_spte(kvm, iter, new_spte);
1322 	}
1323 
1324 	return true;
1325 }
1326 
1327 /*
1328  * Handle the changed_pte MMU notifier for the TDP MMU.
1329  * range->pte holds the new pte_t mapping the HVA specified by the MMU
1330  * notifier.
1331  * Returns non-zero if a flush is needed before releasing the MMU lock.
1332  */
1333 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1334 {
1335 	/*
1336 	 * No need to handle the remote TLB flush under RCU protection, the
1337 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1338 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1339 	 */
1340 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1341 }
1342 
1343 /*
1344  * Remove write access from all SPTEs at or above min_level that map GFNs
1345  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1346  * be flushed.
1347  */
1348 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1349 			     gfn_t start, gfn_t end, int min_level)
1350 {
1351 	struct tdp_iter iter;
1352 	u64 new_spte;
1353 	bool spte_set = false;
1354 
1355 	rcu_read_lock();
1356 
1357 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1358 
1359 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1360 retry:
1361 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1362 			continue;
1363 
1364 		if (!is_shadow_present_pte(iter.old_spte) ||
1365 		    !is_last_spte(iter.old_spte, iter.level) ||
1366 		    !(iter.old_spte & PT_WRITABLE_MASK))
1367 			continue;
1368 
1369 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1370 
1371 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1372 			goto retry;
1373 
1374 		spte_set = true;
1375 	}
1376 
1377 	rcu_read_unlock();
1378 	return spte_set;
1379 }
1380 
1381 /*
1382  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1383  * only affect leaf SPTEs down to min_level.
1384  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1385  */
1386 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1387 			     const struct kvm_memory_slot *slot, int min_level)
1388 {
1389 	struct kvm_mmu_page *root;
1390 	bool spte_set = false;
1391 
1392 	lockdep_assert_held_read(&kvm->mmu_lock);
1393 
1394 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1395 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1396 			     slot->base_gfn + slot->npages, min_level);
1397 
1398 	return spte_set;
1399 }
1400 
1401 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1402 {
1403 	struct kvm_mmu_page *sp;
1404 
1405 	gfp |= __GFP_ZERO;
1406 
1407 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1408 	if (!sp)
1409 		return NULL;
1410 
1411 	sp->spt = (void *)__get_free_page(gfp);
1412 	if (!sp->spt) {
1413 		kmem_cache_free(mmu_page_header_cache, sp);
1414 		return NULL;
1415 	}
1416 
1417 	return sp;
1418 }
1419 
1420 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1421 						       struct tdp_iter *iter,
1422 						       bool shared)
1423 {
1424 	struct kvm_mmu_page *sp;
1425 
1426 	/*
1427 	 * Since we are allocating while under the MMU lock we have to be
1428 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1429 	 * reclaim and to avoid making any filesystem callbacks (which can end
1430 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1431 	 *
1432 	 * If this allocation fails we drop the lock and retry with reclaim
1433 	 * allowed.
1434 	 */
1435 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1436 	if (sp)
1437 		return sp;
1438 
1439 	rcu_read_unlock();
1440 
1441 	if (shared)
1442 		read_unlock(&kvm->mmu_lock);
1443 	else
1444 		write_unlock(&kvm->mmu_lock);
1445 
1446 	iter->yielded = true;
1447 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1448 
1449 	if (shared)
1450 		read_lock(&kvm->mmu_lock);
1451 	else
1452 		write_lock(&kvm->mmu_lock);
1453 
1454 	rcu_read_lock();
1455 
1456 	return sp;
1457 }
1458 
1459 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1460 				   struct kvm_mmu_page *sp, bool shared)
1461 {
1462 	const u64 huge_spte = iter->old_spte;
1463 	const int level = iter->level;
1464 	int ret, i;
1465 
1466 	tdp_mmu_init_child_sp(sp, iter);
1467 
1468 	/*
1469 	 * No need for atomics when writing to sp->spt since the page table has
1470 	 * not been linked in yet and thus is not reachable from any other CPU.
1471 	 */
1472 	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1473 		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1474 
1475 	/*
1476 	 * Replace the huge spte with a pointer to the populated lower level
1477 	 * page table. Since we are making this change without a TLB flush vCPUs
1478 	 * will see a mix of the split mappings and the original huge mapping,
1479 	 * depending on what's currently in their TLB. This is fine from a
1480 	 * correctness standpoint since the translation will be the same either
1481 	 * way.
1482 	 */
1483 	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1484 	if (ret)
1485 		goto out;
1486 
1487 	/*
1488 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1489 	 * are overwriting from the page stats. But we have to manually update
1490 	 * the page stats with the new present child pages.
1491 	 */
1492 	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1493 
1494 out:
1495 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1496 	return ret;
1497 }
1498 
1499 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1500 					 struct kvm_mmu_page *root,
1501 					 gfn_t start, gfn_t end,
1502 					 int target_level, bool shared)
1503 {
1504 	struct kvm_mmu_page *sp = NULL;
1505 	struct tdp_iter iter;
1506 	int ret = 0;
1507 
1508 	rcu_read_lock();
1509 
1510 	/*
1511 	 * Traverse the page table splitting all huge pages above the target
1512 	 * level into one lower level. For example, if we encounter a 1GB page
1513 	 * we split it into 512 2MB pages.
1514 	 *
1515 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1516 	 * to visit an SPTE before ever visiting its children, which means we
1517 	 * will correctly recursively split huge pages that are more than one
1518 	 * level above the target level (e.g. splitting a 1GB page into 512 2MB pages,
1519 	 * and then splitting each of those to 512 4KB pages).
1520 	 */
1521 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1522 retry:
1523 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1524 			continue;
1525 
1526 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1527 			continue;
1528 
1529 		if (!sp) {
1530 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1531 			if (!sp) {
1532 				ret = -ENOMEM;
1533 				trace_kvm_mmu_split_huge_page(iter.gfn,
1534 							      iter.old_spte,
1535 							      iter.level, ret);
1536 				break;
1537 			}
1538 
1539 			if (iter.yielded)
1540 				continue;
1541 		}
1542 
1543 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1544 			goto retry;
1545 
1546 		sp = NULL;
1547 	}
1548 
1549 	rcu_read_unlock();
1550 
1551 	/*
1552 	 * It's possible to exit the loop having never used the last sp if, for
1553 	 * example, a vCPU doing HugePage NX splitting wins the race and
1554 	 * installs its own sp in place of the last sp we tried to split.
1555 	 */
1556 	if (sp)
1557 		tdp_mmu_free_sp(sp);
1558 
1559 	return ret;
1560 }
1561 
1562 
1563 /*
1564  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1565  */
1566 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1567 				      const struct kvm_memory_slot *slot,
1568 				      gfn_t start, gfn_t end,
1569 				      int target_level, bool shared)
1570 {
1571 	struct kvm_mmu_page *root;
1572 	int r = 0;
1573 
1574 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1575 
1576 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1577 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1578 		if (r) {
1579 			kvm_tdp_mmu_put_root(kvm, root, shared);
1580 			break;
1581 		}
1582 	}
1583 }
1584 
1585 /*
1586  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1587  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1588  * If AD bits are not enabled, this will require clearing the writable bit on
1589  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1590  * be flushed.
1591  */
1592 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1593 			   gfn_t start, gfn_t end)
1594 {
1595 	struct tdp_iter iter;
1596 	u64 new_spte;
1597 	bool spte_set = false;
1598 
1599 	rcu_read_lock();
1600 
1601 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1602 retry:
1603 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1604 			continue;
1605 
1606 		if (!is_shadow_present_pte(iter.old_spte))
1607 			continue;
1608 
1609 		if (spte_ad_need_write_protect(iter.old_spte)) {
1610 			if (is_writable_pte(iter.old_spte))
1611 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1612 			else
1613 				continue;
1614 		} else {
1615 			if (iter.old_spte & shadow_dirty_mask)
1616 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1617 			else
1618 				continue;
1619 		}
1620 
1621 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1622 			goto retry;
1623 
1624 		spte_set = true;
1625 	}
1626 
1627 	rcu_read_unlock();
1628 	return spte_set;
1629 }
1630 
1631 /*
1632  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1633  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1634  * If AD bits are not enabled, this will require clearing the writable bit on
1635  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1636  * be flushed.
1637  */
1638 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1639 				  const struct kvm_memory_slot *slot)
1640 {
1641 	struct kvm_mmu_page *root;
1642 	bool spte_set = false;
1643 
1644 	lockdep_assert_held_read(&kvm->mmu_lock);
1645 
1646 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1647 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1648 				slot->base_gfn + slot->npages);
1649 
1650 	return spte_set;
1651 }
1652 
1653 /*
1654  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1655  * set in mask, starting at gfn. The given memslot is expected to contain all
1656  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1657  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1658  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1659  */
1660 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1661 				  gfn_t gfn, unsigned long mask, bool wrprot)
1662 {
1663 	struct tdp_iter iter;
1664 	u64 new_spte;
1665 
1666 	rcu_read_lock();
1667 
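	/*
	 * Start the walk at the first GFN whose bit is set in @mask and stop
	 * as soon as every requested bit has been handled.
	 */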
1668 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1669 				    gfn + BITS_PER_LONG) {
1670 		if (!mask)
1671 			break;
1672 
1673 		if (iter.level > PG_LEVEL_4K ||
1674 		    !(mask & (1UL << (iter.gfn - gfn))))
1675 			continue;
1676 
1677 		mask &= ~(1UL << (iter.gfn - gfn));
1678 
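		/*
		 * Write-protect the SPTE if the caller asked for it (wrprot)
		 * or if the SPTE doesn't use A/D bits; otherwise just clear
		 * the dirty bit.
		 */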
1679 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1680 			if (is_writable_pte(iter.old_spte))
1681 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1682 			else
1683 				continue;
1684 		} else {
1685 			if (iter.old_spte & shadow_dirty_mask)
1686 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1687 			else
1688 				continue;
1689 		}
1690 
1691 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1692 	}
1693 
1694 	rcu_read_unlock();
1695 }
1696 
1697 /*
1698  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1699  * set in mask, starting at gfn. The given memslot is expected to contain all
1700  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1701  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1702  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1703  */
1704 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1705 				       struct kvm_memory_slot *slot,
1706 				       gfn_t gfn, unsigned long mask,
1707 				       bool wrprot)
1708 {
1709 	struct kvm_mmu_page *root;
1710 
1711 	lockdep_assert_held_write(&kvm->mmu_lock);
1712 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1713 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1714 }
1715 
1716 /*
1717  * Clear leaf entries which could be replaced by large mappings, for
1718  * GFNs within the slot.
1719  */
1720 static void zap_collapsible_spte_range(struct kvm *kvm,
1721 				       struct kvm_mmu_page *root,
1722 				       const struct kvm_memory_slot *slot)
1723 {
1724 	gfn_t start = slot->base_gfn;
1725 	gfn_t end = start + slot->npages;
1726 	struct tdp_iter iter;
1727 	kvm_pfn_t pfn;
1728 
1729 	rcu_read_lock();
1730 
1731 	tdp_root_for_each_pte(iter, root, start, end) {
1732 retry:
1733 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1734 			continue;
1735 
1736 		if (!is_shadow_present_pte(iter.old_spte) ||
1737 		    !is_last_spte(iter.old_spte, iter.level))
1738 			continue;
1739 
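		/*
		 * Skip SPTEs that already map the largest page size allowed
		 * for this GFN; kvm_mmu_max_mapping_level() re-checks what
		 * the memslot and the host mapping permit.
		 */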
1740 		pfn = spte_to_pfn(iter.old_spte);
1741 		if (kvm_is_reserved_pfn(pfn) ||
1742 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1743 							    pfn, PG_LEVEL_NUM))
1744 			continue;
1745 
1746 		/* Note, a successful atomic zap also does a remote TLB flush. */
1747 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1748 			goto retry;
1749 	}
1750 
1751 	rcu_read_unlock();
1752 }
1753 
1754 /*
1755  * Clear non-leaf entries (and free associated page tables) which could
1756  * be replaced by large mappings, for GFNs within the slot.
1757  */
1758 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1759 				       const struct kvm_memory_slot *slot)
1760 {
1761 	struct kvm_mmu_page *root;
1762 
1763 	lockdep_assert_held_read(&kvm->mmu_lock);
1764 
1765 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1766 		zap_collapsible_spte_range(kvm, root, slot);
1767 }
1768 
1769 /*
1770  * Removes write access on the last level SPTE mapping this GFN and unsets the
1771  * MMU-writable bit to ensure future writes continue to be intercepted.
1772  * Returns true if an SPTE was set and a TLB flush is needed.
1773  */
1774 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1775 			      gfn_t gfn, int min_level)
1776 {
1777 	struct tdp_iter iter;
1778 	u64 new_spte;
1779 	bool spte_set = false;
1780 
1781 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1782 
1783 	rcu_read_lock();
1784 
1785 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1786 		if (!is_shadow_present_pte(iter.old_spte) ||
1787 		    !is_last_spte(iter.old_spte, iter.level))
1788 			continue;
1789 
1790 		new_spte = iter.old_spte &
1791 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1792 
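		/* The SPTE is already !writable and !MMU-writable, nothing to do. */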
1793 		if (new_spte == iter.old_spte)
1794 			break;
1795 
1796 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1797 		spte_set = true;
1798 	}
1799 
1800 	rcu_read_unlock();
1801 
1802 	return spte_set;
1803 }
1804 
1805 /*
1806  * Removes write access on the last level SPTE mapping this GFN and unsets the
1807  * MMU-writable bit to ensure future writes continue to be intercepted.
1808  * Returns true if an SPTE was set and a TLB flush is needed.
1809  */
1810 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1811 				   struct kvm_memory_slot *slot, gfn_t gfn,
1812 				   int min_level)
1813 {
1814 	struct kvm_mmu_page *root;
1815 	bool spte_set = false;
1816 
1817 	lockdep_assert_held_write(&kvm->mmu_lock);
1818 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1819 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1820 
1821 	return spte_set;
1822 }
1823 
1824 /*
1825  * Return the level of the lowest level SPTE added to sptes.
1826  * That SPTE may be non-present.
1827  *
1828  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1829  */
1830 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1831 			 int *root_level)
1832 {
1833 	struct tdp_iter iter;
1834 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1835 	gfn_t gfn = addr >> PAGE_SHIFT;
1836 	int leaf = -1;
1837 
1838 	*root_level = vcpu->arch.mmu->shadow_root_level;
1839 
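	/*
	 * Record every SPTE visited on the walk, indexed by level; the last
	 * entry recorded is the leaf (or lowest non-present) SPTE.
	 */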
1840 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1841 		leaf = iter.level;
1842 		sptes[leaf] = iter.old_spte;
1843 	}
1844 
1845 	return leaf;
1846 }
1847 
1848 /*
1849  * Returns the last level spte pointer of the shadow page walk for the given
1850  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1851  * walk could be performed, returns NULL and *spte does not contain valid data.
1852  *
1853  * Contract:
1854  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1855  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1856  *
1857  * WARNING: This function is only intended to be called during fast_page_fault.
1858  */
1859 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1860 					u64 *spte)
1861 {
1862 	struct tdp_iter iter;
1863 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1864 	gfn_t gfn = addr >> PAGE_SHIFT;
1865 	tdp_ptep_t sptep = NULL;
1866 
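	/*
	 * Walk down to the lowest reachable SPTE for this GFN; the final
	 * iteration leaves sptep and *spte pointing at the last level entry.
	 */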
1867 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1868 		*spte = iter.old_spte;
1869 		sptep = iter.sptep;
1870 	}
1871 
1872 	/*
1873 	 * Perform the rcu_dereference to get the raw spte pointer value since
1874 	 * we are passing it up to fast_page_fault, which is shared with the
1875 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1876 	 * annotation.
1877 	 *
1878 	 * This is safe since fast_page_fault obeys the contracts of this
1879 	 * function as well as all TDP MMU contracts around modifying SPTEs
1880 	 * outside of mmu_lock.
1881 	 */
1882 	return rcu_dereference(sptep);
1883 }
1884