xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 6846d656)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 	kvm->arch.tdp_mmu_zap_wq =
29 		alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
30 
31 	return true;
32 }
33 
34 /* Arbitrarily returns true so that this may be used in if statements. */
35 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
36 							     bool shared)
37 {
38 	if (shared)
39 		lockdep_assert_held_read(&kvm->mmu_lock);
40 	else
41 		lockdep_assert_held_write(&kvm->mmu_lock);
42 
43 	return true;
44 }
45 
46 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
47 {
48 	if (!kvm->arch.tdp_mmu_enabled)
49 		return;
50 
51 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
52 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
53 
54 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
55 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
56 
57 	/*
58 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
59 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
60 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
61 	 */
62 	rcu_barrier();
63 }
64 
65 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
66 {
67 	free_page((unsigned long)sp->spt);
68 	kmem_cache_free(mmu_page_header_cache, sp);
69 }
70 
71 /*
72  * This is called through call_rcu in order to free TDP page table memory
73  * safely with respect to other kernel threads that may be operating on
74  * the memory.
75  * By only accessing TDP MMU page table memory in an RCU read critical
76  * section, and freeing it after a grace period, lockless access to that
77  * memory won't use it after it is freed.
78  */
79 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
80 {
81 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
82 					       rcu_head);
83 
84 	tdp_mmu_free_sp(sp);
85 }
86 
87 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
88 			     bool shared);
89 
90 static void tdp_mmu_zap_root_work(struct work_struct *work)
91 {
92 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
93 						 tdp_mmu_async_work);
94 	struct kvm *kvm = root->tdp_mmu_async_data;
95 
96 	read_lock(&kvm->mmu_lock);
97 
98 	/*
99 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
100 	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
101 	 * to a different pCPU.  Note, the local TLB flush on reuse also
102 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
103 	 * intermediate paging structures, that may be zapped, as such entries
104 	 * are associated with the ASID on both VMX and SVM.
105 	 */
106 	tdp_mmu_zap_root(kvm, root, true);
107 
108 	/*
109 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
110 	 * avoiding an infinite loop.  By design, the root is reachable while
111 	 * it's being asynchronously zapped, thus a different task can put its
112 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
113 	 * asynchronously zapped root is unavoidable.
114 	 */
115 	kvm_tdp_mmu_put_root(kvm, root, true);
116 
117 	read_unlock(&kvm->mmu_lock);
118 }
119 
120 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
121 {
122 	root->tdp_mmu_async_data = kvm;
123 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
124 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
125 }
126 
127 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
128 {
129 	union kvm_mmu_page_role role = page->role;
130 	role.invalid = true;
131 
132 	/* No need to use cmpxchg, only the invalid bit can change.  */
133 	role.word = xchg(&page->role.word, role.word);
134 	return role.invalid;
135 }
136 
137 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
138 			  bool shared)
139 {
140 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
141 
142 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
143 		return;
144 
145 	WARN_ON(!root->tdp_mmu_page);
146 
147 	/*
148 	 * The root now has refcount=0.  It is valid, but readers already
149 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
150 	 * rejects it.  This remains true for the rest of the execution
151 	 * of this function, because readers visit valid roots only
152 	 * (except for tdp_mmu_zap_root_work(), which however
153 	 * does not acquire any reference itself).
154 	 *
155 	 * Even though there are flows that need to visit all roots for
156 	 * correctness, they all take mmu_lock for write, so they cannot yet
157 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
158 	 * since the root still has refcount=0.
159 	 *
160 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
161 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
162 	 * So the root temporarily gets an extra reference, going to refcount=1
163 	 * while staying invalid.  Readers still cannot acquire any reference;
164 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
165 	 * they might take an extra reference if they themselves yield.
166 	 * Therefore, when the reference is given back by the worker,
167 	 * there is no guarantee that the refcount is still 1.  If not, whoever
168 	 * puts the last reference will free the page, but they will not have to
169 	 * zap the root because a root cannot go from invalid to valid.
170 	 */
171 	if (!kvm_tdp_root_mark_invalid(root)) {
172 		refcount_set(&root->tdp_mmu_root_count, 1);
173 
174 		/*
175 		 * Zapping the root in a worker is not just "nice to have";
176 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
177 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
178 		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
179 		 * might return with some roots not zapped yet.
180 		 */
181 		tdp_mmu_schedule_zap_root(kvm, root);
182 		return;
183 	}
184 
185 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
186 	list_del_rcu(&root->link);
187 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
188 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
189 }
190 
191 /*
192  * Returns the next root after @prev_root (or the first root if @prev_root is
193  * NULL).  A reference to the returned root is acquired, and the reference to
194  * @prev_root is released (the caller obviously must hold a reference to
195  * @prev_root if it's non-NULL).
196  *
197  * If @only_valid is true, invalid roots are skipped.
198  *
199  * Returns NULL if the end of tdp_mmu_roots was reached.
200  */
201 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
202 					      struct kvm_mmu_page *prev_root,
203 					      bool shared, bool only_valid)
204 {
205 	struct kvm_mmu_page *next_root;
206 
207 	rcu_read_lock();
208 
209 	if (prev_root)
210 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
211 						  &prev_root->link,
212 						  typeof(*prev_root), link);
213 	else
214 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
215 						   typeof(*next_root), link);
216 
217 	while (next_root) {
218 		if ((!only_valid || !next_root->role.invalid) &&
219 		    kvm_tdp_mmu_get_root(next_root))
220 			break;
221 
222 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
223 				&next_root->link, typeof(*next_root), link);
224 	}
225 
226 	rcu_read_unlock();
227 
228 	if (prev_root)
229 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
230 
231 	return next_root;
232 }
233 
234 /*
235  * Note: this iterator gets and puts references to the roots it iterates over.
236  * This makes it safe to release the MMU lock and yield within the loop, but
237  * if exiting the loop early, the caller must drop the reference to the most
238  * recent root. (Unless keeping a live reference is desirable.)
239  *
240  * If shared is set, this function is operating under the MMU lock in read
241  * mode. In the unlikely event that this thread must free a root, the lock
242  * will be temporarily dropped and reacquired in write mode.
243  */
244 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
245 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
246 	     _root;								\
247 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
248 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
249 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
250 		} else
251 
252 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
253 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
254 
255 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
256 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
257 
258 /*
259  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
260  * the implication being that any flow that holds mmu_lock for read is
261  * inherently yield-friendly and should use the yield-safe variant above.
262  * Holding mmu_lock for write obviates the need for RCU protection as the list
263  * is guaranteed to be stable.
264  */
265 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
266 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
267 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
268 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
269 		} else
270 
271 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
272 {
273 	struct kvm_mmu_page *sp;
274 
275 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
276 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
277 
278 	return sp;
279 }
280 
281 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
282 			    gfn_t gfn, union kvm_mmu_page_role role)
283 {
284 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
285 
286 	sp->role = role;
287 	sp->gfn = gfn;
288 	sp->ptep = sptep;
289 	sp->tdp_mmu_page = true;
290 
291 	trace_kvm_mmu_get_page(sp, true);
292 }
293 
294 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
295 				  struct tdp_iter *iter)
296 {
297 	struct kvm_mmu_page *parent_sp;
298 	union kvm_mmu_page_role role;
299 
300 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
301 
302 	role = parent_sp->role;
303 	role.level--;
304 
305 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
306 }
307 
308 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
309 {
310 	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
311 	struct kvm *kvm = vcpu->kvm;
312 	struct kvm_mmu_page *root;
313 
314 	lockdep_assert_held_write(&kvm->mmu_lock);
315 
316 	/*
317 	 * Check for an existing root before allocating a new one.  Note, the
318 	 * role check prevents consuming an invalid root.
319 	 */
320 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
321 		if (root->role.word == role.word &&
322 		    kvm_tdp_mmu_get_root(root))
323 			goto out;
324 	}
325 
326 	root = tdp_mmu_alloc_sp(vcpu);
327 	tdp_mmu_init_sp(root, NULL, 0, role);
328 
329 	refcount_set(&root->tdp_mmu_root_count, 1);
330 
331 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
332 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
333 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
334 
335 out:
336 	return __pa(root->spt);
337 }
338 
339 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
340 				u64 old_spte, u64 new_spte, int level,
341 				bool shared);
342 
343 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
344 {
345 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
346 		return;
347 
348 	if (is_accessed_spte(old_spte) &&
349 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
350 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
351 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
352 }
353 
354 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
355 					  u64 old_spte, u64 new_spte, int level)
356 {
357 	bool pfn_changed;
358 	struct kvm_memory_slot *slot;
359 
360 	if (level > PG_LEVEL_4K)
361 		return;
362 
363 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
364 
365 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
366 	    is_writable_pte(new_spte)) {
367 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
368 		mark_page_dirty_in_slot(kvm, slot, gfn);
369 	}
370 }
371 
372 /**
373  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
374  *
375  * @kvm: kvm instance
376  * @sp: the page to be removed
377  * @shared: This operation may not be running under the exclusive use of
378  *	    the MMU lock and the operation must synchronize with other
379  *	    threads that might be adding or removing pages.
380  */
381 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
382 			      bool shared)
383 {
384 	if (shared)
385 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
386 	else
387 		lockdep_assert_held_write(&kvm->mmu_lock);
388 
389 	list_del(&sp->link);
390 	if (sp->lpage_disallowed)
391 		unaccount_huge_nx_page(kvm, sp);
392 
393 	if (shared)
394 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
395 }
396 
397 /**
398  * handle_removed_pt() - handle a page table removed from the TDP structure
399  *
400  * @kvm: kvm instance
401  * @pt: the page removed from the paging structure
402  * @shared: This operation may not be running under the exclusive use
403  *	    of the MMU lock and the operation must synchronize with other
404  *	    threads that might be modifying SPTEs.
405  *
406  * Given a page table that has been removed from the TDP paging structure,
407  * iterates through the page table to clear SPTEs and free child page tables.
408  *
409  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
410  * protection. Since this thread removed it from the paging structure,
411  * this thread will be responsible for ensuring the page is freed. Hence the
412  * early rcu_dereferences in the function.
413  */
414 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
415 {
416 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
417 	int level = sp->role.level;
418 	gfn_t base_gfn = sp->gfn;
419 	int i;
420 
421 	trace_kvm_mmu_prepare_zap_page(sp);
422 
423 	tdp_mmu_unlink_sp(kvm, sp, shared);
424 
425 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
426 		u64 *sptep = rcu_dereference(pt) + i;
427 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
428 		u64 old_child_spte;
429 
430 		if (shared) {
431 			/*
432 			 * Set the SPTE to a nonpresent value that other
433 			 * threads will not overwrite. If the SPTE was
434 			 * already marked as removed, another thread handling
435 			 * a page fault could overwrite it, so keep retrying
436 			 * the exchange until this thread transitions the SPTE
437 			 * from some other value to the removed SPTE value.
438 			 */
439 			for (;;) {
440 				old_child_spte = xchg(sptep, REMOVED_SPTE);
441 				if (!is_removed_spte(old_child_spte))
442 					break;
443 				cpu_relax();
444 			}
445 		} else {
446 			/*
447 			 * If the SPTE is not MMU-present, there is no backing
448 			 * page associated with the SPTE and so no side effects
449 			 * that need to be recorded, and exclusive ownership of
450 			 * mmu_lock ensures the SPTE can't be made present.
451 			 * Note, zapping MMIO SPTEs is also unnecessary as they
452 			 * are guarded by the memslots generation, not by being
453 			 * unreachable.
454 			 */
455 			old_child_spte = READ_ONCE(*sptep);
456 			if (!is_shadow_present_pte(old_child_spte))
457 				continue;
458 
459 			/*
460 			 * Marking the SPTE as a removed SPTE is not
461 			 * strictly necessary here as the MMU lock will
462 			 * stop other threads from concurrently modifying
463 			 * this SPTE. Using the removed SPTE value keeps
464 			 * the two branches consistent and simplifies
465 			 * the function.
466 			 */
467 			WRITE_ONCE(*sptep, REMOVED_SPTE);
468 		}
469 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
470 				    old_child_spte, REMOVED_SPTE, level,
471 				    shared);
472 	}
473 
474 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
475 }
476 
477 /**
478  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
479  * @kvm: kvm instance
480  * @as_id: the address space of the paging structure the SPTE was a part of
481  * @gfn: the base GFN that was mapped by the SPTE
482  * @old_spte: The value of the SPTE before the change
483  * @new_spte: The value of the SPTE after the change
484  * @level: the level of the PT the SPTE is part of in the paging structure
485  * @shared: This operation may not be running under the exclusive use of
486  *	    the MMU lock and the operation must synchronize with other
487  *	    threads that might be modifying SPTEs.
488  *
489  * Handle bookkeeping that might result from the modification of a SPTE.
490  * This function must be called for all TDP SPTE modifications.
491  */
492 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
493 				  u64 old_spte, u64 new_spte, int level,
494 				  bool shared)
495 {
496 	bool was_present = is_shadow_present_pte(old_spte);
497 	bool is_present = is_shadow_present_pte(new_spte);
498 	bool was_leaf = was_present && is_last_spte(old_spte, level);
499 	bool is_leaf = is_present && is_last_spte(new_spte, level);
500 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
501 
502 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
503 	WARN_ON(level < PG_LEVEL_4K);
504 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
505 
506 	/*
507 	 * If this warning were to trigger it would indicate that there was a
508 	 * missing MMU notifier or a race with some notifier handler.
509 	 * A present, leaf SPTE should never be directly replaced with another
510 	 * present leaf SPTE pointing to a different PFN. A notifier handler
511 	 * should be zapping the SPTE before the main MM's page table is
512 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
513 	 * thread before replacement.
514 	 */
515 	if (was_leaf && is_leaf && pfn_changed) {
516 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
517 		       "SPTE with another present leaf SPTE mapping a\n"
518 		       "different PFN!\n"
519 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
520 		       as_id, gfn, old_spte, new_spte, level);
521 
522 		/*
523 		 * Crash the host to prevent error propagation and guest data
524 		 * corruption.
525 		 */
526 		BUG();
527 	}
528 
529 	if (old_spte == new_spte)
530 		return;
531 
532 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
533 
534 	if (is_leaf)
535 		check_spte_writable_invariants(new_spte);
536 
537 	/*
538 	 * The only time a SPTE should be changed from a non-present to a
539 	 * non-present state is when an MMIO entry is installed/modified/
540 	 * removed. In that case, there is nothing to do here.
541 	 */
542 	if (!was_present && !is_present) {
543 		/*
544 		 * If this change does not involve a MMIO SPTE or removed SPTE,
545 		 * it is unexpected. Log the change, though it should not
546 		 * impact the guest since both the former and current SPTEs
547 		 * are nonpresent.
548 		 */
549 		if (WARN_ON(!is_mmio_spte(old_spte) &&
550 			    !is_mmio_spte(new_spte) &&
551 			    !is_removed_spte(new_spte)))
552 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
553 			       "should not be replaced with another,\n"
554 			       "different nonpresent SPTE, unless one or both\n"
555 			       "are MMIO SPTEs, or the new SPTE is\n"
556 			       "a temporary removed SPTE.\n"
557 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
558 			       as_id, gfn, old_spte, new_spte, level);
559 		return;
560 	}
561 
562 	if (is_leaf != was_leaf)
563 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
564 
565 	if (was_leaf && is_dirty_spte(old_spte) &&
566 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
567 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
568 
569 	/*
570 	 * Recursively handle child PTs if the change removed a subtree from
571 	 * the paging structure.  Note the WARN on the PFN changing without the
572 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
573 	 * pages are kernel allocations and should never be migrated.
574 	 */
575 	if (was_present && !was_leaf &&
576 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
577 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
578 }
579 
580 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
581 				u64 old_spte, u64 new_spte, int level,
582 				bool shared)
583 {
584 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
585 			      shared);
586 	handle_changed_spte_acc_track(old_spte, new_spte, level);
587 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
588 				      new_spte, level);
589 }
590 
591 /*
592  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
593  * and handle the associated bookkeeping.  Do not mark the page dirty
594  * in KVM's dirty bitmaps.
595  *
596  * If setting the SPTE fails because it has changed, iter->old_spte will be
597  * refreshed to the current value of the spte.
598  *
599  * @kvm: kvm instance
600  * @iter: a tdp_iter instance currently on the SPTE that should be set
601  * @new_spte: The value the SPTE should be set to
602  * Return:
603  * * 0      - If the SPTE was set.
604  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
605  *            no side-effects other than setting iter->old_spte to the last
606  *            known value of the spte.
607  */
608 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
609 					  struct tdp_iter *iter,
610 					  u64 new_spte)
611 {
612 	u64 *sptep = rcu_dereference(iter->sptep);
613 	u64 old_spte;
614 
615 	/*
616 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
617 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
618 	 * and pre-checking before inserting a new SPTE is advantageous as it
619 	 * avoids unnecessary work.
620 	 */
621 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
622 
623 	lockdep_assert_held_read(&kvm->mmu_lock);
624 
625 	/*
626 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
627 	 * does not hold the mmu_lock.
628 	 */
629 	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
630 	if (old_spte != iter->old_spte) {
631 		/*
632 		 * The page table entry was modified by a different logical
633 		 * CPU. Refresh iter->old_spte with the current value so the
634 		 * caller operates on fresh data, e.g. if it retries
635 		 * tdp_mmu_set_spte_atomic().
636 		 */
637 		iter->old_spte = old_spte;
638 		return -EBUSY;
639 	}
640 
641 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
642 			      new_spte, iter->level, true);
643 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
644 
645 	return 0;
646 }
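/*
 * Illustrative sketch (not part of the original source), condensed from the
 * callers later in this file (e.g. wrprot_gfn_range()): an -EBUSY return is
 * handled by retrying with the refreshed iter->old_spte, typically via a
 * local "retry" label:
 *
 *	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 *	retry:
 *		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *	}
 */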
647 
648 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
649 					  struct tdp_iter *iter)
650 {
651 	int ret;
652 
653 	/*
654 	 * Freeze the SPTE by setting it to a special,
655 	 * non-present value. This will stop other threads from
656 	 * immediately installing a present entry in its place
657 	 * before the TLBs are flushed.
658 	 */
659 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
660 	if (ret)
661 		return ret;
662 
663 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
664 					   KVM_PAGES_PER_HPAGE(iter->level));
665 
666 	/*
667 	 * No other thread can overwrite the removed SPTE as they
668 	 * must either wait on the MMU lock or use
669 	 * tdp_mmu_set_spte_atomic which will not overwrite the
670 	 * special removed SPTE value. No bookkeeping is needed
671 	 * here since the SPTE is going from non-present
672 	 * to non-present.
673 	 */
674 	kvm_tdp_mmu_write_spte(iter->sptep, 0);
675 
676 	return 0;
677 }
678 
679 
680 /*
681  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
682  * @kvm:	      KVM instance
683  * @as_id:	      Address space ID, i.e. regular vs. SMM
684  * @sptep:	      Pointer to the SPTE
685  * @old_spte:	      The current value of the SPTE
686  * @new_spte:	      The new value that will be set for the SPTE
687  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
688  * @level:	      The level _containing_ the SPTE (its parent PT's level)
689  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
690  *		      of the page. Should be set unless handling an MMU
691  *		      notifier for access tracking. Leaving record_acc_track
692  *		      unset in that case prevents page accesses from being
693  *		      double counted.
694  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
695  *		      appropriate for the change being made. Should be set
696  *		      unless performing certain dirty logging operations.
697  *		      Leaving record_dirty_log unset in that case prevents page
698  *		      writes from being double counted.
699  */
700 static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
701 			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
702 			       bool record_acc_track, bool record_dirty_log)
703 {
704 	lockdep_assert_held_write(&kvm->mmu_lock);
705 
706 	/*
707 	 * No thread should be using this function to set SPTEs to or from the
708 	 * temporary removed SPTE value.
709 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
710 	 * should be used. If operating under the MMU lock in write mode, the
711 	 * use of the removed SPTE should not be necessary.
712 	 */
713 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
714 
715 	kvm_tdp_mmu_write_spte(sptep, new_spte);
716 
717 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
718 
719 	if (record_acc_track)
720 		handle_changed_spte_acc_track(old_spte, new_spte, level);
721 	if (record_dirty_log)
722 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
723 					      new_spte, level);
724 }
725 
726 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
727 				     u64 new_spte, bool record_acc_track,
728 				     bool record_dirty_log)
729 {
730 	WARN_ON_ONCE(iter->yielded);
731 
732 	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
733 			   new_spte, iter->gfn, iter->level,
734 			   record_acc_track, record_dirty_log);
735 }
736 
737 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
738 				    u64 new_spte)
739 {
740 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
741 }
742 
743 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
744 						 struct tdp_iter *iter,
745 						 u64 new_spte)
746 {
747 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
748 }
749 
750 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
751 						 struct tdp_iter *iter,
752 						 u64 new_spte)
753 {
754 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
755 }
756 
757 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
758 	for_each_tdp_pte(_iter, _root, _start, _end)
759 
760 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
761 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
762 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
763 		    !is_last_spte(_iter.old_spte, _iter.level))		\
764 			continue;					\
765 		else
766 
767 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
768 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
769 
770 /*
771  * Yield if the MMU lock is contended or this thread needs to return control
772  * to the scheduler.
773  *
774  * If this function should yield and flush is set, it will perform a remote
775  * TLB flush before yielding.
776  *
777  * If this function yields, iter->yielded is set and the caller must skip to
778  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
779  * over the paging structures to allow the iterator to continue its traversal
780  * from the paging structure root.
781  *
782  * Returns true if this function yielded.
783  */
784 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
785 							  struct tdp_iter *iter,
786 							  bool flush, bool shared)
787 {
788 	WARN_ON(iter->yielded);
789 
790 	/* Ensure forward progress has been made before yielding. */
791 	if (iter->next_last_level_gfn == iter->yielded_gfn)
792 		return false;
793 
794 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
795 		if (flush)
796 			kvm_flush_remote_tlbs(kvm);
797 
798 		rcu_read_unlock();
799 
800 		if (shared)
801 			cond_resched_rwlock_read(&kvm->mmu_lock);
802 		else
803 			cond_resched_rwlock_write(&kvm->mmu_lock);
804 
805 		rcu_read_lock();
806 
807 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
808 
809 		iter->yielded = true;
810 	}
811 
812 	return iter->yielded;
813 }
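/*
 * Illustrative sketch (not part of the original source), mirroring the
 * pattern used by zap_gfn_range() below: callers check for a yield at the
 * top of each iteration, skip to the next SPTE when it occurs, and clear
 * their local flush state if the yield already performed the flush:
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *		flush = false;
 *		continue;
 *	}
 */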
814 
815 static inline gfn_t tdp_mmu_max_gfn_host(void)
816 {
817 	/*
818 	 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
819 	 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
820 	 * and so KVM will never install a SPTE for such addresses.
821 	 */
822 	return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
823 }
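/*
 * Worked example (illustrative, not part of the original source): with
 * shadow_phys_bits == 46 and PAGE_SHIFT == 12, the bound is 1ULL << 34
 * GFNs, i.e. TDP MMU walks stop below 2^46 bytes (64 TiB) of guest
 * physical address space.
 */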
824 
825 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
826 			       bool shared, int zap_level)
827 {
828 	struct tdp_iter iter;
829 
830 	gfn_t end = tdp_mmu_max_gfn_host();
831 	gfn_t start = 0;
832 
833 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
834 retry:
835 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
836 			continue;
837 
838 		if (!is_shadow_present_pte(iter.old_spte))
839 			continue;
840 
841 		if (iter.level > zap_level)
842 			continue;
843 
844 		if (!shared)
845 			tdp_mmu_set_spte(kvm, &iter, 0);
846 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
847 			goto retry;
848 	}
849 }
850 
851 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
852 			     bool shared)
853 {
854 
855 	/*
856 	 * The root must have an elevated refcount so that it's reachable via
857 	 * mmu_notifier callbacks, which allows this path to yield and drop
858 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
859 	 * must drop all references to relevant pages prior to completing the
860 	 * callback.  Dropping mmu_lock with an unreachable root would result
861 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
862 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
863 	 * dirty accessed bits to the SPTE's associated struct page.
864 	 */
865 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
866 
867 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
868 
869 	rcu_read_lock();
870 
871 	/*
872 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
873 	 * split the zap into two passes.  On the first pass, zap at the 1gb
874 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
875 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
876 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
877 	 *
878 	 * Because zapping a SP recurses on its children, stepping down to
879 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
880 	 */
881 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
882 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
883 
884 	rcu_read_unlock();
885 }
886 
887 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
888 {
889 	u64 old_spte;
890 
891 	/*
892 	 * This helper intentionally doesn't allow zapping a root shadow page,
893 	 * which doesn't have a parent page table and thus no associated entry.
894 	 */
895 	if (WARN_ON_ONCE(!sp->ptep))
896 		return false;
897 
898 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
899 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
900 		return false;
901 
902 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
903 			   sp->gfn, sp->role.level + 1, true, true);
904 
905 	return true;
906 }
907 
908 /*
909  * Tears down the mappings for the range of gfns, [start, end), and frees the
910  * non-root pages mapping GFNs strictly within that range. Returns true if
911  * SPTEs have been cleared and a TLB flush is needed before releasing the
912  * MMU lock.
913  *
914  * If can_yield is true, will release the MMU lock and reschedule if the
915  * scheduler needs the CPU or there is contention on the MMU lock. If this
916  * function cannot yield, it will not release the MMU lock or reschedule and
917  * the caller must ensure it does not supply too large a GFN range, or the
918  * operation can cause a soft lockup.
919  */
920 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
921 			  gfn_t start, gfn_t end, bool can_yield, bool flush)
922 {
923 	bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
924 	struct tdp_iter iter;
925 
926 	/*
927 	 * No need to try to step down in the iterator when zapping all SPTEs,
928 	 * zapping the top-level non-leaf SPTEs will recurse on their children.
929 	 */
930 	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
931 
932 	end = min(end, tdp_mmu_max_gfn_host());
933 
934 	lockdep_assert_held_write(&kvm->mmu_lock);
935 
936 	rcu_read_lock();
937 
938 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
939 		if (can_yield &&
940 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
941 			flush = false;
942 			continue;
943 		}
944 
945 		if (!is_shadow_present_pte(iter.old_spte))
946 			continue;
947 
948 		/*
949 		 * If this is a non-last-level SPTE that covers a larger range
950 		 * than should be zapped, continue, and zap the mappings at a
951 		 * lower level, except when zapping all SPTEs.
952 		 */
953 		if (!zap_all &&
954 		    (iter.gfn < start ||
955 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
956 		    !is_last_spte(iter.old_spte, iter.level))
957 			continue;
958 
959 		tdp_mmu_set_spte(kvm, &iter, 0);
960 		flush = true;
961 	}
962 
963 	/*
964 	 * Need to flush before releasing RCU.  TODO: do it only if intermediate
965 	 * page tables were zapped; there is no need to flush under RCU protection
966 	 * if no 'struct kvm_mmu_page' is freed.
967 	 */
968 	if (flush)
969 		kvm_flush_remote_tlbs_with_address(kvm, start, end - start);
970 
971 	rcu_read_unlock();
972 
973 	return false;
974 }
975 
976 /*
977  * Tears down the mappings for the range of gfns, [start, end), and frees the
978  * non-root pages mapping GFNs strictly within that range. Returns true if
979  * SPTEs have been cleared and a TLB flush is needed before releasing the
980  * MMU lock.
981  */
982 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
983 				 gfn_t end, bool can_yield, bool flush)
984 {
985 	struct kvm_mmu_page *root;
986 
987 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
988 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
989 
990 	return flush;
991 }
992 
993 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
994 {
995 	struct kvm_mmu_page *root;
996 	int i;
997 
998 	/*
999 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
1000 	 * before returning to the caller.  Zap directly even if the root is
1001 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
1002 	 * all that expensive and mmu_lock is already held, which means the
1003 	 * worker has yielded, i.e. flushing the work instead of zapping here
1004 	 * isn't guaranteed to be any faster.
1005 	 *
1006 	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1007 	 * is being destroyed or the userspace VMM has exited.  In both cases,
1008 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1009 	 */
1010 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1011 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1012 			tdp_mmu_zap_root(kvm, root, false);
1013 	}
1014 }
1015 
1016 /*
1017  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1018  * zap" completes.
1019  */
1020 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1021 {
1022 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1023 }
1024 
1025 /*
1026  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1027  * is about to be zapped, e.g. in response to a memslots update.  The actual
1028  * zapping is performed asynchronously, so a reference is taken on all roots.
1029  * Using a separate workqueue makes it easy to ensure that the destruction is
1030  * performed before the "fast zap" completes, without keeping a separate list
1031  * of invalidated roots; the list is effectively the list of work items in
1032  * the workqueue.
1033  *
1034  * Get a reference even if the root is already invalid, the asynchronous worker
1035  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1036  * is held for write, it should be impossible to observe a root with zero refcount,
1037  * i.e. the list of roots cannot be stale.
1038  *
1039  * This has essentially the same effect for the TDP MMU
1040  * as updating mmu_valid_gen does for the shadow MMU.
1041  */
1042 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1043 {
1044 	struct kvm_mmu_page *root;
1045 
1046 	lockdep_assert_held_write(&kvm->mmu_lock);
1047 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1048 		if (!root->role.invalid &&
1049 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1050 			root->role.invalid = true;
1051 			tdp_mmu_schedule_zap_root(kvm, root);
1052 		}
1053 	}
1054 }
1055 
1056 /*
1057  * Installs a last-level SPTE to handle a TDP page fault.
1058  * (NPT/EPT violation/misconfiguration)
1059  */
1060 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1061 					  struct kvm_page_fault *fault,
1062 					  struct tdp_iter *iter)
1063 {
1064 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1065 	u64 new_spte;
1066 	int ret = RET_PF_FIXED;
1067 	bool wrprot = false;
1068 
1069 	WARN_ON(sp->role.level != fault->goal_level);
1070 	if (unlikely(!fault->slot))
1071 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1072 	else
1073 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1074 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1075 					 fault->map_writable, &new_spte);
1076 
1077 	if (new_spte == iter->old_spte)
1078 		ret = RET_PF_SPURIOUS;
1079 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1080 		return RET_PF_RETRY;
1081 	else if (is_shadow_present_pte(iter->old_spte) &&
1082 		 !is_last_spte(iter->old_spte, iter->level))
1083 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1084 						   KVM_PAGES_PER_HPAGE(iter->level + 1));
1085 
1086 	/*
1087 	 * If the page fault was caused by a write but the page is write
1088 	 * protected, emulation is needed. If the emulation was skipped,
1089 	 * the vCPU would have the same fault again.
1090 	 */
1091 	if (wrprot) {
1092 		if (fault->write)
1093 			ret = RET_PF_EMULATE;
1094 	}
1095 
1096 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1097 	if (unlikely(is_mmio_spte(new_spte))) {
1098 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1099 				     new_spte);
1100 		ret = RET_PF_EMULATE;
1101 	} else {
1102 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1103 				       rcu_dereference(iter->sptep));
1104 	}
1105 
1106 	/*
1107 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1108 	 * consistent with legacy MMU behavior.
1109 	 */
1110 	if (ret != RET_PF_SPURIOUS)
1111 		vcpu->stat.pf_fixed++;
1112 
1113 	return ret;
1114 }
1115 
1116 /*
1117  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1118  * provided page table.
1119  *
1120  * @kvm: kvm instance
1121  * @iter: a tdp_iter instance currently on the SPTE that should be set
1122  * @sp: The new TDP page table to install.
1123  * @account_nx: True if this page table is being installed to split a
1124  *              non-executable huge page.
1125  * @shared: This operation is running under the MMU lock in read mode.
1126  *
1127  * Returns: 0 if the new page table was installed. Non-0 if the page table
1128  *          could not be installed (e.g. the atomic compare-exchange failed).
1129  */
1130 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1131 			   struct kvm_mmu_page *sp, bool account_nx,
1132 			   bool shared)
1133 {
1134 	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1135 	int ret = 0;
1136 
1137 	if (shared) {
1138 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1139 		if (ret)
1140 			return ret;
1141 	} else {
1142 		tdp_mmu_set_spte(kvm, iter, spte);
1143 	}
1144 
1145 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1146 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1147 	if (account_nx)
1148 		account_huge_nx_page(kvm, sp);
1149 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1150 
1151 	return 0;
1152 }
1153 
1154 /*
1155  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1156  * page tables and SPTEs to translate the faulting guest physical address.
1157  */
1158 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1159 {
1160 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1161 	struct tdp_iter iter;
1162 	struct kvm_mmu_page *sp;
1163 	int ret;
1164 
1165 	kvm_mmu_hugepage_adjust(vcpu, fault);
1166 
1167 	trace_kvm_mmu_spte_requested(fault);
1168 
1169 	rcu_read_lock();
1170 
1171 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1172 		if (fault->nx_huge_page_workaround_enabled)
1173 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1174 
1175 		if (iter.level == fault->goal_level)
1176 			break;
1177 
1178 		/*
1179 		 * If there is an SPTE mapping a large page at a higher level
1180 		 * than the target, that SPTE must be cleared and replaced
1181 		 * with a non-leaf SPTE.
1182 		 */
1183 		if (is_shadow_present_pte(iter.old_spte) &&
1184 		    is_large_pte(iter.old_spte)) {
1185 			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1186 				break;
1187 
1188 			/*
1189 			 * The iter must explicitly re-read the spte here
1190 			 * because the new value informs the !present
1191 			 * path below.
1192 			 */
1193 			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1194 		}
1195 
1196 		if (!is_shadow_present_pte(iter.old_spte)) {
1197 			bool account_nx = fault->huge_page_disallowed &&
1198 					  fault->req_level >= iter.level;
1199 
1200 			/*
1201 			 * If SPTE has been frozen by another thread, just
1202 			 * give up and retry, avoiding unnecessary page table
1203 			 * allocation and free.
1204 			 */
1205 			if (is_removed_spte(iter.old_spte))
1206 				break;
1207 
1208 			sp = tdp_mmu_alloc_sp(vcpu);
1209 			tdp_mmu_init_child_sp(sp, &iter);
1210 
1211 			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1212 				tdp_mmu_free_sp(sp);
1213 				break;
1214 			}
1215 		}
1216 	}
1217 
1218 	/*
1219 	 * Force the guest to retry the access if the upper level SPTEs aren't
1220 	 * in place, or if the target leaf SPTE is frozen by another CPU.
1221 	 */
1222 	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1223 		rcu_read_unlock();
1224 		return RET_PF_RETRY;
1225 	}
1226 
1227 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1228 	rcu_read_unlock();
1229 
1230 	return ret;
1231 }
1232 
1233 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1234 				 bool flush)
1235 {
1236 	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1237 					   range->end, range->may_block, flush);
1238 }
1239 
1240 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1241 			      struct kvm_gfn_range *range);
1242 
1243 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1244 						   struct kvm_gfn_range *range,
1245 						   tdp_handler_t handler)
1246 {
1247 	struct kvm_mmu_page *root;
1248 	struct tdp_iter iter;
1249 	bool ret = false;
1250 
1251 	/*
1252 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1253 	 * into this helper allow blocking; it'd be dead, wasteful code.
1254 	 */
1255 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1256 		rcu_read_lock();
1257 
1258 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1259 			ret |= handler(kvm, &iter, range);
1260 
1261 		rcu_read_unlock();
1262 	}
1263 
1264 	return ret;
1265 }
1266 
1267 /*
1268  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
1269  * return true if any of the GFNs in the range have been accessed.
1270  */
1271 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1272 			  struct kvm_gfn_range *range)
1273 {
1274 	u64 new_spte = 0;
1275 
1276 	/* If we have a non-accessed entry we don't need to change the pte. */
1277 	if (!is_accessed_spte(iter->old_spte))
1278 		return false;
1279 
1280 	new_spte = iter->old_spte;
1281 
1282 	if (spte_ad_enabled(new_spte)) {
1283 		new_spte &= ~shadow_accessed_mask;
1284 	} else {
1285 		/*
1286 		 * Capture the dirty status of the page, so that it doesn't get
1287 		 * lost when the SPTE is marked for access tracking.
1288 		 */
1289 		if (is_writable_pte(new_spte))
1290 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1291 
1292 		new_spte = mark_spte_for_access_track(new_spte);
1293 	}
1294 
1295 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1296 
1297 	return true;
1298 }
1299 
1300 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1301 {
1302 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1303 }
1304 
1305 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1306 			 struct kvm_gfn_range *range)
1307 {
1308 	return is_accessed_spte(iter->old_spte);
1309 }
1310 
1311 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1312 {
1313 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1314 }
1315 
1316 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1317 			 struct kvm_gfn_range *range)
1318 {
1319 	u64 new_spte;
1320 
1321 	/* Huge pages aren't expected to be modified without first being zapped. */
1322 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1323 
1324 	if (iter->level != PG_LEVEL_4K ||
1325 	    !is_shadow_present_pte(iter->old_spte))
1326 		return false;
1327 
1328 	/*
1329 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1330 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1331 	 * invariant that the PFN of a present leaf SPTE can never change.
1332 	 * See __handle_changed_spte().
1333 	 */
1334 	tdp_mmu_set_spte(kvm, iter, 0);
1335 
1336 	if (!pte_write(range->pte)) {
1337 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1338 								  pte_pfn(range->pte));
1339 
1340 		tdp_mmu_set_spte(kvm, iter, new_spte);
1341 	}
1342 
1343 	return true;
1344 }
1345 
1346 /*
1347  * Handle the changed_pte MMU notifier for the TDP MMU.
1348  * range->pte holds the new pte_t mapping the HVA specified by the MMU
1349  * notifier.
1350  * Returns non-zero if a flush is needed before releasing the MMU lock.
1351  */
1352 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1353 {
1354 	/*
1355 	 * No need to handle the remote TLB flush under RCU protection, the
1356 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1357 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1358 	 */
1359 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1360 }
1361 
1362 /*
1363  * Remove write access from all SPTEs at or above min_level that map GFNs
1364  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1365  * be flushed.
1366  */
1367 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1368 			     gfn_t start, gfn_t end, int min_level)
1369 {
1370 	struct tdp_iter iter;
1371 	u64 new_spte;
1372 	bool spte_set = false;
1373 
1374 	rcu_read_lock();
1375 
1376 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1377 
1378 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1379 retry:
1380 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1381 			continue;
1382 
1383 		if (!is_shadow_present_pte(iter.old_spte) ||
1384 		    !is_last_spte(iter.old_spte, iter.level) ||
1385 		    !(iter.old_spte & PT_WRITABLE_MASK))
1386 			continue;
1387 
1388 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1389 
1390 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1391 			goto retry;
1392 
1393 		spte_set = true;
1394 	}
1395 
1396 	rcu_read_unlock();
1397 	return spte_set;
1398 }
1399 
1400 /*
1401  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1402  * only affect leaf SPTEs down to min_level.
1403  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1404  */
1405 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1406 			     const struct kvm_memory_slot *slot, int min_level)
1407 {
1408 	struct kvm_mmu_page *root;
1409 	bool spte_set = false;
1410 
1411 	lockdep_assert_held_read(&kvm->mmu_lock);
1412 
1413 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1414 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1415 			     slot->base_gfn + slot->npages, min_level);
1416 
1417 	return spte_set;
1418 }
1419 
1420 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1421 {
1422 	struct kvm_mmu_page *sp;
1423 
1424 	gfp |= __GFP_ZERO;
1425 
1426 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1427 	if (!sp)
1428 		return NULL;
1429 
1430 	sp->spt = (void *)__get_free_page(gfp);
1431 	if (!sp->spt) {
1432 		kmem_cache_free(mmu_page_header_cache, sp);
1433 		return NULL;
1434 	}
1435 
1436 	return sp;
1437 }
1438 
1439 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1440 						       struct tdp_iter *iter,
1441 						       bool shared)
1442 {
1443 	struct kvm_mmu_page *sp;
1444 
1445 	/*
1446 	 * Since we are allocating while under the MMU lock we have to be
1447 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1448 	 * reclaim and to avoid making any filesystem callbacks (which can end
1449 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1450 	 *
1451 	 * If this allocation fails we drop the lock and retry with reclaim
1452 	 * allowed.
1453 	 */
1454 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1455 	if (sp)
1456 		return sp;
1457 
1458 	rcu_read_unlock();
1459 
1460 	if (shared)
1461 		read_unlock(&kvm->mmu_lock);
1462 	else
1463 		write_unlock(&kvm->mmu_lock);
1464 
1465 	iter->yielded = true;
1466 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1467 
1468 	if (shared)
1469 		read_lock(&kvm->mmu_lock);
1470 	else
1471 		write_lock(&kvm->mmu_lock);
1472 
1473 	rcu_read_lock();
1474 
1475 	return sp;
1476 }
1477 
1478 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1479 				   struct kvm_mmu_page *sp, bool shared)
1480 {
1481 	const u64 huge_spte = iter->old_spte;
1482 	const int level = iter->level;
1483 	int ret, i;
1484 
1485 	tdp_mmu_init_child_sp(sp, iter);
1486 
1487 	/*
1488 	 * No need for atomics when writing to sp->spt since the page table has
1489 	 * not been linked in yet and thus is not reachable from any other CPU.
1490 	 */
1491 	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1492 		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1493 
1494 	/*
1495 	 * Replace the huge spte with a pointer to the populated lower level
1496 	 * page table. Since we are making this change without a TLB flush vCPUs
1497 	 * will see a mix of the split mappings and the original huge mapping,
1498 	 * depending on what's currently in their TLB. This is fine from a
1499 	 * correctness standpoint since the translation will be the same either
1500 	 * way.
1501 	 */
1502 	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1503 	if (ret)
1504 		goto out;
1505 
1506 	/*
1507 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1508 	 * are overwriting from the page stats. But we have to manually update
1509 	 * the page stats with the new present child pages.
1510 	 */
1511 	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1512 
1513 out:
1514 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1515 	return ret;
1516 }
1517 
1518 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1519 					 struct kvm_mmu_page *root,
1520 					 gfn_t start, gfn_t end,
1521 					 int target_level, bool shared)
1522 {
1523 	struct kvm_mmu_page *sp = NULL;
1524 	struct tdp_iter iter;
1525 	int ret = 0;
1526 
1527 	rcu_read_lock();
1528 
1529 	/*
1530 	 * Traverse the page table splitting all huge pages above the target
1531 	 * level into one lower level. For example, if we encounter a 1GB page
1532 	 * we split it into 512 2MB pages.
1533 	 *
1534 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1535 	 * to visit an SPTE before ever visiting its children, which means we
1536 	 * will correctly recursively split huge pages that are more than one
1537 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1538 	 * and then splitting each of those to 512 4KB pages).
1539 	 */
1540 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1541 retry:
1542 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1543 			continue;
1544 
1545 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1546 			continue;
1547 
1548 		if (!sp) {
1549 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1550 			if (!sp) {
1551 				ret = -ENOMEM;
1552 				trace_kvm_mmu_split_huge_page(iter.gfn,
1553 							      iter.old_spte,
1554 							      iter.level, ret);
1555 				break;
1556 			}
1557 
1558 			if (iter.yielded)
1559 				continue;
1560 		}
1561 
1562 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1563 			goto retry;
1564 
1565 		sp = NULL;
1566 	}
1567 
1568 	rcu_read_unlock();
1569 
1570 	/*
1571 	 * It's possible to exit the loop having never used the last sp if, for
1572 	 * example, a vCPU doing HugePage NX splitting wins the race and
1573 	 * installs its own sp in place of the last sp we tried to split.
1574 	 */
1575 	if (sp)
1576 		tdp_mmu_free_sp(sp);
1577 
1578 	return ret;
1579 }
1580 
1581 
1582 /*
1583  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1584  */
1585 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1586 				      const struct kvm_memory_slot *slot,
1587 				      gfn_t start, gfn_t end,
1588 				      int target_level, bool shared)
1589 {
1590 	struct kvm_mmu_page *root;
1591 	int r = 0;
1592 
1593 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1594 
1595 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1596 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1597 		if (r) {
1598 			kvm_tdp_mmu_put_root(kvm, root, shared);
1599 			break;
1600 		}
1601 	}
1602 }
1603 
1604 /*
1605  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1606  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1607  * If AD bits are not enabled, this will require clearing the writable bit on
1608  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1609  * be flushed.
1610  */
1611 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1612 			   gfn_t start, gfn_t end)
1613 {
1614 	struct tdp_iter iter;
1615 	u64 new_spte;
1616 	bool spte_set = false;
1617 
1618 	rcu_read_lock();
1619 
1620 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1621 retry:
1622 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1623 			continue;
1624 
1625 		if (!is_shadow_present_pte(iter.old_spte))
1626 			continue;
1627 
1628 		if (spte_ad_need_write_protect(iter.old_spte)) {
1629 			if (is_writable_pte(iter.old_spte))
1630 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1631 			else
1632 				continue;
1633 		} else {
1634 			if (iter.old_spte & shadow_dirty_mask)
1635 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1636 			else
1637 				continue;
1638 		}
1639 
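		/*
		 * mmu_lock is held for read, so the SPTE must be updated
		 * atomically.  On contention the update fails, iter.old_spte
		 * is refreshed, and the SPTE is re-evaluated via the retry
		 * label.
		 */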
1640 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1641 			goto retry;
1642 
1643 		spte_set = true;
1644 	}
1645 
1646 	rcu_read_unlock();
1647 	return spte_set;
1648 }
1649 
1650 /*
1651  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1652  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1653  * If AD bits are not enabled, this will require clearing the writable bit on
1654  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1655  * be flushed.
1656  */
1657 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1658 				  const struct kvm_memory_slot *slot)
1659 {
1660 	struct kvm_mmu_page *root;
1661 	bool spte_set = false;
1662 
1663 	lockdep_assert_held_read(&kvm->mmu_lock);
1664 
1665 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1666 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1667 				slot->base_gfn + slot->npages);
1668 
1669 	return spte_set;
1670 }
1671 
1672 /*
1673  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1674  * set in mask, starting at gfn. The given memslot is expected to contain all
1675  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1676  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1677  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1678  */
1679 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1680 				  gfn_t gfn, unsigned long mask, bool wrprot)
1681 {
1682 	struct tdp_iter iter;
1683 	u64 new_spte;
1684 
1685 	rcu_read_lock();
1686 
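	/*
	 * The mask covers dirty state for at most BITS_PER_LONG GFNs starting
	 * at gfn.  Begin the walk at the first GFN with a set bit and stop as
	 * soon as every set bit has been processed.
	 */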
1687 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1688 				    gfn + BITS_PER_LONG) {
1689 		if (!mask)
1690 			break;
1691 
1692 		if (iter.level > PG_LEVEL_4K ||
1693 		    !(mask & (1UL << (iter.gfn - gfn))))
1694 			continue;
1695 
1696 		mask &= ~(1UL << (iter.gfn - gfn));
1697 
1698 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1699 			if (is_writable_pte(iter.old_spte))
1700 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1701 			else
1702 				continue;
1703 		} else {
1704 			if (iter.old_spte & shadow_dirty_mask)
1705 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1706 			else
1707 				continue;
1708 		}
1709 
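		/*
		 * tdp_mmu_set_spte_no_dirty_log() skips the dirty-log
		 * bookkeeping, which only matters when a SPTE gains the
		 * writable bit; that cannot happen here since only the
		 * writable/dirty bits are being cleared.
		 */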
1710 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1711 	}
1712 
1713 	rcu_read_unlock();
1714 }
1715 
1716 /*
1717  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1718  * set in mask, starting at gfn. The given memslot is expected to contain all
1719  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1720  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1721  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1722  */
1723 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1724 				       struct kvm_memory_slot *slot,
1725 				       gfn_t gfn, unsigned long mask,
1726 				       bool wrprot)
1727 {
1728 	struct kvm_mmu_page *root;
1729 
1730 	lockdep_assert_held_write(&kvm->mmu_lock);
1731 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1732 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1733 }
1734 
1735 /*
1736  * Clear leaf entries which could be replaced by large mappings, for
1737  * GFNs within the slot.
1738  */
1739 static void zap_collapsible_spte_range(struct kvm *kvm,
1740 				       struct kvm_mmu_page *root,
1741 				       const struct kvm_memory_slot *slot)
1742 {
1743 	gfn_t start = slot->base_gfn;
1744 	gfn_t end = start + slot->npages;
1745 	struct tdp_iter iter;
1746 	kvm_pfn_t pfn;
1747 
1748 	rcu_read_lock();
1749 
1750 	tdp_root_for_each_pte(iter, root, start, end) {
1751 retry:
1752 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1753 			continue;
1754 
1755 		if (!is_shadow_present_pte(iter.old_spte) ||
1756 		    !is_last_spte(iter.old_spte, iter.level))
1757 			continue;
1758 
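		/*
		 * Only zap SPTEs that could actually be replaced by a larger
		 * mapping: skip reserved PFNs and SPTEs already mapped at the
		 * largest level the memslot and host mapping allow.
		 */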
1759 		pfn = spte_to_pfn(iter.old_spte);
1760 		if (kvm_is_reserved_pfn(pfn) ||
1761 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1762 							    pfn, PG_LEVEL_NUM))
1763 			continue;
1764 
1765 		/* Note, a successful atomic zap also does a remote TLB flush. */
1766 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1767 			goto retry;
1768 	}
1769 
1770 	rcu_read_unlock();
1771 }
1772 
1773 /*
1774  * Clear non-leaf entries (and free associated page tables) which could
1775  * be replaced by large mappings, for GFNs within the slot.
1776  */
1777 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1778 				       const struct kvm_memory_slot *slot)
1779 {
1780 	struct kvm_mmu_page *root;
1781 
1782 	lockdep_assert_held_read(&kvm->mmu_lock);
1783 
1784 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1785 		zap_collapsible_spte_range(kvm, root, slot);
1786 }
1787 
1788 /*
1789  * Removes write access on the last level SPTE mapping this GFN and unsets the
1790  * MMU-writable bit to ensure future writes continue to be intercepted.
1791  * Returns true if an SPTE was set and a TLB flush is needed.
1792  */
1793 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1794 			      gfn_t gfn, int min_level)
1795 {
1796 	struct tdp_iter iter;
1797 	u64 new_spte;
1798 	bool spte_set = false;
1799 
1800 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1801 
1802 	rcu_read_lock();
1803 
1804 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1805 		if (!is_shadow_present_pte(iter.old_spte) ||
1806 		    !is_last_spte(iter.old_spte, iter.level))
1807 			continue;
1808 
1809 		new_spte = iter.old_spte &
1810 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1811 
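		/*
		 * If clearing the writable and MMU-writable bits changes
		 * nothing, the GFN is already write-protected at this level
		 * and there is nothing more to do.
		 */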
1812 		if (new_spte == iter.old_spte)
1813 			break;
1814 
1815 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1816 		spte_set = true;
1817 	}
1818 
1819 	rcu_read_unlock();
1820 
1821 	return spte_set;
1822 }
1823 
1824 /*
1825  * Removes write access on the last level SPTE mapping this GFN and unsets the
1826  * MMU-writable bit to ensure future writes continue to be intercepted.
1827  * Returns true if an SPTE was set and a TLB flush is needed.
1828  */
1829 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1830 				   struct kvm_memory_slot *slot, gfn_t gfn,
1831 				   int min_level)
1832 {
1833 	struct kvm_mmu_page *root;
1834 	bool spte_set = false;
1835 
1836 	lockdep_assert_held_write(&kvm->mmu_lock);
1837 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1838 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1839 
1840 	return spte_set;
1841 }
1842 
1843 /*
1844  * Return the level of the lowest level SPTE added to sptes.
1845  * That SPTE may be non-present.
1846  *
1847  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1848  */
1849 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1850 			 int *root_level)
1851 {
1852 	struct tdp_iter iter;
1853 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1854 	gfn_t gfn = addr >> PAGE_SHIFT;
1855 	int leaf = -1;
1856 
1857 	*root_level = mmu->shadow_root_level;
1858 
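	/*
	 * Record each SPTE visited on the way down, indexed by level; the
	 * final value of leaf is the lowest level reached, whose SPTE may be
	 * non-present.
	 */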
1859 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1860 		leaf = iter.level;
1861 		sptes[leaf] = iter.old_spte;
1862 	}
1863 
1864 	return leaf;
1865 }
1866 
1867 /*
1868  * Returns the last level spte pointer of the shadow page walk for the given
1869  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1870  * walk could be performed, returns NULL and *spte does not contain valid data.
1871  *
1872  * Contract:
1873  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1874  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1875  *
1876  * WARNING: This function is only intended to be called during fast_page_fault.
1877  */
1878 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1879 					u64 *spte)
1880 {
1881 	struct tdp_iter iter;
1882 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1883 	gfn_t gfn = addr >> PAGE_SHIFT;
1884 	tdp_ptep_t sptep = NULL;
1885 
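	/*
	 * Walk down to the lowest reachable SPTE for the faulting address and
	 * remember the last sptep visited; that is the entry fast_page_fault()
	 * will attempt to fix up.
	 */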
1886 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1887 		*spte = iter.old_spte;
1888 		sptep = iter.sptep;
1889 	}
1890 
1891 	/*
1892 	 * Perform the rcu_dereference to get the raw spte pointer value since
1893 	 * we are passing it up to fast_page_fault, which is shared with the
1894 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1895 	 * annotation.
1896 	 *
1897 	 * This is safe since fast_page_fault obeys the contracts of this
1898 	 * function as well as all TDP MMU contracts around modifying SPTEs
1899 	 * outside of mmu_lock.
1900 	 */
1901 	return rcu_dereference(sptep);
1902 }
1903