xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 2dfb62d6ce80b3536d1a915177ae82496bd7ac4a)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
/*
 * Backing variable for the "tdp_mmu" module parameter (mode 0644).  It is
 * sampled with READ_ONCE() in kvm_mmu_init_tdp_mmu().
 */
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	struct workqueue_struct *wq;
20 
21 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 		return 0;
23 
24 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 	if (!wq)
26 		return -ENOMEM;
27 
28 	/* This should not be changed for the lifetime of the VM. */
29 	kvm->arch.tdp_mmu_enabled = true;
30 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 	kvm->arch.tdp_mmu_zap_wq = wq;
34 	return 1;
35 }
36 
37 /* Arbitrarily returns true so that this may be used in if statements. */
38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 							     bool shared)
40 {
41 	if (shared)
42 		lockdep_assert_held_read(&kvm->mmu_lock);
43 	else
44 		lockdep_assert_held_write(&kvm->mmu_lock);
45 
46 	return true;
47 }
48 
49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
50 {
51 	if (!kvm->arch.tdp_mmu_enabled)
52 		return;
53 
54 	/* Also waits for any queued work items.  */
55 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56 
57 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
58 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
59 
60 	/*
61 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
63 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
64 	 */
65 	rcu_barrier();
66 }
67 
68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
69 {
70 	free_page((unsigned long)sp->spt);
71 	kmem_cache_free(mmu_page_header_cache, sp);
72 }
73 
74 /*
75  * This is called through call_rcu in order to free TDP page table memory
76  * safely with respect to other kernel threads that may be operating on
77  * the memory.
78  * By only accessing TDP MMU page table memory in an RCU read critical
79  * section, and freeing it after a grace period, lockless access to that
80  * memory won't use it after it is freed.
81  */
82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
83 {
84 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 					       rcu_head);
86 
87 	tdp_mmu_free_sp(sp);
88 }
89 
90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
91 			     bool shared);
92 
93 static void tdp_mmu_zap_root_work(struct work_struct *work)
94 {
95 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 						 tdp_mmu_async_work);
97 	struct kvm *kvm = root->tdp_mmu_async_data;
98 
99 	read_lock(&kvm->mmu_lock);
100 
101 	/*
102 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
104 	 * to a different pCPU.  Note, the local TLB flush on reuse also
105 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 	 * intermediate paging structures, that may be zapped, as such entries
107 	 * are associated with the ASID on both VMX and SVM.
108 	 */
109 	tdp_mmu_zap_root(kvm, root, true);
110 
111 	/*
112 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 	 * avoiding an infinite loop.  By design, the root is reachable while
114 	 * it's being asynchronously zapped, thus a different task can put its
115 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 	 * asynchronously zapped root is unavoidable.
117 	 */
118 	kvm_tdp_mmu_put_root(kvm, root, true);
119 
120 	read_unlock(&kvm->mmu_lock);
121 }
122 
123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124 {
125 	root->tdp_mmu_async_data = kvm;
126 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
128 }
129 
130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
131 {
132 	union kvm_mmu_page_role role = page->role;
133 	role.invalid = true;
134 
135 	/* No need to use cmpxchg, only the invalid bit can change.  */
136 	role.word = xchg(&page->role.word, role.word);
137 	return role.invalid;
138 }
139 
140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
141 			  bool shared)
142 {
143 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
144 
145 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
146 		return;
147 
148 	WARN_ON(!root->tdp_mmu_page);
149 
150 	/*
151 	 * The root now has refcount=0.  It is valid, but readers already
152 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 	 * rejects it.  This remains true for the rest of the execution
154 	 * of this function, because readers visit valid roots only
155 	 * (except for tdp_mmu_zap_root_work(), which however
156 	 * does not acquire any reference itself).
157 	 *
158 	 * Even though there are flows that need to visit all roots for
159 	 * correctness, they all take mmu_lock for write, so they cannot yet
160 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 	 * since the root still has refcount=0.
162 	 *
163 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 	 * So the root temporarily gets an extra reference, going to refcount=1
166 	 * while staying invalid.  Readers still cannot acquire any reference;
167 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
168 	 * they might take an extra reference if they themselves yield.
169 	 * Therefore, when the reference is given back by the worker,
170 	 * there is no guarantee that the refcount is still 1.  If not, whoever
171 	 * puts the last reference will free the page, but they will not have to
172 	 * zap the root because a root cannot go from invalid to valid.
173 	 */
174 	if (!kvm_tdp_root_mark_invalid(root)) {
175 		refcount_set(&root->tdp_mmu_root_count, 1);
176 
177 		/*
178 		 * Zapping the root in a worker is not just "nice to have";
179 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
181 		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 		 * might return with some roots not zapped yet.
183 		 */
184 		tdp_mmu_schedule_zap_root(kvm, root);
185 		return;
186 	}
187 
188 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 	list_del_rcu(&root->link);
190 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
191 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
192 }
193 
194 /*
195  * Returns the next root after @prev_root (or the first root if @prev_root is
196  * NULL).  A reference to the returned root is acquired, and the reference to
197  * @prev_root is released (the caller obviously must hold a reference to
198  * @prev_root if it's non-NULL).
199  *
200  * If @only_valid is true, invalid roots are skipped.
201  *
202  * Returns NULL if the end of tdp_mmu_roots was reached.
203  */
204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
205 					      struct kvm_mmu_page *prev_root,
206 					      bool shared, bool only_valid)
207 {
208 	struct kvm_mmu_page *next_root;
209 
210 	rcu_read_lock();
211 
212 	if (prev_root)
213 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 						  &prev_root->link,
215 						  typeof(*prev_root), link);
216 	else
217 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 						   typeof(*next_root), link);
219 
220 	while (next_root) {
221 		if ((!only_valid || !next_root->role.invalid) &&
222 		    kvm_tdp_mmu_get_root(next_root))
223 			break;
224 
225 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 				&next_root->link, typeof(*next_root), link);
227 	}
228 
229 	rcu_read_unlock();
230 
231 	if (prev_root)
232 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
233 
234 	return next_root;
235 }
236 
/*
 * Iterate over the TDP MMU roots belonging to address space @_as_id.
 *
 * Each step goes through tdp_mmu_next_root(), which acquires a reference to
 * the root being visited and releases the reference to the previous one.
 * Holding that reference is what makes it safe to release the MMU lock and
 * yield inside the loop body.  A caller that exits the loop early must drop
 * the reference to the current root itself, unless it wants to keep a live
 * reference.
 *
 * @_shared states whether mmu_lock is held for read (true) or for write
 * (false); this is lockdep-asserted on every iteration.  If @_only_valid is
 * set, invalid roots are skipped.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared,	\
					   _only_valid)				\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root != NULL;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != (_as_id)) {			\
		} else

/* Yield-safe walk over valid roots only; the caller chooses the lock mode. */
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

/* Yield-safe walk over all roots, asserting mmu_lock is held for write. */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
260 
/*
 * Walk every TDP MMU root in address space @_as_id without taking references.
 *
 * mmu_lock must be held for write (lockdep-asserted on each iteration).  The
 * implication is that any flow holding mmu_lock for read is inherently
 * yield-friendly and should use the yield-safe iterators instead.  With the
 * lock held for write the list is guaranteed to be stable, so no RCU
 * protection is needed.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &(_kvm)->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != (_as_id)) {		\
		} else
273 
274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
275 {
276 	struct kvm_mmu_page *sp;
277 
278 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
280 
281 	return sp;
282 }
283 
284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 			    gfn_t gfn, union kvm_mmu_page_role role)
286 {
287 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
288 
289 	sp->role = role;
290 	sp->gfn = gfn;
291 	sp->ptep = sptep;
292 	sp->tdp_mmu_page = true;
293 
294 	trace_kvm_mmu_get_page(sp, true);
295 }
296 
297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 				  struct tdp_iter *iter)
299 {
300 	struct kvm_mmu_page *parent_sp;
301 	union kvm_mmu_page_role role;
302 
303 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
304 
305 	role = parent_sp->role;
306 	role.level--;
307 
308 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
309 }
310 
311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
312 {
313 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
314 	struct kvm *kvm = vcpu->kvm;
315 	struct kvm_mmu_page *root;
316 
317 	lockdep_assert_held_write(&kvm->mmu_lock);
318 
319 	/*
320 	 * Check for an existing root before allocating a new one.  Note, the
321 	 * role check prevents consuming an invalid root.
322 	 */
323 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
324 		if (root->role.word == role.word &&
325 		    kvm_tdp_mmu_get_root(root))
326 			goto out;
327 	}
328 
329 	root = tdp_mmu_alloc_sp(vcpu);
330 	tdp_mmu_init_sp(root, NULL, 0, role);
331 
332 	refcount_set(&root->tdp_mmu_root_count, 1);
333 
334 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
337 
338 out:
339 	return __pa(root->spt);
340 }
341 
342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 				u64 old_spte, u64 new_spte, int level,
344 				bool shared);
345 
346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
347 {
348 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
349 		return;
350 
351 	if (is_accessed_spte(old_spte) &&
352 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
354 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
355 }
356 
357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 					  u64 old_spte, u64 new_spte, int level)
359 {
360 	bool pfn_changed;
361 	struct kvm_memory_slot *slot;
362 
363 	if (level > PG_LEVEL_4K)
364 		return;
365 
366 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
367 
368 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 	    is_writable_pte(new_spte)) {
370 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
371 		mark_page_dirty_in_slot(kvm, slot, gfn);
372 	}
373 }
374 
375 /**
376  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
377  *
378  * @kvm: kvm instance
379  * @sp: the page to be removed
380  * @shared: This operation may not be running under the exclusive use of
381  *	    the MMU lock and the operation must synchronize with other
382  *	    threads that might be adding or removing pages.
383  */
384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 			      bool shared)
386 {
387 	if (shared)
388 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 	else
390 		lockdep_assert_held_write(&kvm->mmu_lock);
391 
392 	list_del(&sp->link);
393 	if (sp->lpage_disallowed)
394 		unaccount_huge_nx_page(kvm, sp);
395 
396 	if (shared)
397 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
398 }
399 
400 /**
401  * handle_removed_pt() - handle a page table removed from the TDP structure
402  *
403  * @kvm: kvm instance
404  * @pt: the page removed from the paging structure
405  * @shared: This operation may not be running under the exclusive use
406  *	    of the MMU lock and the operation must synchronize with other
407  *	    threads that might be modifying SPTEs.
408  *
409  * Given a page table that has been removed from the TDP paging structure,
410  * iterates through the page table to clear SPTEs and free child page tables.
411  *
412  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413  * protection. Since this thread removed it from the paging structure,
414  * this thread will be responsible for ensuring the page is freed. Hence the
415  * early rcu_dereferences in the function.
416  */
417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
418 {
419 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
420 	int level = sp->role.level;
421 	gfn_t base_gfn = sp->gfn;
422 	int i;
423 
424 	trace_kvm_mmu_prepare_zap_page(sp);
425 
426 	tdp_mmu_unlink_sp(kvm, sp, shared);
427 
428 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
429 		tdp_ptep_t sptep = pt + i;
430 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
431 		u64 old_spte;
432 
433 		if (shared) {
434 			/*
435 			 * Set the SPTE to a nonpresent value that other
436 			 * threads will not overwrite. If the SPTE was
437 			 * already marked as removed then another thread
438 			 * handling a page fault could overwrite it, so
439 			 * set the SPTE until it is set from some other
440 			 * value to the removed SPTE value.
441 			 */
442 			for (;;) {
443 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
444 				if (!is_removed_spte(old_spte))
445 					break;
446 				cpu_relax();
447 			}
448 		} else {
449 			/*
450 			 * If the SPTE is not MMU-present, there is no backing
451 			 * page associated with the SPTE and so no side effects
452 			 * that need to be recorded, and exclusive ownership of
453 			 * mmu_lock ensures the SPTE can't be made present.
454 			 * Note, zapping MMIO SPTEs is also unnecessary as they
455 			 * are guarded by the memslots generation, not by being
456 			 * unreachable.
457 			 */
458 			old_spte = kvm_tdp_mmu_read_spte(sptep);
459 			if (!is_shadow_present_pte(old_spte))
460 				continue;
461 
462 			/*
463 			 * Use the common helper instead of a raw WRITE_ONCE as
464 			 * the SPTE needs to be updated atomically if it can be
465 			 * modified by a different vCPU outside of mmu_lock.
466 			 * Even though the parent SPTE is !PRESENT, the TLB
467 			 * hasn't yet been flushed, and both Intel and AMD
468 			 * document that A/D assists can use upper-level PxE
469 			 * entries that are cached in the TLB, i.e. the CPU can
470 			 * still access the page and mark it dirty.
471 			 *
472 			 * No retry is needed in the atomic update path as the
473 			 * sole concern is dropping a Dirty bit, i.e. no other
474 			 * task can zap/remove the SPTE as mmu_lock is held for
475 			 * write.  Marking the SPTE as a removed SPTE is not
476 			 * strictly necessary for the same reason, but using
477 			 * the remove SPTE value keeps the shared/exclusive
478 			 * paths consistent and allows the handle_changed_spte()
479 			 * call below to hardcode the new value to REMOVED_SPTE.
480 			 *
481 			 * Note, even though dropping a Dirty bit is the only
482 			 * scenario where a non-atomic update could result in a
483 			 * functional bug, simply checking the Dirty bit isn't
484 			 * sufficient as a fast page fault could read the upper
485 			 * level SPTE before it is zapped, and then make this
486 			 * target SPTE writable, resume the guest, and set the
487 			 * Dirty bit between reading the SPTE above and writing
488 			 * it here.
489 			 */
490 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
491 							  REMOVED_SPTE, level);
492 		}
493 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
494 				    old_spte, REMOVED_SPTE, level, shared);
495 	}
496 
497 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
498 }
499 
500 /**
501  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
502  * @kvm: kvm instance
503  * @as_id: the address space of the paging structure the SPTE was a part of
504  * @gfn: the base GFN that was mapped by the SPTE
505  * @old_spte: The value of the SPTE before the change
506  * @new_spte: The value of the SPTE after the change
507  * @level: the level of the PT the SPTE is part of in the paging structure
508  * @shared: This operation may not be running under the exclusive use of
509  *	    the MMU lock and the operation must synchronize with other
510  *	    threads that might be modifying SPTEs.
511  *
512  * Handle bookkeeping that might result from the modification of a SPTE.
513  * This function must be called for all TDP SPTE modifications.
514  */
515 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
516 				  u64 old_spte, u64 new_spte, int level,
517 				  bool shared)
518 {
519 	bool was_present = is_shadow_present_pte(old_spte);
520 	bool is_present = is_shadow_present_pte(new_spte);
521 	bool was_leaf = was_present && is_last_spte(old_spte, level);
522 	bool is_leaf = is_present && is_last_spte(new_spte, level);
523 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
524 
525 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
526 	WARN_ON(level < PG_LEVEL_4K);
527 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
528 
529 	/*
530 	 * If this warning were to trigger it would indicate that there was a
531 	 * missing MMU notifier or a race with some notifier handler.
532 	 * A present, leaf SPTE should never be directly replaced with another
533 	 * present leaf SPTE pointing to a different PFN. A notifier handler
534 	 * should be zapping the SPTE before the main MM's page table is
535 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
536 	 * thread before replacement.
537 	 */
538 	if (was_leaf && is_leaf && pfn_changed) {
539 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
540 		       "SPTE with another present leaf SPTE mapping a\n"
541 		       "different PFN!\n"
542 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
543 		       as_id, gfn, old_spte, new_spte, level);
544 
545 		/*
546 		 * Crash the host to prevent error propagation and guest data
547 		 * corruption.
548 		 */
549 		BUG();
550 	}
551 
552 	if (old_spte == new_spte)
553 		return;
554 
555 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
556 
557 	if (is_leaf)
558 		check_spte_writable_invariants(new_spte);
559 
560 	/*
561 	 * The only times a SPTE should be changed from a non-present to
562 	 * non-present state is when an MMIO entry is installed/modified/
563 	 * removed. In that case, there is nothing to do here.
564 	 */
565 	if (!was_present && !is_present) {
566 		/*
567 		 * If this change does not involve a MMIO SPTE or removed SPTE,
568 		 * it is unexpected. Log the change, though it should not
569 		 * impact the guest since both the former and current SPTEs
570 		 * are nonpresent.
571 		 */
572 		if (WARN_ON(!is_mmio_spte(old_spte) &&
573 			    !is_mmio_spte(new_spte) &&
574 			    !is_removed_spte(new_spte)))
575 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
576 			       "should not be replaced with another,\n"
577 			       "different nonpresent SPTE, unless one or both\n"
578 			       "are MMIO SPTEs, or the new SPTE is\n"
579 			       "a temporary removed SPTE.\n"
580 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
581 			       as_id, gfn, old_spte, new_spte, level);
582 		return;
583 	}
584 
585 	if (is_leaf != was_leaf)
586 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
587 
588 	if (was_leaf && is_dirty_spte(old_spte) &&
589 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
590 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
591 
592 	/*
593 	 * Recursively handle child PTs if the change removed a subtree from
594 	 * the paging structure.  Note the WARN on the PFN changing without the
595 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
596 	 * pages are kernel allocations and should never be migrated.
597 	 */
598 	if (was_present && !was_leaf &&
599 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
600 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
601 }
602 
603 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
604 				u64 old_spte, u64 new_spte, int level,
605 				bool shared)
606 {
607 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
608 			      shared);
609 	handle_changed_spte_acc_track(old_spte, new_spte, level);
610 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
611 				      new_spte, level);
612 }
613 
614 /*
615  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
616  * and handle the associated bookkeeping.  Do not mark the page dirty
617  * in KVM's dirty bitmaps.
618  *
619  * If setting the SPTE fails because it has changed, iter->old_spte will be
620  * refreshed to the current value of the spte.
621  *
622  * @kvm: kvm instance
623  * @iter: a tdp_iter instance currently on the SPTE that should be set
624  * @new_spte: The value the SPTE should be set to
625  * Return:
626  * * 0      - If the SPTE was set.
627  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
628  *            no side-effects other than setting iter->old_spte to the last
629  *            known value of the spte.
630  */
631 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
632 					  struct tdp_iter *iter,
633 					  u64 new_spte)
634 {
635 	u64 *sptep = rcu_dereference(iter->sptep);
636 
637 	/*
638 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
639 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
640 	 * and pre-checking before inserting a new SPTE is advantageous as it
641 	 * avoids unnecessary work.
642 	 */
643 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
644 
645 	lockdep_assert_held_read(&kvm->mmu_lock);
646 
647 	/*
648 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
649 	 * does not hold the mmu_lock.
650 	 */
651 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
652 		return -EBUSY;
653 
654 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
655 			      new_spte, iter->level, true);
656 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
657 
658 	return 0;
659 }
660 
661 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
662 					  struct tdp_iter *iter)
663 {
664 	int ret;
665 
666 	/*
667 	 * Freeze the SPTE by setting it to a special,
668 	 * non-present value. This will stop other threads from
669 	 * immediately installing a present entry in its place
670 	 * before the TLBs are flushed.
671 	 */
672 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
673 	if (ret)
674 		return ret;
675 
676 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
677 					   KVM_PAGES_PER_HPAGE(iter->level));
678 
679 	/*
680 	 * No other thread can overwrite the removed SPTE as they must either
681 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
682 	 * overwrite the special removed SPTE value. No bookkeeping is needed
683 	 * here since the SPTE is going from non-present to non-present.  Use
684 	 * the raw write helper to avoid an unnecessary check on volatile bits.
685 	 */
686 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
687 
688 	return 0;
689 }
690 
691 
692 /*
693  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
694  * @kvm:	      KVM instance
695  * @as_id:	      Address space ID, i.e. regular vs. SMM
696  * @sptep:	      Pointer to the SPTE
697  * @old_spte:	      The current value of the SPTE
698  * @new_spte:	      The new value that will be set for the SPTE
699  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
700  * @level:	      The level _containing_ the SPTE (its parent PT's level)
701  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
702  *		      of the page. Should be set unless handling an MMU
703  *		      notifier for access tracking. Leaving record_acc_track
704  *		      unset in that case prevents page accesses from being
705  *		      double counted.
706  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
707  *		      appropriate for the change being made. Should be set
708  *		      unless performing certain dirty logging operations.
709  *		      Leaving record_dirty_log unset in that case prevents page
710  *		      writes from being double counted.
711  *
712  * Returns the old SPTE value, which _may_ be different than @old_spte if the
713  * SPTE had voldatile bits.
714  */
715 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
716 			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
717 			      bool record_acc_track, bool record_dirty_log)
718 {
719 	lockdep_assert_held_write(&kvm->mmu_lock);
720 
721 	/*
722 	 * No thread should be using this function to set SPTEs to or from the
723 	 * temporary removed SPTE value.
724 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
725 	 * should be used. If operating under the MMU lock in write mode, the
726 	 * use of the removed SPTE should not be necessary.
727 	 */
728 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
729 
730 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
731 
732 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
733 
734 	if (record_acc_track)
735 		handle_changed_spte_acc_track(old_spte, new_spte, level);
736 	if (record_dirty_log)
737 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
738 					      new_spte, level);
739 	return old_spte;
740 }
741 
742 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
743 				     u64 new_spte, bool record_acc_track,
744 				     bool record_dirty_log)
745 {
746 	WARN_ON_ONCE(iter->yielded);
747 
748 	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
749 					    iter->old_spte, new_spte,
750 					    iter->gfn, iter->level,
751 					    record_acc_track, record_dirty_log);
752 }
753 
754 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
755 				    u64 new_spte)
756 {
757 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
758 }
759 
760 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
761 						 struct tdp_iter *iter,
762 						 u64 new_spte)
763 {
764 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
765 }
766 
767 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
768 						 struct tdp_iter *iter,
769 						 u64 new_spte)
770 {
771 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
772 }
773 
/* Walk the SPTEs under @_root for GFNs in [@_start, @_end). */
#define tdp_root_for_each_pte(_iter, _root, _start, _end)	\
	for_each_tdp_pte(_iter, _root, _start, _end)

/* Same walk, but the loop body only runs for present leaf SPTEs. */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!(is_shadow_present_pte(_iter.old_spte) &&		\
		      is_last_spte(_iter.old_spte, _iter.level)))	\
			continue;					\
		else

/* Walk the SPTEs under the shadow page of @_mmu's current root. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page((_mmu)->root.hpa),	\
			 _start, _end)
786 
787 /*
788  * Yield if the MMU lock is contended or this thread needs to return control
789  * to the scheduler.
790  *
791  * If this function should yield and flush is set, it will perform a remote
792  * TLB flush before yielding.
793  *
794  * If this function yields, iter->yielded is set and the caller must skip to
795  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
796  * over the paging structures to allow the iterator to continue its traversal
797  * from the paging structure root.
798  *
799  * Returns true if this function yielded.
800  */
801 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
802 							  struct tdp_iter *iter,
803 							  bool flush, bool shared)
804 {
805 	WARN_ON(iter->yielded);
806 
807 	/* Ensure forward progress has been made before yielding. */
808 	if (iter->next_last_level_gfn == iter->yielded_gfn)
809 		return false;
810 
811 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
812 		if (flush)
813 			kvm_flush_remote_tlbs(kvm);
814 
815 		rcu_read_unlock();
816 
817 		if (shared)
818 			cond_resched_rwlock_read(&kvm->mmu_lock);
819 		else
820 			cond_resched_rwlock_write(&kvm->mmu_lock);
821 
822 		rcu_read_lock();
823 
824 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
825 
826 		iter->yielded = true;
827 	}
828 
829 	return iter->yielded;
830 }
831 
832 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
833 {
834 	/*
835 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
836 	 * a gpa range that would exceed the max gfn, and KVM does not create
837 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
838 	 * the slow emulation path every time.
839 	 */
840 	return kvm_mmu_max_gfn() + 1;
841 }
842 
843 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
844 			       bool shared, int zap_level)
845 {
846 	struct tdp_iter iter;
847 
848 	gfn_t end = tdp_mmu_max_gfn_exclusive();
849 	gfn_t start = 0;
850 
851 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
852 retry:
853 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
854 			continue;
855 
856 		if (!is_shadow_present_pte(iter.old_spte))
857 			continue;
858 
859 		if (iter.level > zap_level)
860 			continue;
861 
862 		if (!shared)
863 			tdp_mmu_set_spte(kvm, &iter, 0);
864 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
865 			goto retry;
866 	}
867 }
868 
869 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
870 			     bool shared)
871 {
872 
873 	/*
874 	 * The root must have an elevated refcount so that it's reachable via
875 	 * mmu_notifier callbacks, which allows this path to yield and drop
876 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
877 	 * must drop all references to relevant pages prior to completing the
878 	 * callback.  Dropping mmu_lock with an unreachable root would result
879 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
880 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
881 	 * dirty accessed bits to the SPTE's associated struct page.
882 	 */
883 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
884 
885 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
886 
887 	rcu_read_lock();
888 
889 	/*
890 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
891 	 * split the zap into two passes.  On the first pass, zap at the 1gb
892 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
893 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
894 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
895 	 *
896 	 * Because zapping a SP recurses on its children, stepping down to
897 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
898 	 */
899 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
900 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
901 
902 	rcu_read_unlock();
903 }
904 
905 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
906 {
907 	u64 old_spte;
908 
909 	/*
910 	 * This helper intentionally doesn't allow zapping a root shadow page,
911 	 * which doesn't have a parent page table and thus no associated entry.
912 	 */
913 	if (WARN_ON_ONCE(!sp->ptep))
914 		return false;
915 
916 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
917 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
918 		return false;
919 
920 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
921 			   sp->gfn, sp->role.level + 1, true, true);
922 
923 	return true;
924 }
925 
926 /*
927  * If can_yield is true, will release the MMU lock and reschedule if the
928  * scheduler needs the CPU or there is contention on the MMU lock. If this
929  * function cannot yield, it will not release the MMU lock or reschedule and
930  * the caller must ensure it does not supply too large a GFN range, or the
931  * operation can cause a soft lockup.
932  */
933 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
934 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
935 {
936 	struct tdp_iter iter;
937 
938 	end = min(end, tdp_mmu_max_gfn_exclusive());
939 
940 	lockdep_assert_held_write(&kvm->mmu_lock);
941 
942 	rcu_read_lock();
943 
944 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
945 		if (can_yield &&
946 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
947 			flush = false;
948 			continue;
949 		}
950 
951 		if (!is_shadow_present_pte(iter.old_spte) ||
952 		    !is_last_spte(iter.old_spte, iter.level))
953 			continue;
954 
955 		tdp_mmu_set_spte(kvm, &iter, 0);
956 		flush = true;
957 	}
958 
959 	rcu_read_unlock();
960 
961 	/*
962 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
963 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
964 	 */
965 	return flush;
966 }
967 
968 /*
969  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
970  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
971  * more SPTEs were zapped since the MMU lock was last acquired.
972  */
973 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
974 			   bool can_yield, bool flush)
975 {
976 	struct kvm_mmu_page *root;
977 
978 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
979 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
980 
981 	return flush;
982 }
983 
984 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
985 {
986 	struct kvm_mmu_page *root;
987 	int i;
988 
989 	/*
990 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
991 	 * before returning to the caller.  Zap directly even if the root is
992 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
993 	 * all that expensive and mmu_lock is already held, which means the
994 	 * worker has yielded, i.e. flushing the work instead of zapping here
995 	 * isn't guaranteed to be any faster.
996 	 *
997 	 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
998 	 * is being destroyed or the userspace VMM has exited.  In both cases,
999 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1000 	 */
1001 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1002 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1003 			tdp_mmu_zap_root(kvm, root, false);
1004 	}
1005 }
1006 
1007 /*
1008  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1009  * zap" completes.
1010  */
1011 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1012 {
1013 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1014 }
1015 
1016 /*
1017  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1018  * is about to be zapped, e.g. in response to a memslots update.  The actual
1019  * zapping is performed asynchronously, so a reference is taken on all roots.
1020  * Using a separate workqueue makes it easy to ensure that the destruction is
1021  * performed before the "fast zap" completes, without keeping a separate list
1022  * of invalidated roots; the list is effectively the list of work items in
1023  * the workqueue.
1024  *
1025  * Get a reference even if the root is already invalid, the asynchronous worker
1026  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1027  * is held for write, it should be impossible to observe a root with zero refcount,
1028  * i.e. the list of roots cannot be stale.
1029  *
1030  * This has essentially the same effect for the TDP MMU
1031  * as updating mmu_valid_gen does for the shadow MMU.
1032  */
1033 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1034 {
1035 	struct kvm_mmu_page *root;
1036 
1037 	lockdep_assert_held_write(&kvm->mmu_lock);
1038 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1039 		if (!root->role.invalid &&
1040 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1041 			root->role.invalid = true;
1042 			tdp_mmu_schedule_zap_root(kvm, root);
1043 		}
1044 	}
1045 }
1046 
1047 /*
1048  * Installs a last-level SPTE to handle a TDP page fault.
1049  * (NPT/EPT violation/misconfiguration)
1050  */
1051 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1052 					  struct kvm_page_fault *fault,
1053 					  struct tdp_iter *iter)
1054 {
1055 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1056 	u64 new_spte;
1057 	int ret = RET_PF_FIXED;
1058 	bool wrprot = false;
1059 
1060 	WARN_ON(sp->role.level != fault->goal_level);
1061 	if (unlikely(!fault->slot))
1062 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1063 	else
1064 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1065 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1066 					 fault->map_writable, &new_spte);
1067 
1068 	if (new_spte == iter->old_spte)
1069 		ret = RET_PF_SPURIOUS;
1070 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1071 		return RET_PF_RETRY;
1072 	else if (is_shadow_present_pte(iter->old_spte) &&
1073 		 !is_last_spte(iter->old_spte, iter->level))
1074 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1075 						   KVM_PAGES_PER_HPAGE(iter->level + 1));
1076 
1077 	/*
1078 	 * If the page fault was caused by a write but the page is write
1079 	 * protected, emulation is needed. If the emulation was skipped,
1080 	 * the vCPU would have the same fault again.
1081 	 */
1082 	if (wrprot) {
1083 		if (fault->write)
1084 			ret = RET_PF_EMULATE;
1085 	}
1086 
1087 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1088 	if (unlikely(is_mmio_spte(new_spte))) {
1089 		vcpu->stat.pf_mmio_spte_created++;
1090 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1091 				     new_spte);
1092 		ret = RET_PF_EMULATE;
1093 	} else {
1094 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1095 				       rcu_dereference(iter->sptep));
1096 	}
1097 
1098 	return ret;
1099 }
1100 
1101 /*
1102  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1103  * provided page table.
1104  *
1105  * @kvm: kvm instance
1106  * @iter: a tdp_iter instance currently on the SPTE that should be set
1107  * @sp: The new TDP page table to install.
1108  * @account_nx: True if this page table is being installed to split a
1109  *              non-executable huge page.
1110  * @shared: This operation is running under the MMU lock in read mode.
1111  *
1112  * Returns: 0 if the new page table was installed. Non-0 if the page table
1113  *          could not be installed (e.g. the atomic compare-exchange failed).
1114  */
1115 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1116 			   struct kvm_mmu_page *sp, bool account_nx,
1117 			   bool shared)
1118 {
1119 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1120 	int ret = 0;
1121 
1122 	if (shared) {
1123 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1124 		if (ret)
1125 			return ret;
1126 	} else {
1127 		tdp_mmu_set_spte(kvm, iter, spte);
1128 	}
1129 
1130 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1131 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1132 	if (account_nx)
1133 		account_huge_nx_page(kvm, sp);
1134 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1135 
1136 	return 0;
1137 }
1138 
1139 /*
1140  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1141  * page tables and SPTEs to translate the faulting guest physical address.
1142  */
1143 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1144 {
1145 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1146 	struct tdp_iter iter;
1147 	struct kvm_mmu_page *sp;
1148 	int ret;
1149 
1150 	kvm_mmu_hugepage_adjust(vcpu, fault);
1151 
1152 	trace_kvm_mmu_spte_requested(fault);
1153 
1154 	rcu_read_lock();
1155 
1156 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1157 		if (fault->nx_huge_page_workaround_enabled)
1158 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1159 
1160 		if (iter.level == fault->goal_level)
1161 			break;
1162 
1163 		/*
1164 		 * If there is an SPTE mapping a large page at a higher level
1165 		 * than the target, that SPTE must be cleared and replaced
1166 		 * with a non-leaf SPTE.
1167 		 */
1168 		if (is_shadow_present_pte(iter.old_spte) &&
1169 		    is_large_pte(iter.old_spte)) {
1170 			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1171 				break;
1172 
1173 			/*
1174 			 * The iter must explicitly re-read the spte here
1175 			 * because the new value informs the !present
1176 			 * path below.
1177 			 */
1178 			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1179 		}
1180 
1181 		if (!is_shadow_present_pte(iter.old_spte)) {
1182 			bool account_nx = fault->huge_page_disallowed &&
1183 					  fault->req_level >= iter.level;
1184 
1185 			/*
1186 			 * If SPTE has been frozen by another thread, just
1187 			 * give up and retry, avoiding unnecessary page table
1188 			 * allocation and free.
1189 			 */
1190 			if (is_removed_spte(iter.old_spte))
1191 				break;
1192 
1193 			sp = tdp_mmu_alloc_sp(vcpu);
1194 			tdp_mmu_init_child_sp(sp, &iter);
1195 
1196 			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1197 				tdp_mmu_free_sp(sp);
1198 				break;
1199 			}
1200 		}
1201 	}
1202 
1203 	/*
1204 	 * Force the guest to retry the access if the upper level SPTEs aren't
1205 	 * in place, or if the target leaf SPTE is frozen by another CPU.
1206 	 */
1207 	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1208 		rcu_read_unlock();
1209 		return RET_PF_RETRY;
1210 	}
1211 
1212 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1213 	rcu_read_unlock();
1214 
1215 	return ret;
1216 }
1217 
1218 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1219 				 bool flush)
1220 {
1221 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1222 				     range->end, range->may_block, flush);
1223 }
1224 
1225 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1226 			      struct kvm_gfn_range *range);
1227 
1228 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1229 						   struct kvm_gfn_range *range,
1230 						   tdp_handler_t handler)
1231 {
1232 	struct kvm_mmu_page *root;
1233 	struct tdp_iter iter;
1234 	bool ret = false;
1235 
1236 	/*
1237 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1238 	 * into this helper allow blocking; it'd be dead, wasteful code.
1239 	 */
1240 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1241 		rcu_read_lock();
1242 
1243 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1244 			ret |= handler(kvm, &iter, range);
1245 
1246 		rcu_read_unlock();
1247 	}
1248 
1249 	return ret;
1250 }
1251 
1252 /*
1253  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1254  * if any of the GFNs in the range have been accessed.
1255  */
1256 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1257 			  struct kvm_gfn_range *range)
1258 {
1259 	u64 new_spte = 0;
1260 
1261 	/* If we have a non-accessed entry we don't need to change the pte. */
1262 	if (!is_accessed_spte(iter->old_spte))
1263 		return false;
1264 
1265 	new_spte = iter->old_spte;
1266 
1267 	if (spte_ad_enabled(new_spte)) {
1268 		new_spte &= ~shadow_accessed_mask;
1269 	} else {
1270 		/*
1271 		 * Capture the dirty status of the page, so that it doesn't get
1272 		 * lost when the SPTE is marked for access tracking.
1273 		 */
1274 		if (is_writable_pte(new_spte))
1275 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1276 
1277 		new_spte = mark_spte_for_access_track(new_spte);
1278 	}
1279 
1280 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1281 
1282 	return true;
1283 }
1284 
1285 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1286 {
1287 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1288 }
1289 
1290 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1291 			 struct kvm_gfn_range *range)
1292 {
1293 	return is_accessed_spte(iter->old_spte);
1294 }
1295 
1296 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1297 {
1298 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1299 }
1300 
1301 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1302 			 struct kvm_gfn_range *range)
1303 {
1304 	u64 new_spte;
1305 
1306 	/* Huge pages aren't expected to be modified without first being zapped. */
1307 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1308 
1309 	if (iter->level != PG_LEVEL_4K ||
1310 	    !is_shadow_present_pte(iter->old_spte))
1311 		return false;
1312 
1313 	/*
1314 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1315 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1316 	 * invariant that the PFN of a present * leaf SPTE can never change.
1317 	 * See __handle_changed_spte().
1318 	 */
1319 	tdp_mmu_set_spte(kvm, iter, 0);
1320 
1321 	if (!pte_write(range->pte)) {
1322 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1323 								  pte_pfn(range->pte));
1324 
1325 		tdp_mmu_set_spte(kvm, iter, new_spte);
1326 	}
1327 
1328 	return true;
1329 }
1330 
1331 /*
1332  * Handle the changed_pte MMU notifier for the TDP MMU.
1333  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1334  * notifier.
1335  * Returns non-zero if a flush is needed before releasing the MMU lock.
1336  */
1337 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1338 {
1339 	/*
1340 	 * No need to handle the remote TLB flush under RCU protection, the
1341 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1342 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1343 	 */
1344 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1345 }
1346 
1347 /*
1348  * Remove write access from all SPTEs at or above min_level that map GFNs
1349  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1350  * be flushed.
1351  */
1352 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1353 			     gfn_t start, gfn_t end, int min_level)
1354 {
1355 	struct tdp_iter iter;
1356 	u64 new_spte;
1357 	bool spte_set = false;
1358 
1359 	rcu_read_lock();
1360 
1361 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1362 
1363 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1364 retry:
1365 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1366 			continue;
1367 
1368 		if (!is_shadow_present_pte(iter.old_spte) ||
1369 		    !is_last_spte(iter.old_spte, iter.level) ||
1370 		    !(iter.old_spte & PT_WRITABLE_MASK))
1371 			continue;
1372 
1373 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1374 
1375 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1376 			goto retry;
1377 
1378 		spte_set = true;
1379 	}
1380 
1381 	rcu_read_unlock();
1382 	return spte_set;
1383 }
1384 
1385 /*
1386  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1387  * only affect leaf SPTEs down to min_level.
1388  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1389  */
1390 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1391 			     const struct kvm_memory_slot *slot, int min_level)
1392 {
1393 	struct kvm_mmu_page *root;
1394 	bool spte_set = false;
1395 
1396 	lockdep_assert_held_read(&kvm->mmu_lock);
1397 
1398 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1399 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1400 			     slot->base_gfn + slot->npages, min_level);
1401 
1402 	return spte_set;
1403 }
1404 
1405 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1406 {
1407 	struct kvm_mmu_page *sp;
1408 
1409 	gfp |= __GFP_ZERO;
1410 
1411 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1412 	if (!sp)
1413 		return NULL;
1414 
1415 	sp->spt = (void *)__get_free_page(gfp);
1416 	if (!sp->spt) {
1417 		kmem_cache_free(mmu_page_header_cache, sp);
1418 		return NULL;
1419 	}
1420 
1421 	return sp;
1422 }
1423 
1424 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1425 						       struct tdp_iter *iter,
1426 						       bool shared)
1427 {
1428 	struct kvm_mmu_page *sp;
1429 
1430 	/*
1431 	 * Since we are allocating while under the MMU lock we have to be
1432 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1433 	 * reclaim and to avoid making any filesystem callbacks (which can end
1434 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1435 	 *
1436 	 * If this allocation fails we drop the lock and retry with reclaim
1437 	 * allowed.
1438 	 */
1439 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1440 	if (sp)
1441 		return sp;
1442 
1443 	rcu_read_unlock();
1444 
1445 	if (shared)
1446 		read_unlock(&kvm->mmu_lock);
1447 	else
1448 		write_unlock(&kvm->mmu_lock);
1449 
1450 	iter->yielded = true;
1451 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1452 
1453 	if (shared)
1454 		read_lock(&kvm->mmu_lock);
1455 	else
1456 		write_lock(&kvm->mmu_lock);
1457 
1458 	rcu_read_lock();
1459 
1460 	return sp;
1461 }
1462 
1463 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1464 				   struct kvm_mmu_page *sp, bool shared)
1465 {
1466 	const u64 huge_spte = iter->old_spte;
1467 	const int level = iter->level;
1468 	int ret, i;
1469 
1470 	tdp_mmu_init_child_sp(sp, iter);
1471 
1472 	/*
1473 	 * No need for atomics when writing to sp->spt since the page table has
1474 	 * not been linked in yet and thus is not reachable from any other CPU.
1475 	 */
1476 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1477 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1478 
1479 	/*
1480 	 * Replace the huge spte with a pointer to the populated lower level
1481 	 * page table. Since we are making this change without a TLB flush vCPUs
1482 	 * will see a mix of the split mappings and the original huge mapping,
1483 	 * depending on what's currently in their TLB. This is fine from a
1484 	 * correctness standpoint since the translation will be the same either
1485 	 * way.
1486 	 */
1487 	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1488 	if (ret)
1489 		goto out;
1490 
1491 	/*
1492 	 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1493 	 * are overwriting from the page stats. But we have to manually update
1494 	 * the page stats with the new present child pages.
1495 	 */
1496 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1497 
1498 out:
1499 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1500 	return ret;
1501 }
1502 
1503 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1504 					 struct kvm_mmu_page *root,
1505 					 gfn_t start, gfn_t end,
1506 					 int target_level, bool shared)
1507 {
1508 	struct kvm_mmu_page *sp = NULL;
1509 	struct tdp_iter iter;
1510 	int ret = 0;
1511 
1512 	rcu_read_lock();
1513 
1514 	/*
1515 	 * Traverse the page table splitting all huge pages above the target
1516 	 * level into one lower level. For example, if we encounter a 1GB page
1517 	 * we split it into 512 2MB pages.
1518 	 *
1519 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1520 	 * to visit an SPTE before ever visiting its children, which means we
1521 	 * will correctly recursively split huge pages that are more than one
1522 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1523 	 * and then splitting each of those to 512 4KB pages).
1524 	 */
1525 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1526 retry:
1527 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1528 			continue;
1529 
1530 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1531 			continue;
1532 
1533 		if (!sp) {
1534 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1535 			if (!sp) {
1536 				ret = -ENOMEM;
1537 				trace_kvm_mmu_split_huge_page(iter.gfn,
1538 							      iter.old_spte,
1539 							      iter.level, ret);
1540 				break;
1541 			}
1542 
1543 			if (iter.yielded)
1544 				continue;
1545 		}
1546 
1547 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1548 			goto retry;
1549 
1550 		sp = NULL;
1551 	}
1552 
1553 	rcu_read_unlock();
1554 
1555 	/*
1556 	 * It's possible to exit the loop having never used the last sp if, for
1557 	 * example, a vCPU doing HugePage NX splitting wins the race and
1558 	 * installs its own sp in place of the last sp we tried to split.
1559 	 */
1560 	if (sp)
1561 		tdp_mmu_free_sp(sp);
1562 
1563 	return ret;
1564 }
1565 
1566 
1567 /*
1568  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1569  */
1570 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1571 				      const struct kvm_memory_slot *slot,
1572 				      gfn_t start, gfn_t end,
1573 				      int target_level, bool shared)
1574 {
1575 	struct kvm_mmu_page *root;
1576 	int r = 0;
1577 
1578 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1579 
1580 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1581 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1582 		if (r) {
1583 			kvm_tdp_mmu_put_root(kvm, root, shared);
1584 			break;
1585 		}
1586 	}
1587 }
1588 
1589 /*
1590  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1591  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1592  * If AD bits are not enabled, this will require clearing the writable bit on
1593  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1594  * be flushed.
1595  */
1596 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1597 			   gfn_t start, gfn_t end)
1598 {
1599 	struct tdp_iter iter;
1600 	u64 new_spte;
1601 	bool spte_set = false;
1602 
1603 	rcu_read_lock();
1604 
1605 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1606 retry:
1607 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1608 			continue;
1609 
1610 		if (!is_shadow_present_pte(iter.old_spte))
1611 			continue;
1612 
1613 		if (spte_ad_need_write_protect(iter.old_spte)) {
1614 			if (is_writable_pte(iter.old_spte))
1615 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1616 			else
1617 				continue;
1618 		} else {
1619 			if (iter.old_spte & shadow_dirty_mask)
1620 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1621 			else
1622 				continue;
1623 		}
1624 
1625 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1626 			goto retry;
1627 
1628 		spte_set = true;
1629 	}
1630 
1631 	rcu_read_unlock();
1632 	return spte_set;
1633 }
1634 
1635 /*
1636  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1637  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1638  * If AD bits are not enabled, this will require clearing the writable bit on
1639  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1640  * be flushed.
1641  */
1642 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1643 				  const struct kvm_memory_slot *slot)
1644 {
1645 	struct kvm_mmu_page *root;
1646 	bool spte_set = false;
1647 
1648 	lockdep_assert_held_read(&kvm->mmu_lock);
1649 
1650 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1651 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1652 				slot->base_gfn + slot->npages);
1653 
1654 	return spte_set;
1655 }
1656 
1657 /*
1658  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1659  * set in mask, starting at gfn. The given memslot is expected to contain all
1660  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1661  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1662  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1663  */
1664 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1665 				  gfn_t gfn, unsigned long mask, bool wrprot)
1666 {
1667 	struct tdp_iter iter;
1668 	u64 new_spte;
1669 
1670 	rcu_read_lock();
1671 
1672 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1673 				    gfn + BITS_PER_LONG) {
1674 		if (!mask)
1675 			break;
1676 
1677 		if (iter.level > PG_LEVEL_4K ||
1678 		    !(mask & (1UL << (iter.gfn - gfn))))
1679 			continue;
1680 
1681 		mask &= ~(1UL << (iter.gfn - gfn));
1682 
1683 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1684 			if (is_writable_pte(iter.old_spte))
1685 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1686 			else
1687 				continue;
1688 		} else {
1689 			if (iter.old_spte & shadow_dirty_mask)
1690 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1691 			else
1692 				continue;
1693 		}
1694 
1695 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1696 	}
1697 
1698 	rcu_read_unlock();
1699 }
1700 
1701 /*
1702  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1703  * set in mask, starting at gfn. The given memslot is expected to contain all
1704  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1705  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1706  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1707  */
1708 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1709 				       struct kvm_memory_slot *slot,
1710 				       gfn_t gfn, unsigned long mask,
1711 				       bool wrprot)
1712 {
1713 	struct kvm_mmu_page *root;
1714 
1715 	lockdep_assert_held_write(&kvm->mmu_lock);
1716 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1717 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1718 }
1719 
1720 static void zap_collapsible_spte_range(struct kvm *kvm,
1721 				       struct kvm_mmu_page *root,
1722 				       const struct kvm_memory_slot *slot)
1723 {
1724 	gfn_t start = slot->base_gfn;
1725 	gfn_t end = start + slot->npages;
1726 	struct tdp_iter iter;
1727 	int max_mapping_level;
1728 
1729 	rcu_read_lock();
1730 
1731 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1732 retry:
1733 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1734 			continue;
1735 
1736 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1737 		    !is_shadow_present_pte(iter.old_spte))
1738 			continue;
1739 
1740 		/*
1741 		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1742 		 * a large page size, then its parent would have been zapped
1743 		 * instead of stepping down.
1744 		 */
1745 		if (is_last_spte(iter.old_spte, iter.level))
1746 			continue;
1747 
1748 		/*
1749 		 * If iter.gfn resides outside of the slot, i.e. the page for
1750 		 * the current level overlaps but is not contained by the slot,
1751 		 * then the SPTE can't be made huge.  More importantly, trying
1752 		 * to query that info from slot->arch.lpage_info will cause an
1753 		 * out-of-bounds access.
1754 		 */
1755 		if (iter.gfn < start || iter.gfn >= end)
1756 			continue;
1757 
1758 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1759 							      iter.gfn, PG_LEVEL_NUM);
1760 		if (max_mapping_level < iter.level)
1761 			continue;
1762 
1763 		/* Note, a successful atomic zap also does a remote TLB flush. */
1764 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1765 			goto retry;
1766 	}
1767 
1768 	rcu_read_unlock();
1769 }
1770 
1771 /*
1772  * Zap non-leaf SPTEs (and free their associated page tables) which could
1773  * be replaced by huge pages, for GFNs within the slot.
1774  */
1775 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1776 				       const struct kvm_memory_slot *slot)
1777 {
1778 	struct kvm_mmu_page *root;
1779 
1780 	lockdep_assert_held_read(&kvm->mmu_lock);
1781 
1782 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1783 		zap_collapsible_spte_range(kvm, root, slot);
1784 }
1785 
1786 /*
1787  * Removes write access on the last level SPTE mapping this GFN and unsets the
1788  * MMU-writable bit to ensure future writes continue to be intercepted.
1789  * Returns true if an SPTE was set and a TLB flush is needed.
1790  */
1791 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1792 			      gfn_t gfn, int min_level)
1793 {
1794 	struct tdp_iter iter;
1795 	u64 new_spte;
1796 	bool spte_set = false;
1797 
1798 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1799 
1800 	rcu_read_lock();
1801 
1802 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1803 		if (!is_shadow_present_pte(iter.old_spte) ||
1804 		    !is_last_spte(iter.old_spte, iter.level))
1805 			continue;
1806 
1807 		new_spte = iter.old_spte &
1808 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1809 
1810 		if (new_spte == iter.old_spte)
1811 			break;
1812 
1813 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1814 		spte_set = true;
1815 	}
1816 
1817 	rcu_read_unlock();
1818 
1819 	return spte_set;
1820 }
1821 
1822 /*
1823  * Removes write access on the last level SPTE mapping this GFN and unsets the
1824  * MMU-writable bit to ensure future writes continue to be intercepted.
1825  * Returns true if an SPTE was set and a TLB flush is needed.
1826  */
1827 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1828 				   struct kvm_memory_slot *slot, gfn_t gfn,
1829 				   int min_level)
1830 {
1831 	struct kvm_mmu_page *root;
1832 	bool spte_set = false;
1833 
1834 	lockdep_assert_held_write(&kvm->mmu_lock);
1835 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1836 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1837 
1838 	return spte_set;
1839 }
1840 
1841 /*
1842  * Return the level of the lowest level SPTE added to sptes.
1843  * That SPTE may be non-present.
1844  *
1845  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1846  */
1847 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1848 			 int *root_level)
1849 {
1850 	struct tdp_iter iter;
1851 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1852 	gfn_t gfn = addr >> PAGE_SHIFT;
1853 	int leaf = -1;
1854 
1855 	*root_level = vcpu->arch.mmu->root_role.level;
1856 
1857 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1858 		leaf = iter.level;
1859 		sptes[leaf] = iter.old_spte;
1860 	}
1861 
1862 	return leaf;
1863 }
1864 
1865 /*
1866  * Returns the last level spte pointer of the shadow page walk for the given
1867  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1868  * walk could be performed, returns NULL and *spte does not contain valid data.
1869  *
1870  * Contract:
1871  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1872  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1873  *
1874  * WARNING: This function is only intended to be called during fast_page_fault.
1875  */
1876 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1877 					u64 *spte)
1878 {
1879 	struct tdp_iter iter;
1880 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1881 	gfn_t gfn = addr >> PAGE_SHIFT;
1882 	tdp_ptep_t sptep = NULL;
1883 
1884 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1885 		*spte = iter.old_spte;
1886 		sptep = iter.sptep;
1887 	}
1888 
1889 	/*
1890 	 * Perform the rcu_dereference to get the raw spte pointer value since
1891 	 * we are passing it up to fast_page_fault, which is shared with the
1892 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1893 	 * annotation.
1894 	 *
1895 	 * This is safe since fast_page_fault obeys the contracts of this
1896 	 * function as well as all TDP MMU contracts around modifying SPTEs
1897 	 * outside of mmu_lock.
1898 	 */
1899 	return rcu_dereference(sptep);
1900 }
1901