xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision e7300870)
1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3 
4 #include "mmu.h"
5 #include "mmu_internal.h"
6 #include "mmutrace.h"
7 #include "tdp_iter.h"
8 #include "tdp_mmu.h"
9 #include "spte.h"
10 
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
13 
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16 {
17 	struct workqueue_struct *wq;
18 
19 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
20 	if (!wq)
21 		return -ENOMEM;
22 
23 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
24 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25 	kvm->arch.tdp_mmu_zap_wq = wq;
26 	return 1;
27 }
28 
29 /* Arbitrarily returns true so that this may be used in if statements. */
30 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
31 							     bool shared)
32 {
33 	if (shared)
34 		lockdep_assert_held_read(&kvm->mmu_lock);
35 	else
36 		lockdep_assert_held_write(&kvm->mmu_lock);
37 
38 	return true;
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	/* Also waits for any queued work items.  */
44 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
45 
46 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
52 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
53 	 */
54 	rcu_barrier();
55 }
56 
57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
58 {
59 	free_page((unsigned long)sp->spt);
60 	kmem_cache_free(mmu_page_header_cache, sp);
61 }
62 
63 /*
64  * This is called through call_rcu in order to free TDP page table memory
65  * safely with respect to other kernel threads that may be operating on
66  * the memory.
67  * By only accessing TDP MMU page table memory in an RCU read critical
68  * section, and freeing it after a grace period, lockless access to that
69  * memory won't use it after it is freed.
70  */
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72 {
73 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74 					       rcu_head);
75 
76 	tdp_mmu_free_sp(sp);
77 }
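
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the read side of the scheme described above: any lockless access to
 * TDP MMU page table memory is bracketed by an RCU read-side critical
 * section, so pages freed via call_rcu() above cannot be reused while a
 * reader might still dereference them.  The function name and the
 * single-SPTE read are assumptions made purely for illustration.
 */
static u64 __maybe_unused example_read_spte_locklessly(tdp_ptep_t sptep)
{
	u64 spte;

	rcu_read_lock();
	/* Safe: the backing page is only freed after a grace period. */
	spte = kvm_tdp_mmu_read_spte(sptep);
	rcu_read_unlock();

	return spte;
}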
78 
79 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
80 			     bool shared);
81 
82 static void tdp_mmu_zap_root_work(struct work_struct *work)
83 {
84 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
85 						 tdp_mmu_async_work);
86 	struct kvm *kvm = root->tdp_mmu_async_data;
87 
88 	read_lock(&kvm->mmu_lock);
89 
90 	/*
91 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
92 	 * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
93 	 * to a different pCPU.  Note, the local TLB flush on reuse also
94 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
95 	 * intermediate paging structures, that may be zapped, as such entries
96 	 * are associated with the ASID on both VMX and SVM.
97 	 */
98 	tdp_mmu_zap_root(kvm, root, true);
99 
100 	/*
101 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
102 	 * avoiding an infinite loop.  By design, the root is reachable while
103 	 * it's being asynchronously zapped, thus a different task can put its
104 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
105 	 * asynchronously zapped root is unavoidable.
106 	 */
107 	kvm_tdp_mmu_put_root(kvm, root, true);
108 
109 	read_unlock(&kvm->mmu_lock);
110 }
111 
112 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
113 {
114 	root->tdp_mmu_async_data = kvm;
115 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
116 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
117 }
118 
119 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
120 {
121 	union kvm_mmu_page_role role = page->role;
122 	role.invalid = true;
123 
124 	/* No need to use cmpxchg, only the invalid bit can change.  */
125 	role.word = xchg(&page->role.word, role.word);
126 	return role.invalid;
127 }
128 
129 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
130 			  bool shared)
131 {
132 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
133 
134 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
135 		return;
136 
137 	WARN_ON(!is_tdp_mmu_page(root));
138 
139 	/*
140 	 * The root now has refcount=0.  It is valid, but readers already
141 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
142 	 * rejects it.  This remains true for the rest of the execution
143 	 * of this function, because readers visit valid roots only
144 	 * (except for tdp_mmu_zap_root_work(), which however
145 	 * does not acquire any reference itself).
146 	 *
147 	 * Even though there are flows that need to visit all roots for
148 	 * correctness, they all take mmu_lock for write, so they cannot yet
149 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
150 	 * since the root still has refcount=0.
151 	 *
152 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
153 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
154 	 * So the root temporarily gets an extra reference, going to refcount=1
155 	 * while staying invalid.  Readers still cannot acquire any reference;
156 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
157 	 * they might take an extra reference if they themselves yield.
158 	 * Therefore, when the reference is given back by the worker,
159 	 * there is no guarantee that the refcount is still 1.  If not, whoever
160 	 * puts the last reference will free the page, but they will not have to
161 	 * zap the root because a root cannot go from invalid to valid.
162 	 */
163 	if (!kvm_tdp_root_mark_invalid(root)) {
164 		refcount_set(&root->tdp_mmu_root_count, 1);
165 
166 		/*
167 		 * Zapping the root in a worker is not just "nice to have";
168 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
169 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
170 		 * not add the root to the workqueue, kvm_mmu_zap_all_fast()
171 		 * might return with some roots not zapped yet.
172 		 */
173 		tdp_mmu_schedule_zap_root(kvm, root);
174 		return;
175 	}
176 
177 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
178 	list_del_rcu(&root->link);
179 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
180 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
181 }
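
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the get/put pairing that the lifecycle comment above assumes: readers
 * take a reference with kvm_tdp_mmu_get_root(), which fails once the refcount
 * has hit zero, and drop it with kvm_tdp_mmu_put_root().  The function name
 * and the empty "work" section are assumptions for illustration only.
 */
static void __maybe_unused example_use_root(struct kvm *kvm,
					    struct kvm_mmu_page *root)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/* A refcount=0 root is already on its way out; leave it alone. */
	if (!kvm_tdp_mmu_get_root(root))
		return;

	/* ... operate on the root while holding the reference ... */

	/* The last put tears the root down (possibly via the zap worker). */
	kvm_tdp_mmu_put_root(kvm, root, true);
}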
182 
183 /*
184  * Returns the next root after @prev_root (or the first root if @prev_root is
185  * NULL).  A reference to the returned root is acquired, and the reference to
186  * @prev_root is released (the caller obviously must hold a reference to
187  * @prev_root if it's non-NULL).
188  *
189  * If @only_valid is true, invalid roots are skipped.
190  *
191  * Returns NULL if the end of tdp_mmu_roots was reached.
192  */
193 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
194 					      struct kvm_mmu_page *prev_root,
195 					      bool shared, bool only_valid)
196 {
197 	struct kvm_mmu_page *next_root;
198 
199 	rcu_read_lock();
200 
201 	if (prev_root)
202 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
203 						  &prev_root->link,
204 						  typeof(*prev_root), link);
205 	else
206 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
207 						   typeof(*next_root), link);
208 
209 	while (next_root) {
210 		if ((!only_valid || !next_root->role.invalid) &&
211 		    kvm_tdp_mmu_get_root(next_root))
212 			break;
213 
214 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
215 				&next_root->link, typeof(*next_root), link);
216 	}
217 
218 	rcu_read_unlock();
219 
220 	if (prev_root)
221 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
222 
223 	return next_root;
224 }
225 
226 /*
227  * Note: this iterator gets and puts references to the roots it iterates over.
228  * This makes it safe to release the MMU lock and yield within the loop, but
229  * if exiting the loop early, the caller must drop the reference to the most
230  * recent root. (Unless keeping a live reference is desirable.)
231  *
232  * If shared is set, this function is operating under the MMU lock in read
233  * mode. In the unlikely event that this thread must free a root, the lock
234  * will be temporarily dropped and reacquired in write mode.
235  */
236 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
237 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
238 	     _root;								\
239 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
240 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
241 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
242 		} else
243 
244 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
245 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
246 
247 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
248 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
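
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the intended shape of a caller of the yield-safe iterator above; the
 * function name and the trivial per-root "work" (counting) are assumptions.
 * Real callers, e.g. kvm_tdp_mmu_zap_leafs(), follow the same pattern with
 * actual work in the loop body.
 */
static int __maybe_unused example_count_roots(struct kvm *kvm, int as_id)
{
	struct kvm_mmu_page *root;
	int nr_roots = 0;

	/*
	 * The iterator takes a reference to each root it returns and drops
	 * the reference to the previous one, so the body is free to yield.
	 * Breaking out of the loop early would require an explicit
	 * kvm_tdp_mmu_put_root() on the current root.
	 */
	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		nr_roots++;

	return nr_roots;
}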
249 
250 /*
251  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
252  * the implication being that any flow that holds mmu_lock for read is
253  * inherently yield-friendly and should use the yield-safe variant above.
254  * Holding mmu_lock for write obviates the need for RCU protection as the list
255  * is guaranteed to be stable.
256  */
257 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
258 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
259 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
260 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
261 		} else
262 
263 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
264 {
265 	struct kvm_mmu_page *sp;
266 
267 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
268 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
269 
270 	return sp;
271 }
272 
273 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
274 			    gfn_t gfn, union kvm_mmu_page_role role)
275 {
276 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
277 
278 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
279 
280 	sp->role = role;
281 	sp->gfn = gfn;
282 	sp->ptep = sptep;
283 	sp->tdp_mmu_page = true;
284 
285 	trace_kvm_mmu_get_page(sp, true);
286 }
287 
288 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
289 				  struct tdp_iter *iter)
290 {
291 	struct kvm_mmu_page *parent_sp;
292 	union kvm_mmu_page_role role;
293 
294 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
295 
296 	role = parent_sp->role;
297 	role.level--;
298 
299 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
300 }
301 
302 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
303 {
304 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
305 	struct kvm *kvm = vcpu->kvm;
306 	struct kvm_mmu_page *root;
307 
308 	lockdep_assert_held_write(&kvm->mmu_lock);
309 
310 	/*
311 	 * Check for an existing root before allocating a new one.  Note, the
312 	 * role check prevents consuming an invalid root.
313 	 */
314 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
315 		if (root->role.word == role.word &&
316 		    kvm_tdp_mmu_get_root(root))
317 			goto out;
318 	}
319 
320 	root = tdp_mmu_alloc_sp(vcpu);
321 	tdp_mmu_init_sp(root, NULL, 0, role);
322 
323 	refcount_set(&root->tdp_mmu_root_count, 1);
324 
325 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
326 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
327 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
328 
329 out:
330 	return __pa(root->spt);
331 }
332 
333 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
334 				u64 old_spte, u64 new_spte, int level,
335 				bool shared);
336 
337 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
338 {
339 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
340 		return;
341 
342 	if (is_accessed_spte(old_spte) &&
343 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
344 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
345 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
346 }
347 
348 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
349 					  u64 old_spte, u64 new_spte, int level)
350 {
351 	bool pfn_changed;
352 	struct kvm_memory_slot *slot;
353 
354 	if (level > PG_LEVEL_4K)
355 		return;
356 
357 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
358 
359 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
360 	    is_writable_pte(new_spte)) {
361 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
362 		mark_page_dirty_in_slot(kvm, slot, gfn);
363 	}
364 }
365 
366 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
367 {
368 	kvm_account_pgtable_pages((void *)sp->spt, +1);
369 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
370 }
371 
372 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
373 {
374 	kvm_account_pgtable_pages((void *)sp->spt, -1);
375 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
376 }
377 
378 /**
379  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
380  *
381  * @kvm: kvm instance
382  * @sp: the page to be removed
383  * @shared: This operation may not be running under the exclusive use of
384  *	    the MMU lock and the operation must synchronize with other
385  *	    threads that might be adding or removing pages.
386  */
387 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
388 			      bool shared)
389 {
390 	tdp_unaccount_mmu_page(kvm, sp);
391 
392 	if (!sp->nx_huge_page_disallowed)
393 		return;
394 
395 	if (shared)
396 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
397 	else
398 		lockdep_assert_held_write(&kvm->mmu_lock);
399 
400 	sp->nx_huge_page_disallowed = false;
401 	untrack_possible_nx_huge_page(kvm, sp);
402 
403 	if (shared)
404 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
405 }
406 
407 /**
408  * handle_removed_pt() - handle a page table removed from the TDP structure
409  *
410  * @kvm: kvm instance
411  * @pt: the page removed from the paging structure
412  * @shared: This operation may not be running under the exclusive use
413  *	    of the MMU lock and the operation must synchronize with other
414  *	    threads that might be modifying SPTEs.
415  *
416  * Given a page table that has been removed from the TDP paging structure,
417  * iterates through the page table to clear SPTEs and free child page tables.
418  *
419  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
420  * protection. Since this thread removed it from the paging structure,
421  * this thread will be responsible for ensuring the page is freed. Hence the
422  * early rcu_dereferences in the function.
423  */
424 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
425 {
426 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
427 	int level = sp->role.level;
428 	gfn_t base_gfn = sp->gfn;
429 	int i;
430 
431 	trace_kvm_mmu_prepare_zap_page(sp);
432 
433 	tdp_mmu_unlink_sp(kvm, sp, shared);
434 
435 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
436 		tdp_ptep_t sptep = pt + i;
437 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
438 		u64 old_spte;
439 
440 		if (shared) {
441 			/*
442 			 * Set the SPTE to a nonpresent value that other
443 			 * threads will not overwrite. If the SPTE was
444 			 * already marked as removed then another thread
445 			 * handling a page fault could overwrite it, so
446 			 * retry the write until the SPTE transitions from
447 			 * some other value to the removed SPTE value.
448 			 */
449 			for (;;) {
450 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
451 				if (!is_removed_spte(old_spte))
452 					break;
453 				cpu_relax();
454 			}
455 		} else {
456 			/*
457 			 * If the SPTE is not MMU-present, there is no backing
458 			 * page associated with the SPTE and so no side effects
459 			 * that need to be recorded, and exclusive ownership of
460 			 * mmu_lock ensures the SPTE can't be made present.
461 			 * Note, zapping MMIO SPTEs is also unnecessary as they
462 			 * are guarded by the memslots generation, not by being
463 			 * unreachable.
464 			 */
465 			old_spte = kvm_tdp_mmu_read_spte(sptep);
466 			if (!is_shadow_present_pte(old_spte))
467 				continue;
468 
469 			/*
470 			 * Use the common helper instead of a raw WRITE_ONCE as
471 			 * the SPTE needs to be updated atomically if it can be
472 			 * modified by a different vCPU outside of mmu_lock.
473 			 * Even though the parent SPTE is !PRESENT, the TLB
474 			 * hasn't yet been flushed, and both Intel and AMD
475 			 * document that A/D assists can use upper-level PxE
476 			 * entries that are cached in the TLB, i.e. the CPU can
477 			 * still access the page and mark it dirty.
478 			 *
479 			 * No retry is needed in the atomic update path as the
480 			 * sole concern is dropping a Dirty bit, i.e. no other
481 			 * task can zap/remove the SPTE as mmu_lock is held for
482 			 * write.  Marking the SPTE as a removed SPTE is not
483 			 * strictly necessary for the same reason, but using
484 			 * the removed SPTE value keeps the shared/exclusive
485 			 * paths consistent and allows the handle_changed_spte()
486 			 * call below to hardcode the new value to REMOVED_SPTE.
487 			 *
488 			 * Note, even though dropping a Dirty bit is the only
489 			 * scenario where a non-atomic update could result in a
490 			 * functional bug, simply checking the Dirty bit isn't
491 			 * sufficient as a fast page fault could read the upper
492 			 * level SPTE before it is zapped, and then make this
493 			 * target SPTE writable, resume the guest, and set the
494 			 * Dirty bit between reading the SPTE above and writing
495 			 * it here.
496 			 */
497 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
498 							  REMOVED_SPTE, level);
499 		}
500 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
501 				    old_spte, REMOVED_SPTE, level, shared);
502 	}
503 
504 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
505 }
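
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * spells out the GFN arithmetic used in handle_removed_pt() above: entry i of
 * a shadow page at @level covers KVM_PAGES_PER_HPAGE(level) GFNs, e.g. entry
 * 3 of a level-2 (2MiB granule) page table starts at base_gfn + 3 * 512.  The
 * helper name is an assumption for illustration only.
 */
static gfn_t __maybe_unused example_child_base_gfn(struct kvm_mmu_page *sp,
						   int i)
{
	return sp->gfn + i * KVM_PAGES_PER_HPAGE(sp->role.level);
}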
506 
507 /**
508  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
509  * @kvm: kvm instance
510  * @as_id: the address space of the paging structure the SPTE was a part of
511  * @gfn: the base GFN that was mapped by the SPTE
512  * @old_spte: The value of the SPTE before the change
513  * @new_spte: The value of the SPTE after the change
514  * @level: the level of the PT the SPTE is part of in the paging structure
515  * @shared: This operation may not be running under the exclusive use of
516  *	    the MMU lock and the operation must synchronize with other
517  *	    threads that might be modifying SPTEs.
518  *
519  * Handle bookkeeping that might result from the modification of a SPTE.
520  */
521 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
522 				  u64 old_spte, u64 new_spte, int level,
523 				  bool shared)
524 {
525 	bool was_present = is_shadow_present_pte(old_spte);
526 	bool is_present = is_shadow_present_pte(new_spte);
527 	bool was_leaf = was_present && is_last_spte(old_spte, level);
528 	bool is_leaf = is_present && is_last_spte(new_spte, level);
529 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
530 
531 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
532 	WARN_ON(level < PG_LEVEL_4K);
533 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
534 
535 	/*
536 	 * If this warning were to trigger it would indicate that there was a
537 	 * missing MMU notifier or a race with some notifier handler.
538 	 * A present, leaf SPTE should never be directly replaced with another
539 	 * present leaf SPTE pointing to a different PFN. A notifier handler
540 	 * should be zapping the SPTE before the main MM's page table is
541 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
542 	 * thread before replacement.
543 	 */
544 	if (was_leaf && is_leaf && pfn_changed) {
545 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
546 		       "SPTE with another present leaf SPTE mapping a\n"
547 		       "different PFN!\n"
548 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
549 		       as_id, gfn, old_spte, new_spte, level);
550 
551 		/*
552 		 * Crash the host to prevent error propagation and guest data
553 		 * corruption.
554 		 */
555 		BUG();
556 	}
557 
558 	if (old_spte == new_spte)
559 		return;
560 
561 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
562 
563 	if (is_leaf)
564 		check_spte_writable_invariants(new_spte);
565 
566 	/*
567 	 * The only times a SPTE should be changed from a non-present to
568 	 * non-present state is when an MMIO entry is installed/modified/
569 	 * removed. In that case, there is nothing to do here.
570 	 */
571 	if (!was_present && !is_present) {
572 		/*
573 		 * If this change does not involve a MMIO SPTE or removed SPTE,
574 		 * it is unexpected. Log the change, though it should not
575 		 * impact the guest since both the former and current SPTEs
576 		 * are nonpresent.
577 		 */
578 		if (WARN_ON(!is_mmio_spte(old_spte) &&
579 			    !is_mmio_spte(new_spte) &&
580 			    !is_removed_spte(new_spte)))
581 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
582 			       "should not be replaced with another,\n"
583 			       "different nonpresent SPTE, unless one or both\n"
584 			       "are MMIO SPTEs, or the new SPTE is\n"
585 			       "a temporary removed SPTE.\n"
586 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
587 			       as_id, gfn, old_spte, new_spte, level);
588 		return;
589 	}
590 
591 	if (is_leaf != was_leaf)
592 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
593 
594 	if (was_leaf && is_dirty_spte(old_spte) &&
595 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
596 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
597 
598 	/*
599 	 * Recursively handle child PTs if the change removed a subtree from
600 	 * the paging structure.  Note the WARN on the PFN changing without the
601 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
602 	 * pages are kernel allocations and should never be migrated.
603 	 */
604 	if (was_present && !was_leaf &&
605 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
606 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
607 }
608 
609 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
610 				u64 old_spte, u64 new_spte, int level,
611 				bool shared)
612 {
613 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
614 			      shared);
615 	handle_changed_spte_acc_track(old_spte, new_spte, level);
616 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
617 				      new_spte, level);
618 }
619 
620 /*
621  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
622  * and handle the associated bookkeeping.  Do not mark the page dirty
623  * in KVM's dirty bitmaps.
624  *
625  * If setting the SPTE fails because it has changed, iter->old_spte will be
626  * refreshed to the current value of the spte.
627  *
628  * @kvm: kvm instance
629  * @iter: a tdp_iter instance currently on the SPTE that should be set
630  * @new_spte: The value the SPTE should be set to
631  * Return:
632  * * 0      - If the SPTE was set.
633  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
634  *            no side-effects other than setting iter->old_spte to the last
635  *            known value of the spte.
636  */
637 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
638 					  struct tdp_iter *iter,
639 					  u64 new_spte)
640 {
641 	u64 *sptep = rcu_dereference(iter->sptep);
642 
643 	/*
644 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
645 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
646 	 * and pre-checking before inserting a new SPTE is advantageous as it
647 	 * avoids unnecessary work.
648 	 */
649 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
650 
651 	lockdep_assert_held_read(&kvm->mmu_lock);
652 
653 	/*
654 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
655 	 * does not hold the mmu_lock.
656 	 */
657 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
658 		return -EBUSY;
659 
660 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
661 			      new_spte, iter->level, true);
662 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
663 
664 	return 0;
665 }
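
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows how callers typically use tdp_mmu_set_spte_atomic(): re-check the
 * SPTE, attempt the cmpxchg, and on -EBUSY rely on iter->old_spte having been
 * refreshed so the operation can be recomputed and retried (real callers in
 * this file "goto retry").  The function name and the specific bit being
 * cleared are assumptions for illustration only.
 */
static bool __maybe_unused example_clear_writable(struct kvm *kvm,
						  struct tdp_iter *iter)
{
	/* Give up if another task zapped or froze the SPTE in the meantime. */
	if (!is_shadow_present_pte(iter->old_spte) ||
	    is_removed_spte(iter->old_spte))
		return false;

	/* Returns true on success; on failure the caller may retry. */
	return !tdp_mmu_set_spte_atomic(kvm, iter,
					iter->old_spte & ~PT_WRITABLE_MASK);
}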
666 
667 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
668 					  struct tdp_iter *iter)
669 {
670 	int ret;
671 
672 	/*
673 	 * Freeze the SPTE by setting it to a special,
674 	 * non-present value. This will stop other threads from
675 	 * immediately installing a present entry in its place
676 	 * before the TLBs are flushed.
677 	 */
678 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
679 	if (ret)
680 		return ret;
681 
682 	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
683 
684 	/*
685 	 * No other thread can overwrite the removed SPTE as they must either
686 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
687 	 * overwrite the special removed SPTE value. No bookkeeping is needed
688 	 * here since the SPTE is going from non-present to non-present.  Use
689 	 * the raw write helper to avoid an unnecessary check on volatile bits.
690 	 */
691 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
692 
693 	return 0;
694 }
695 
696 
697 /*
698  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
699  * @kvm:	      KVM instance
700  * @as_id:	      Address space ID, i.e. regular vs. SMM
701  * @sptep:	      Pointer to the SPTE
702  * @old_spte:	      The current value of the SPTE
703  * @new_spte:	      The new value that will be set for the SPTE
704  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
705  * @level:	      The level _containing_ the SPTE (its parent PT's level)
706  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
707  *		      of the page. Should be set unless handling an MMU
708  *		      notifier for access tracking. Leaving record_acc_track
709  *		      unset in that case prevents page accesses from being
710  *		      double counted.
711  *
712  * Returns the old SPTE value, which _may_ be different than @old_spte if the
713  * SPTE had volatile bits.
714  */
715 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
716 			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
717 			      bool record_acc_track)
718 {
719 	lockdep_assert_held_write(&kvm->mmu_lock);
720 
721 	/*
722 	 * No thread should be using this function to set SPTEs to or from the
723 	 * temporary removed SPTE value.
724 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
725 	 * should be used. If operating under the MMU lock in write mode, the
726 	 * use of the removed SPTE should not be necessary.
727 	 */
728 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
729 
730 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
731 
732 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
733 
734 	if (record_acc_track)
735 		handle_changed_spte_acc_track(old_spte, new_spte, level);
736 
737 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte,
738 				      level);
739 	return old_spte;
740 }
741 
742 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
743 				     u64 new_spte, bool record_acc_track)
744 {
745 	WARN_ON_ONCE(iter->yielded);
746 
747 	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
748 					    iter->old_spte, new_spte,
749 					    iter->gfn, iter->level,
750 					    record_acc_track);
751 }
752 
753 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
754 				    u64 new_spte)
755 {
756 	_tdp_mmu_set_spte(kvm, iter, new_spte, true);
757 }
758 
759 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
760 						 struct tdp_iter *iter,
761 						 u64 new_spte)
762 {
763 	_tdp_mmu_set_spte(kvm, iter, new_spte, false);
764 }
765 
766 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
767 	for_each_tdp_pte(_iter, _root, _start, _end)
768 
769 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
770 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
771 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
772 		    !is_last_spte(_iter.old_spte, _iter.level))		\
773 			continue;					\
774 		else
775 
776 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
777 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
778 
779 /*
780  * Yield if the MMU lock is contended or this thread needs to return control
781  * to the scheduler.
782  *
783  * If this function should yield and flush is set, it will perform a remote
784  * TLB flush before yielding.
785  *
786  * If this function yields, iter->yielded is set and the caller must skip to
787  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
788  * over the paging structures to allow the iterator to continue its traversal
789  * from the paging structure root.
790  *
791  * Returns true if this function yielded.
792  */
793 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
794 							  struct tdp_iter *iter,
795 							  bool flush, bool shared)
796 {
797 	WARN_ON(iter->yielded);
798 
799 	/* Ensure forward progress has been made before yielding. */
800 	if (iter->next_last_level_gfn == iter->yielded_gfn)
801 		return false;
802 
803 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
804 		if (flush)
805 			kvm_flush_remote_tlbs(kvm);
806 
807 		rcu_read_unlock();
808 
809 		if (shared)
810 			cond_resched_rwlock_read(&kvm->mmu_lock);
811 		else
812 			cond_resched_rwlock_write(&kvm->mmu_lock);
813 
814 		rcu_read_lock();
815 
816 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
817 
818 		iter->yielded = true;
819 	}
820 
821 	return iter->yielded;
822 }
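
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the canonical caller pattern for the helper above: test it at the top
 * of every loop iteration and "continue" when it yields, letting the iterator
 * restart its walk from the root.  The function name and the empty per-SPTE
 * body are assumptions for illustration only.
 */
static void __maybe_unused example_walk_with_yield(struct kvm *kvm,
						   struct kvm_mmu_page *root,
						   gfn_t start, gfn_t end)
{
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		/* Yield (and skip this iteration) if mmu_lock is contended. */
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
			continue;

		/* ... per-SPTE work would go here ... */
	}

	rcu_read_unlock();
}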
823 
824 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
825 {
826 	/*
827 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
828 	 * a gpa range that would exceed the max gfn, and KVM does not create
829 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
830 	 * the slow emulation path every time.
831 	 */
832 	return kvm_mmu_max_gfn() + 1;
833 }
834 
835 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
836 			       bool shared, int zap_level)
837 {
838 	struct tdp_iter iter;
839 
840 	gfn_t end = tdp_mmu_max_gfn_exclusive();
841 	gfn_t start = 0;
842 
843 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
844 retry:
845 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
846 			continue;
847 
848 		if (!is_shadow_present_pte(iter.old_spte))
849 			continue;
850 
851 		if (iter.level > zap_level)
852 			continue;
853 
854 		if (!shared)
855 			tdp_mmu_set_spte(kvm, &iter, 0);
856 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
857 			goto retry;
858 	}
859 }
860 
861 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
862 			     bool shared)
863 {
864 
865 	/*
866 	 * The root must have an elevated refcount so that it's reachable via
867 	 * mmu_notifier callbacks, which allows this path to yield and drop
868 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
869 	 * must drop all references to relevant pages prior to completing the
870 	 * callback.  Dropping mmu_lock with an unreachable root would result
871 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
872 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
873 	 * dirty accessed bits to the SPTE's associated struct page.
874 	 */
875 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
876 
877 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
878 
879 	rcu_read_lock();
880 
881 	/*
882 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
883 	 * split the zap into two passes.  On the first pass, zap at the 1gb
884 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
885 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
886 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
887 	 *
888 	 * Because zapping a SP recurses on its children, stepping down to
889 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
890 	 */
891 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
892 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
893 
894 	rcu_read_unlock();
895 }
896 
897 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
898 {
899 	u64 old_spte;
900 
901 	/*
902 	 * This helper intentionally doesn't allow zapping a root shadow page,
903 	 * which doesn't have a parent page table and thus no associated entry.
904 	 */
905 	if (WARN_ON_ONCE(!sp->ptep))
906 		return false;
907 
908 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
909 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
910 		return false;
911 
912 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
913 			   sp->gfn, sp->role.level + 1, true);
914 
915 	return true;
916 }
917 
918 /*
919  * If can_yield is true, will release the MMU lock and reschedule if the
920  * scheduler needs the CPU or there is contention on the MMU lock. If this
921  * function cannot yield, it will not release the MMU lock or reschedule and
922  * the caller must ensure it does not supply too large a GFN range, or the
923  * operation can cause a soft lockup.
924  */
925 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
926 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
927 {
928 	struct tdp_iter iter;
929 
930 	end = min(end, tdp_mmu_max_gfn_exclusive());
931 
932 	lockdep_assert_held_write(&kvm->mmu_lock);
933 
934 	rcu_read_lock();
935 
936 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
937 		if (can_yield &&
938 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
939 			flush = false;
940 			continue;
941 		}
942 
943 		if (!is_shadow_present_pte(iter.old_spte) ||
944 		    !is_last_spte(iter.old_spte, iter.level))
945 			continue;
946 
947 		tdp_mmu_set_spte(kvm, &iter, 0);
948 		flush = true;
949 	}
950 
951 	rcu_read_unlock();
952 
953 	/*
954 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
955 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
956 	 */
957 	return flush;
958 }
959 
960 /*
961  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
962  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
963  * more SPTEs were zapped since the MMU lock was last acquired.
964  */
965 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
966 			   bool can_yield, bool flush)
967 {
968 	struct kvm_mmu_page *root;
969 
970 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
971 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
972 
973 	return flush;
974 }
975 
976 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
977 {
978 	struct kvm_mmu_page *root;
979 	int i;
980 
981 	/*
982 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
983 	 * before returning to the caller.  Zap directly even if the root is
984 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
985 	 * all that expensive and mmu_lock is already held, which means the
986 	 * worker has yielded, i.e. flushing the work instead of zapping here
987 	 * isn't guaranteed to be any faster.
988 	 *
989 	 * A TLB flush is unnecessary; KVM zaps everything if and only if the VM
990 	 * is being destroyed or the userspace VMM has exited.  In both cases,
991 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
992 	 */
993 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
994 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
995 			tdp_mmu_zap_root(kvm, root, false);
996 	}
997 }
998 
999 /*
1000  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1001  * zap" completes.
1002  */
1003 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1004 {
1005 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1006 }
1007 
1008 /*
1009  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1010  * is about to be zapped, e.g. in response to a memslots update.  The actual
1011  * zapping is performed asynchronously, so a reference is taken on all roots.
1012  * Using a separate workqueue makes it easy to ensure that the destruction is
1013  * performed before the "fast zap" completes, without keeping a separate list
1014  * of invalidated roots; the list is effectively the list of work items in
1015  * the workqueue.
1016  *
1017  * Get a reference even if the root is already invalid, the asynchronous worker
1018  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1019  * is held for write, it should be impossible to observe a root with zero refcount,
1020  * i.e. the list of roots cannot be stale.
1021  *
1022  * This has essentially the same effect for the TDP MMU
1023  * as updating mmu_valid_gen does for the shadow MMU.
1024  */
1025 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1026 {
1027 	struct kvm_mmu_page *root;
1028 
1029 	lockdep_assert_held_write(&kvm->mmu_lock);
1030 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1031 		if (!root->role.invalid &&
1032 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1033 			root->role.invalid = true;
1034 			tdp_mmu_schedule_zap_root(kvm, root);
1035 		}
1036 	}
1037 }
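
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the "fast zap" sequence the two helpers above are built for: mark all
 * roots invalid while holding mmu_lock for write, then wait for the queued
 * per-root zap workers outside the lock.  The function name is an assumption;
 * the real caller of this sequence lives in mmu.c.
 */
static void __maybe_unused example_fast_zap_all_roots(struct kvm *kvm)
{
	write_lock(&kvm->mmu_lock);
	/* vCPUs that reload their MMU from here on get a brand new root. */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	write_unlock(&kvm->mmu_lock);

	/* Ensure every invalidated root is fully zapped before returning. */
	kvm_tdp_mmu_zap_invalidated_roots(kvm);
}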
1038 
1039 /*
1040  * Installs a last-level SPTE to handle a TDP page fault.
1041  * (NPT/EPT violation/misconfiguration)
1042  */
1043 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1044 					  struct kvm_page_fault *fault,
1045 					  struct tdp_iter *iter)
1046 {
1047 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1048 	u64 new_spte;
1049 	int ret = RET_PF_FIXED;
1050 	bool wrprot = false;
1051 
1052 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1053 		return RET_PF_RETRY;
1054 
1055 	if (unlikely(!fault->slot))
1056 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1057 	else
1058 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1059 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1060 					 fault->map_writable, &new_spte);
1061 
1062 	if (new_spte == iter->old_spte)
1063 		ret = RET_PF_SPURIOUS;
1064 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1065 		return RET_PF_RETRY;
1066 	else if (is_shadow_present_pte(iter->old_spte) &&
1067 		 !is_last_spte(iter->old_spte, iter->level))
1068 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1069 
1070 	/*
1071 	 * If the page fault was caused by a write but the page is write
1072 	 * protected, emulation is needed. If the emulation was skipped,
1073 	 * the vCPU would have the same fault again.
1074 	 */
1075 	if (wrprot) {
1076 		if (fault->write)
1077 			ret = RET_PF_EMULATE;
1078 	}
1079 
1080 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1081 	if (unlikely(is_mmio_spte(new_spte))) {
1082 		vcpu->stat.pf_mmio_spte_created++;
1083 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1084 				     new_spte);
1085 		ret = RET_PF_EMULATE;
1086 	} else {
1087 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1088 				       rcu_dereference(iter->sptep));
1089 	}
1090 
1091 	return ret;
1092 }
1093 
1094 /*
1095  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1096  * provided page table.
1097  *
1098  * @kvm: kvm instance
1099  * @iter: a tdp_iter instance currently on the SPTE that should be set
1100  * @sp: The new TDP page table to install.
1101  * @shared: This operation is running under the MMU lock in read mode.
1102  *
1103  * Returns: 0 if the new page table was installed. Non-0 if the page table
1104  *          could not be installed (e.g. the atomic compare-exchange failed).
1105  */
1106 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1107 			   struct kvm_mmu_page *sp, bool shared)
1108 {
1109 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1110 	int ret = 0;
1111 
1112 	if (shared) {
1113 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1114 		if (ret)
1115 			return ret;
1116 	} else {
1117 		tdp_mmu_set_spte(kvm, iter, spte);
1118 	}
1119 
1120 	tdp_account_mmu_page(kvm, sp);
1121 
1122 	return 0;
1123 }
1124 
1125 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1126 				   struct kvm_mmu_page *sp, bool shared);
1127 
1128 /*
1129  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1130  * page tables and SPTEs to translate the faulting guest physical address.
1131  */
1132 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1133 {
1134 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1135 	struct kvm *kvm = vcpu->kvm;
1136 	struct tdp_iter iter;
1137 	struct kvm_mmu_page *sp;
1138 	int ret = RET_PF_RETRY;
1139 
1140 	kvm_mmu_hugepage_adjust(vcpu, fault);
1141 
1142 	trace_kvm_mmu_spte_requested(fault);
1143 
1144 	rcu_read_lock();
1145 
1146 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1147 		int r;
1148 
1149 		if (fault->nx_huge_page_workaround_enabled)
1150 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1151 
1152 		/*
1153 		 * If SPTE has been frozen by another thread, just give up and
1154 		 * retry, avoiding unnecessary page table allocation and free.
1155 		 */
1156 		if (is_removed_spte(iter.old_spte))
1157 			goto retry;
1158 
1159 		if (iter.level == fault->goal_level)
1160 			goto map_target_level;
1161 
1162 		/* Step down into the lower level page table if it exists. */
1163 		if (is_shadow_present_pte(iter.old_spte) &&
1164 		    !is_large_pte(iter.old_spte))
1165 			continue;
1166 
1167 		/*
1168 		 * The SPTE is either non-present or points to a huge page that
1169 		 * needs to be split.
1170 		 */
1171 		sp = tdp_mmu_alloc_sp(vcpu);
1172 		tdp_mmu_init_child_sp(sp, &iter);
1173 
1174 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1175 
1176 		if (is_shadow_present_pte(iter.old_spte))
1177 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1178 		else
1179 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1180 
1181 		/*
1182 		 * Force the guest to retry if installing an upper level SPTE
1183 		 * failed, e.g. because a different task modified the SPTE.
1184 		 */
1185 		if (r) {
1186 			tdp_mmu_free_sp(sp);
1187 			goto retry;
1188 		}
1189 
1190 		if (fault->huge_page_disallowed &&
1191 		    fault->req_level >= iter.level) {
1192 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1193 			if (sp->nx_huge_page_disallowed)
1194 				track_possible_nx_huge_page(kvm, sp);
1195 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1196 		}
1197 	}
1198 
1199 	/*
1200 	 * The walk aborted before reaching the target level, e.g. because the
1201 	 * iterator detected an upper level SPTE was frozen during traversal.
1202 	 */
1203 	WARN_ON_ONCE(iter.level == fault->goal_level);
1204 	goto retry;
1205 
1206 map_target_level:
1207 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1208 
1209 retry:
1210 	rcu_read_unlock();
1211 	return ret;
1212 }
1213 
1214 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1215 				 bool flush)
1216 {
1217 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1218 				     range->end, range->may_block, flush);
1219 }
1220 
1221 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1222 			      struct kvm_gfn_range *range);
1223 
1224 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1225 						   struct kvm_gfn_range *range,
1226 						   tdp_handler_t handler)
1227 {
1228 	struct kvm_mmu_page *root;
1229 	struct tdp_iter iter;
1230 	bool ret = false;
1231 
1232 	/*
1233 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1234 	 * into this helper allow blocking; it'd be dead, wasteful code.
1235 	 */
1236 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1237 		rcu_read_lock();
1238 
1239 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1240 			ret |= handler(kvm, &iter, range);
1241 
1242 		rcu_read_unlock();
1243 	}
1244 
1245 	return ret;
1246 }
1247 
1248 /*
1249  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1250  * if any of the GFNs in the range have been accessed.
1251  */
1252 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1253 			  struct kvm_gfn_range *range)
1254 {
1255 	u64 new_spte = 0;
1256 
1257 	/* If we have a non-accessed entry we don't need to change the pte. */
1258 	if (!is_accessed_spte(iter->old_spte))
1259 		return false;
1260 
1261 	new_spte = iter->old_spte;
1262 
1263 	if (spte_ad_enabled(new_spte)) {
1264 		new_spte &= ~shadow_accessed_mask;
1265 	} else {
1266 		/*
1267 		 * Capture the dirty status of the page, so that it doesn't get
1268 		 * lost when the SPTE is marked for access tracking.
1269 		 */
1270 		if (is_writable_pte(new_spte))
1271 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1272 
1273 		new_spte = mark_spte_for_access_track(new_spte);
1274 	}
1275 
1276 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1277 
1278 	return true;
1279 }
1280 
1281 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1282 {
1283 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1284 }
1285 
1286 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1287 			 struct kvm_gfn_range *range)
1288 {
1289 	return is_accessed_spte(iter->old_spte);
1290 }
1291 
1292 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1293 {
1294 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1295 }
1296 
1297 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1298 			 struct kvm_gfn_range *range)
1299 {
1300 	u64 new_spte;
1301 
1302 	/* Huge pages aren't expected to be modified without first being zapped. */
1303 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1304 
1305 	if (iter->level != PG_LEVEL_4K ||
1306 	    !is_shadow_present_pte(iter->old_spte))
1307 		return false;
1308 
1309 	/*
1310 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1311 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1312 	 * invariant that the PFN of a present leaf SPTE can never change.
1313 	 * See __handle_changed_spte().
1314 	 */
1315 	tdp_mmu_set_spte(kvm, iter, 0);
1316 
1317 	if (!pte_write(range->pte)) {
1318 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1319 								  pte_pfn(range->pte));
1320 
1321 		tdp_mmu_set_spte(kvm, iter, new_spte);
1322 	}
1323 
1324 	return true;
1325 }
1326 
1327 /*
1328  * Handle the changed_pte MMU notifier for the TDP MMU.
1329  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1330  * notifier.
1331  * Returns non-zero if a flush is needed before releasing the MMU lock.
1332  */
1333 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1334 {
1335 	/*
1336 	 * No need to handle the remote TLB flush under RCU protection, the
1337 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1338 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1339 	 */
1340 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1341 }
1342 
1343 /*
1344  * Remove write access from all SPTEs at or above min_level that map GFNs
1345  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1346  * be flushed.
1347  */
1348 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1349 			     gfn_t start, gfn_t end, int min_level)
1350 {
1351 	struct tdp_iter iter;
1352 	u64 new_spte;
1353 	bool spte_set = false;
1354 
1355 	rcu_read_lock();
1356 
1357 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1358 
1359 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1360 retry:
1361 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1362 			continue;
1363 
1364 		if (!is_shadow_present_pte(iter.old_spte) ||
1365 		    !is_last_spte(iter.old_spte, iter.level) ||
1366 		    !(iter.old_spte & PT_WRITABLE_MASK))
1367 			continue;
1368 
1369 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1370 
1371 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1372 			goto retry;
1373 
1374 		spte_set = true;
1375 	}
1376 
1377 	rcu_read_unlock();
1378 	return spte_set;
1379 }
1380 
1381 /*
1382  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1383  * only affect leaf SPTEs down to min_level.
1384  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1385  */
1386 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1387 			     const struct kvm_memory_slot *slot, int min_level)
1388 {
1389 	struct kvm_mmu_page *root;
1390 	bool spte_set = false;
1391 
1392 	lockdep_assert_held_read(&kvm->mmu_lock);
1393 
1394 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1395 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1396 			     slot->base_gfn + slot->npages, min_level);
1397 
1398 	return spte_set;
1399 }
1400 
1401 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1402 {
1403 	struct kvm_mmu_page *sp;
1404 
1405 	gfp |= __GFP_ZERO;
1406 
1407 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1408 	if (!sp)
1409 		return NULL;
1410 
1411 	sp->spt = (void *)__get_free_page(gfp);
1412 	if (!sp->spt) {
1413 		kmem_cache_free(mmu_page_header_cache, sp);
1414 		return NULL;
1415 	}
1416 
1417 	return sp;
1418 }
1419 
1420 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1421 						       struct tdp_iter *iter,
1422 						       bool shared)
1423 {
1424 	struct kvm_mmu_page *sp;
1425 
1426 	/*
1427 	 * Since we are allocating while under the MMU lock we have to be
1428 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1429 	 * reclaim and to avoid making any filesystem callbacks (which can end
1430 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1431 	 *
1432 	 * If this allocation fails we drop the lock and retry with reclaim
1433 	 * allowed.
1434 	 */
1435 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1436 	if (sp)
1437 		return sp;
1438 
1439 	rcu_read_unlock();
1440 
1441 	if (shared)
1442 		read_unlock(&kvm->mmu_lock);
1443 	else
1444 		write_unlock(&kvm->mmu_lock);
1445 
1446 	iter->yielded = true;
1447 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1448 
1449 	if (shared)
1450 		read_lock(&kvm->mmu_lock);
1451 	else
1452 		write_lock(&kvm->mmu_lock);
1453 
1454 	rcu_read_lock();
1455 
1456 	return sp;
1457 }
1458 
1459 /* Note, the caller is responsible for initializing @sp. */
1460 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1461 				   struct kvm_mmu_page *sp, bool shared)
1462 {
1463 	const u64 huge_spte = iter->old_spte;
1464 	const int level = iter->level;
1465 	int ret, i;
1466 
1467 	/*
1468 	 * No need for atomics when writing to sp->spt since the page table has
1469 	 * not been linked in yet and thus is not reachable from any other CPU.
1470 	 */
1471 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1472 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1473 
1474 	/*
1475 	 * Replace the huge spte with a pointer to the populated lower level
1476 	 * page table. Since we are making this change without a TLB flush vCPUs
1477 	 * will see a mix of the split mappings and the original huge mapping,
1478 	 * depending on what's currently in their TLB. This is fine from a
1479 	 * correctness standpoint since the translation will be the same either
1480 	 * way.
1481 	 */
1482 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1483 	if (ret)
1484 		goto out;
1485 
1486 	/*
1487 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1488 	 * are overwriting from the page stats. But we have to manually update
1489 	 * the page stats with the new present child pages.
1490 	 */
1491 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1492 
1493 out:
1494 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1495 	return ret;
1496 }
1497 
1498 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1499 					 struct kvm_mmu_page *root,
1500 					 gfn_t start, gfn_t end,
1501 					 int target_level, bool shared)
1502 {
1503 	struct kvm_mmu_page *sp = NULL;
1504 	struct tdp_iter iter;
1505 	int ret = 0;
1506 
1507 	rcu_read_lock();
1508 
1509 	/*
1510 	 * Traverse the page table splitting all huge pages above the target
1511 	 * level into one lower level. For example, if we encounter a 1GB page
1512 	 * we split it into 512 2MB pages.
1513 	 *
1514 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1515 	 * to visit an SPTE before ever visiting its children, which means we
1516 	 * will correctly recursively split huge pages that are more than one
1517 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1518 	 * and then splitting each of those to 512 4KB pages).
1519 	 */
1520 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1521 retry:
1522 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1523 			continue;
1524 
1525 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1526 			continue;
1527 
1528 		if (!sp) {
1529 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1530 			if (!sp) {
1531 				ret = -ENOMEM;
1532 				trace_kvm_mmu_split_huge_page(iter.gfn,
1533 							      iter.old_spte,
1534 							      iter.level, ret);
1535 				break;
1536 			}
1537 
1538 			if (iter.yielded)
1539 				continue;
1540 		}
1541 
1542 		tdp_mmu_init_child_sp(sp, &iter);
1543 
1544 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1545 			goto retry;
1546 
1547 		sp = NULL;
1548 	}
1549 
1550 	rcu_read_unlock();
1551 
1552 	/*
1553 	 * It's possible to exit the loop having never used the last sp if, for
1554 	 * example, a vCPU doing HugePage NX splitting wins the race and
1555 	 * installs its own sp in place of the last sp we tried to split.
1556 	 */
1557 	if (sp)
1558 		tdp_mmu_free_sp(sp);
1559 
1560 	return ret;
1561 }
1562 
1563 
1564 /*
1565  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1566  */
1567 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1568 				      const struct kvm_memory_slot *slot,
1569 				      gfn_t start, gfn_t end,
1570 				      int target_level, bool shared)
1571 {
1572 	struct kvm_mmu_page *root;
1573 	int r = 0;
1574 
1575 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1576 
1577 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1578 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1579 		if (r) {
1580 			kvm_tdp_mmu_put_root(kvm, root, shared);
1581 			break;
1582 		}
1583 	}
1584 }
1585 
1586 /*
1587  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1588  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1589  * If AD bits are not enabled, this will require clearing the writable bit on
1590  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1591  * be flushed.
1592  */
1593 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1594 			   gfn_t start, gfn_t end)
1595 {
1596 	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1597 	struct tdp_iter iter;
1598 	bool spte_set = false;
1599 
1600 	rcu_read_lock();
1601 
1602 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1603 retry:
1604 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1605 			continue;
1606 
1607 		if (!is_shadow_present_pte(iter.old_spte))
1608 			continue;
1609 
1610 		MMU_WARN_ON(kvm_ad_enabled() &&
1611 			    spte_ad_need_write_protect(iter.old_spte));
1612 
1613 		if (!(iter.old_spte & dbit))
1614 			continue;
1615 
1616 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1617 			goto retry;
1618 
1619 		spte_set = true;
1620 	}
1621 
1622 	rcu_read_unlock();
1623 	return spte_set;
1624 }
1625 
1626 /*
1627  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1628  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1629  * If AD bits are not enabled, this will require clearing the writable bit on
1630  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1631  * be flushed.
1632  */
1633 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1634 				  const struct kvm_memory_slot *slot)
1635 {
1636 	struct kvm_mmu_page *root;
1637 	bool spte_set = false;
1638 
1639 	lockdep_assert_held_read(&kvm->mmu_lock);
1640 
1641 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1642 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1643 				slot->base_gfn + slot->npages);
1644 
1645 	return spte_set;
1646 }
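
/*
 * A minimal standalone userspace C sketch of the bit selection used by
 * clear_dirty_gfn_range() above: with A/D bits enabled the dirty bit is
 * cleared, otherwise the writable bit is cleared so the next guest write
 * faults and can be logged.  The bit positions and clear_dirty_bit() are
 * simplified stand-ins, not KVM's actual masks or helpers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_WRITABLE_BIT	(1ULL << 1)	/* stand-in for PT_WRITABLE_MASK */
#define EXAMPLE_DIRTY_BIT	(1ULL << 6)	/* stand-in for shadow_dirty_mask */

/* Returns true if the SPTE changed, i.e. a TLB flush would be needed. */
static bool clear_dirty_bit(uint64_t *spte, bool ad_enabled)
{
	uint64_t dbit = ad_enabled ? EXAMPLE_DIRTY_BIT : EXAMPLE_WRITABLE_BIT;

	if (!(*spte & dbit))
		return false;

	*spte &= ~dbit;
	return true;
}

int main(void)
{
	uint64_t spte = EXAMPLE_WRITABLE_BIT | EXAMPLE_DIRTY_BIT;

	printf("flush needed: %d, spte now 0x%llx\n",
	       clear_dirty_bit(&spte, true), (unsigned long long)spte);
	return 0;
}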
1647 
1648 /*
1649  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1650  * set in mask, starting at gfn. The given memslot is expected to contain all
1651  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1652  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1653  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1654  */
1655 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1656 				  gfn_t gfn, unsigned long mask, bool wrprot)
1657 {
1658 	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1659 						   shadow_dirty_mask;
1660 	struct tdp_iter iter;
1661 
1662 	rcu_read_lock();
1663 
1664 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1665 				    gfn + BITS_PER_LONG) {
1666 		if (!mask)
1667 			break;
1668 
1669 		MMU_WARN_ON(kvm_ad_enabled() &&
1670 			    spte_ad_need_write_protect(iter.old_spte));
1671 
1672 		if (iter.level > PG_LEVEL_4K ||
1673 		    !(mask & (1UL << (iter.gfn - gfn))))
1674 			continue;
1675 
1676 		mask &= ~(1UL << (iter.gfn - gfn));
1677 
1678 		if (!(iter.old_spte & dbit))
1679 			continue;
1680 
1681 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1682 							iter.old_spte, dbit,
1683 							iter.level);
1684 
1685 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1686 					       iter.old_spte,
1687 					       iter.old_spte & ~dbit);
1688 		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1689 	}
1690 
1691 	rcu_read_unlock();
1692 }
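
/*
 * A minimal standalone userspace C sketch of how the dirty-log mask in
 * clear_dirty_pt_masked() above maps onto GFNs: bit N of 'mask' corresponds
 * to 'gfn + N', the walk starts at the first set bit, and each visited GFN
 * clears its bit so the loop can stop once 'mask' is empty.  As in the
 * caller's contract, 'mask' is assumed to be non-zero on entry.
 * print_masked_gfns() is an illustrative name, not a KVM helper.
 */
#include <stdio.h>

static void print_masked_gfns(unsigned long long gfn, unsigned long mask)
{
	unsigned long long cur;

	for (cur = gfn + __builtin_ctzl(mask); mask; cur++) {
		if (!(mask & (1UL << (cur - gfn))))
			continue;

		mask &= ~(1UL << (cur - gfn));	/* consume this GFN's bit */
		printf("would clear dirty state for gfn 0x%llx\n", cur);
	}
}

int main(void)
{
	/* Bits 3 and 5 set: GFNs base+3 and base+5 are dirty. */
	print_masked_gfns(0x1000, (1UL << 3) | (1UL << 5));
	return 0;
}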
1693 
1694 /*
1695  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1696  * set in mask, starting at gfn. The given memslot is expected to contain all
1697  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1698  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1699  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1700  */
1701 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1702 				       struct kvm_memory_slot *slot,
1703 				       gfn_t gfn, unsigned long mask,
1704 				       bool wrprot)
1705 {
1706 	struct kvm_mmu_page *root;
1707 
1708 	lockdep_assert_held_write(&kvm->mmu_lock);
1709 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1710 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1711 }
1712 
1713 static void zap_collapsible_spte_range(struct kvm *kvm,
1714 				       struct kvm_mmu_page *root,
1715 				       const struct kvm_memory_slot *slot)
1716 {
1717 	gfn_t start = slot->base_gfn;
1718 	gfn_t end = start + slot->npages;
1719 	struct tdp_iter iter;
1720 	int max_mapping_level;
1721 
1722 	rcu_read_lock();
1723 
1724 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1725 retry:
1726 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1727 			continue;
1728 
1729 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1730 		    !is_shadow_present_pte(iter.old_spte))
1731 			continue;
1732 
1733 		/*
1734 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1735 		 * a large page, its parent would have been zapped instead of
1736 		 * stepping down to the leaf.
1737 		 */
1738 		if (is_last_spte(iter.old_spte, iter.level))
1739 			continue;
1740 
1741 		/*
1742 		 * If iter.gfn resides outside of the slot, i.e. the page for
1743 		 * the current level overlaps but is not contained by the slot,
1744 		 * then the SPTE can't be made huge.  More importantly, trying
1745 		 * to query that info from slot->arch.lpage_info will cause an
1746 		 * out-of-bounds access.
1747 		 */
1748 		if (iter.gfn < start || iter.gfn >= end)
1749 			continue;
1750 
1751 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1752 							      iter.gfn, PG_LEVEL_NUM);
1753 		if (max_mapping_level < iter.level)
1754 			continue;
1755 
1756 		/* Note, a successful atomic zap also does a remote TLB flush. */
1757 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1758 			goto retry;
1759 	}
1760 
1761 	rcu_read_unlock();
1762 }
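
/*
 * A minimal standalone userspace C sketch of the slot bounds check in
 * zap_collapsible_spte_range() above: a 2MB-level entry spans 512 GFNs, so
 * its base GFN can fall outside a memslot that it merely overlaps, and such
 * entries must be skipped rather than looked up in per-slot metadata.
 * gfn_in_slot() and the numbers below are illustrative assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

static bool gfn_in_slot(unsigned long long gfn,
			unsigned long long slot_start,
			unsigned long long slot_end)
{
	return gfn >= slot_start && gfn < slot_end;
}

int main(void)
{
	/* Slot covering GFNs [0x100, 0x300); 2MB-level entries are 0x200-aligned. */
	unsigned long long slot_start = 0x100, slot_end = 0x300;

	/*
	 * The 2MB-level entry at GFN 0x0 covers [0x0, 0x200): it overlaps the
	 * slot, but its base GFN lies outside it, so it would be skipped.
	 * The entry at GFN 0x200 starts inside the slot and can be checked.
	 */
	printf("entry at 0x0:   %d\n", gfn_in_slot(0x0, slot_start, slot_end));
	printf("entry at 0x200: %d\n", gfn_in_slot(0x200, slot_start, slot_end));
	return 0;
}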
1763 
1764 /*
1765  * Zap non-leaf SPTEs (and free their associated page tables) which could
1766  * be replaced by huge pages, for GFNs within the slot.
1767  */
1768 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1769 				       const struct kvm_memory_slot *slot)
1770 {
1771 	struct kvm_mmu_page *root;
1772 
1773 	lockdep_assert_held_read(&kvm->mmu_lock);
1774 
1775 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1776 		zap_collapsible_spte_range(kvm, root, slot);
1777 }
1778 
1779 /*
1780  * Removes write access on the last level SPTE mapping this GFN and unsets the
1781  * MMU-writable bit to ensure future writes continue to be intercepted.
1782  * Returns true if an SPTE was set and a TLB flush is needed.
1783  */
1784 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1785 			      gfn_t gfn, int min_level)
1786 {
1787 	struct tdp_iter iter;
1788 	u64 new_spte;
1789 	bool spte_set = false;
1790 
1791 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1792 
1793 	rcu_read_lock();
1794 
1795 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1796 		if (!is_shadow_present_pte(iter.old_spte) ||
1797 		    !is_last_spte(iter.old_spte, iter.level))
1798 			continue;
1799 
1800 		new_spte = iter.old_spte &
1801 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1802 
1803 		if (new_spte == iter.old_spte)
1804 			break;
1805 
1806 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1807 		spte_set = true;
1808 	}
1809 
1810 	rcu_read_unlock();
1811 
1812 	return spte_set;
1813 }
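
/*
 * A minimal standalone userspace C sketch of the SPTE update performed by
 * write_protect_gfn() above: both the hardware-writable and software
 * MMU-writable bits are cleared in one step, and if the SPTE was already
 * write-protected nothing changes and no flush is needed.  The bit positions
 * and wrprot_spte() are simplified stand-ins, not KVM's masks or helpers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_WRITABLE_BIT	 (1ULL << 1)	/* stand-in for PT_WRITABLE_MASK */
#define EXAMPLE_MMU_WRITABLE_BIT (1ULL << 58)	/* stand-in for shadow_mmu_writable_mask */

/* Returns true if the SPTE changed, i.e. a TLB flush would be needed. */
static bool wrprot_spte(uint64_t *spte)
{
	uint64_t new_spte = *spte & ~(EXAMPLE_WRITABLE_BIT | EXAMPLE_MMU_WRITABLE_BIT);

	if (new_spte == *spte)
		return false;

	*spte = new_spte;
	return true;
}

int main(void)
{
	uint64_t spte = EXAMPLE_WRITABLE_BIT | EXAMPLE_MMU_WRITABLE_BIT | 0x1;

	/* First call clears both bits; the second finds nothing left to do. */
	printf("changed: %d, again: %d\n", wrprot_spte(&spte), wrprot_spte(&spte));
	return 0;
}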
1814 
1815 /*
1816  * Removes write access on the last level SPTE mapping this GFN and unsets the
1817  * MMU-writable bit to ensure future writes continue to be intercepted.
1818  * Returns true if an SPTE was set and a TLB flush is needed.
1819  */
1820 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1821 				   struct kvm_memory_slot *slot, gfn_t gfn,
1822 				   int min_level)
1823 {
1824 	struct kvm_mmu_page *root;
1825 	bool spte_set = false;
1826 
1827 	lockdep_assert_held_write(&kvm->mmu_lock);
1828 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1829 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1830 
1831 	return spte_set;
1832 }
1833 
1834 /*
1835  * Return the level of the lowest level SPTE added to sptes.
1836  * That SPTE may be non-present.
1837  *
1838  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1839  */
1840 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1841 			 int *root_level)
1842 {
1843 	struct tdp_iter iter;
1844 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1845 	gfn_t gfn = addr >> PAGE_SHIFT;
1846 	int leaf = -1;
1847 
1848 	*root_level = mmu->root_role.level;
1849 
1850 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1851 		leaf = iter.level;
1852 		sptes[leaf] = iter.old_spte;
1853 	}
1854 
1855 	return leaf;
1856 }
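
/*
 * A minimal standalone userspace C sketch of how a caller might consume the
 * output of a walk like kvm_tdp_mmu_get_walk() above: sptes[] is indexed by
 * paging level and the return value is the lowest level reached, or -1 if
 * nothing was walked.  The SPTE values, EXAMPLE_ROOT_LEVEL and dump_walk()
 * are made up for demonstration and are not KVM symbols.
 */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_ROOT_LEVEL 4	/* e.g. 4-level paging */

static void dump_walk(const uint64_t *sptes, int root_level, int leaf)
{
	int level;

	if (leaf < 0) {
		printf("no SPTEs walked\n");
		return;
	}

	/* Walk from the root down to the lowest level that was recorded. */
	for (level = root_level; level >= leaf; level--)
		printf("level %d: spte 0x%llx\n", level,
		       (unsigned long long)sptes[level]);
}

int main(void)
{
	uint64_t sptes[EXAMPLE_ROOT_LEVEL + 1] = {
		[4] = 0x8000000000001ac3ULL,	/* made-up upper-level SPTEs */
		[3] = 0x8000000000002ac3ULL,
		[2] = 0x0ULL,			/* non-present entry at level 2 */
	};

	dump_walk(sptes, EXAMPLE_ROOT_LEVEL, 2);
	return 0;
}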
1857 
1858 /*
1859  * Returns the last level spte pointer of the shadow page walk for the given
1860  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1861  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1862  *
1863  * Contract:
1864  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1865  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1866  *
1867  * WARNING: This function is only intended to be called during fast_page_fault.
1868  */
1869 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1870 					u64 *spte)
1871 {
1872 	struct tdp_iter iter;
1873 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1874 	gfn_t gfn = addr >> PAGE_SHIFT;
1875 	tdp_ptep_t sptep = NULL;
1876 
1877 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1878 		*spte = iter.old_spte;
1879 		sptep = iter.sptep;
1880 	}
1881 
1882 	/*
1883 	 * Perform the rcu_dereference to get the raw spte pointer value since
1884 	 * we are passing it up to fast_page_fault, which is shared with the
1885 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1886 	 * annotation.
1887 	 *
1888 	 * This is safe since fast_page_fault obeys the contracts of this
1889 	 * function as well as all TDP MMU contracts around modifying SPTEs
1890 	 * outside of mmu_lock.
1891 	 */
1892 	return rcu_dereference(sptep);
1893 }
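
/*
 * A minimal standalone userspace C sketch of the "remember the last entry
 * visited" pattern used by kvm_tdp_mmu_fast_pf_get_last_sptep() above: the
 * walk overwrites its outputs at every level, so the caller ends up with the
 * pointer and value of the lowest-level entry that was reached.  The toy
 * table and walk_to_leaf() are assumptions for demonstration only.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* A toy three-level "walk": each element stands in for one level's SPTE. */
static uint64_t toy_table[3] = { 0x111, 0x222, 0x333 };

static uint64_t *walk_to_leaf(uint64_t *spte_val)
{
	uint64_t *sptep = NULL;
	size_t i;

	for (i = 0; i < 3; i++) {
		*spte_val = toy_table[i];	/* value seen at this level */
		sptep = &toy_table[i];		/* pointer to this level's entry */
	}

	return sptep;	/* pointer to the last (lowest) level visited */
}

int main(void)
{
	uint64_t val = 0;
	uint64_t *leaf = walk_to_leaf(&val);

	printf("leaf value 0x%llx (via pointer: 0x%llx)\n",
	       (unsigned long long)val, (unsigned long long)*leaf);
	return 0;
}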
1894