xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision cf05e8c7)
1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3 
4 #include "mmu.h"
5 #include "mmu_internal.h"
6 #include "mmutrace.h"
7 #include "tdp_iter.h"
8 #include "tdp_mmu.h"
9 #include "spte.h"
10 
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
13 
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16 {
17 	struct workqueue_struct *wq;
18 
19 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
20 	if (!wq)
21 		return -ENOMEM;
22 
23 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
24 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25 	kvm->arch.tdp_mmu_zap_wq = wq;
26 	return 1;
27 }
28 
29 /* Arbitrarily returns true so that this may be used in if statements. */
30 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
31 							     bool shared)
32 {
33 	if (shared)
34 		lockdep_assert_held_read(&kvm->mmu_lock);
35 	else
36 		lockdep_assert_held_write(&kvm->mmu_lock);
37 
38 	return true;
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	/* Also waits for any queued work items.  */
44 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
45 
46 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
52 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
53 	 */
54 	rcu_barrier();
55 }
56 
57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
58 {
59 	free_page((unsigned long)sp->spt);
60 	kmem_cache_free(mmu_page_header_cache, sp);
61 }
62 
63 /*
64  * This is called through call_rcu in order to free TDP page table memory
65  * safely with respect to other kernel threads that may be operating on
66  * the memory.
67  * By only accessing TDP MMU page table memory in an RCU read-side critical
68  * section, and freeing it only after a grace period, lockless walkers are
69  * guaranteed not to use the memory after it is freed.
70  */
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72 {
73 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74 					       rcu_head);
75 
76 	tdp_mmu_free_sp(sp);
77 }
78 
79 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
80 			     bool shared);
81 
82 static void tdp_mmu_zap_root_work(struct work_struct *work)
83 {
84 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
85 						 tdp_mmu_async_work);
86 	struct kvm *kvm = root->tdp_mmu_async_data;
87 
88 	read_lock(&kvm->mmu_lock);
89 
90 	/*
91 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
92  * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
93 	 * to a different pCPU.  Note, the local TLB flush on reuse also
94 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
95 	 * intermediate paging structures, that may be zapped, as such entries
96 	 * are associated with the ASID on both VMX and SVM.
97 	 */
98 	tdp_mmu_zap_root(kvm, root, true);
99 
100 	/*
101 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
102 	 * avoiding an infinite loop.  By design, the root is reachable while
103 	 * it's being asynchronously zapped, thus a different task can put its
104 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
105 	 * asynchronously zapped root is unavoidable.
106 	 */
107 	kvm_tdp_mmu_put_root(kvm, root, true);
108 
109 	read_unlock(&kvm->mmu_lock);
110 }
111 
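/*
 * Queue asynchronous zapping of @root on the VM's dedicated workqueue.  The
 * caller's reference to @root is consumed by tdp_mmu_zap_root_work() when the
 * work item runs.
 */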
112 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
113 {
114 	root->tdp_mmu_async_data = kvm;
115 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
116 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
117 }
118 
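/*
 * Atomically mark @page invalid and return whether it was already invalid,
 * i.e. whether it has already been handed to the zap worker.
 */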
119 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
120 {
121 	union kvm_mmu_page_role role = page->role;
122 	role.invalid = true;
123 
124 	/* No need to use cmpxchg, only the invalid bit can change.  */
125 	role.word = xchg(&page->role.word, role.word);
126 	return role.invalid;
127 }
128 
129 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
130 			  bool shared)
131 {
132 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
133 
134 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
135 		return;
136 
137 	WARN_ON(!is_tdp_mmu_page(root));
138 
139 	/*
140 	 * The root now has refcount=0.  It is valid, but readers already
141 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
142 	 * rejects it.  This remains true for the rest of the execution
143 	 * of this function, because readers visit valid roots only
144 	 * (except for tdp_mmu_zap_root_work(), which however
145 	 * does not acquire any reference itself).
146 	 *
147 	 * Even though there are flows that need to visit all roots for
148 	 * correctness, they all take mmu_lock for write, so they cannot yet
149 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
150 	 * since the root still has refcount=0.
151 	 *
152 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
153 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
154 	 * So the root temporarily gets an extra reference, going to refcount=1
155 	 * while staying invalid.  Readers still cannot acquire any reference;
156 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
157 	 * they might take an extra reference if they themselves yield.
158 	 * Therefore, when the reference is given back by the worker,
159 	 * there is no guarantee that the refcount is still 1.  If not, whoever
160 	 * puts the last reference will free the page, but they will not have to
161 	 * zap the root because a root cannot go from invalid to valid.
162 	 */
163 	if (!kvm_tdp_root_mark_invalid(root)) {
164 		refcount_set(&root->tdp_mmu_root_count, 1);
165 
166 		/*
167 		 * Zapping the root in a worker is not just "nice to have";
168 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
169 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
170 		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
171 		 * might return with some roots not zapped yet.
172 		 */
173 		tdp_mmu_schedule_zap_root(kvm, root);
174 		return;
175 	}
176 
177 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
178 	list_del_rcu(&root->link);
179 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
180 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
181 }
182 
183 /*
184  * Returns the next root after @prev_root (or the first root if @prev_root is
185  * NULL).  A reference to the returned root is acquired, and the reference to
186  * @prev_root is released (the caller obviously must hold a reference to
187  * @prev_root if it's non-NULL).
188  *
189  * If @only_valid is true, invalid roots are skipped.
190  *
191  * Returns NULL if the end of tdp_mmu_roots was reached.
192  */
193 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
194 					      struct kvm_mmu_page *prev_root,
195 					      bool shared, bool only_valid)
196 {
197 	struct kvm_mmu_page *next_root;
198 
199 	rcu_read_lock();
200 
201 	if (prev_root)
202 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
203 						  &prev_root->link,
204 						  typeof(*prev_root), link);
205 	else
206 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
207 						   typeof(*next_root), link);
208 
209 	while (next_root) {
210 		if ((!only_valid || !next_root->role.invalid) &&
211 		    kvm_tdp_mmu_get_root(next_root))
212 			break;
213 
214 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
215 				&next_root->link, typeof(*next_root), link);
216 	}
217 
218 	rcu_read_unlock();
219 
220 	if (prev_root)
221 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
222 
223 	return next_root;
224 }
225 
226 /*
227  * Note: this iterator gets and puts references to the roots it iterates over.
228  * This makes it safe to release the MMU lock and yield within the loop, but
229  * if exiting the loop early, the caller must drop the reference to the most
230  * recent root. (Unless keeping a live reference is desirable.)
231  *
232  * If shared is set, this function is operating under the MMU lock in read
233  * mode. If this thread drops the last reference to a root, the root is handed
234  * off to the zap workqueue or freed via RCU; the lock is not dropped here.
235  */
236 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
237 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
238 	     _root;								\
239 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
240 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
241 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
242 		} else
243 
244 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
245 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
246 
247 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
248 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
249 
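/*
 * Illustrative use of the yield-safe iterator, mirroring kvm_tdp_mmu_zap_leafs()
 * below:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * Exiting such a loop early requires an explicit kvm_tdp_mmu_put_root() on the
 * current root, per the comment above.
 */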
250 /*
251  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
252  * the implication being that any flow that holds mmu_lock for read is
253  * inherently yield-friendly and should use the yield-safe variant above.
254  * Holding mmu_lock for write obviates the need for RCU protection as the list
255  * is guaranteed to be stable.
256  */
257 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
258 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
259 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
260 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
261 		} else
262 
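/*
 * Allocate a shadow page and its page table from the vCPU's memory caches,
 * which the page fault path tops up before acquiring mmu_lock, so allocating
 * here does not sleep.
 */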
263 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
264 {
265 	struct kvm_mmu_page *sp;
266 
267 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
268 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
269 
270 	return sp;
271 }
272 
273 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
274 			    gfn_t gfn, union kvm_mmu_page_role role)
275 {
276 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
277 
278 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
279 
280 	sp->role = role;
281 	sp->gfn = gfn;
282 	sp->ptep = sptep;
283 	sp->tdp_mmu_page = true;
284 
285 	trace_kvm_mmu_get_page(sp, true);
286 }
287 
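/*
 * Initialize @child_sp as a child of the page table containing @iter's current
 * SPTE, i.e. with the parent's role at one level lower.
 */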
288 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
289 				  struct tdp_iter *iter)
290 {
291 	struct kvm_mmu_page *parent_sp;
292 	union kvm_mmu_page_role role;
293 
294 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
295 
296 	role = parent_sp->role;
297 	role.level--;
298 
299 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
300 }
301 
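/*
 * Return the physical address of a TDP MMU root matching the vCPU's current
 * MMU role, reusing an existing valid root if possible and allocating a new
 * one otherwise.  A newly created root starts with a single reference held by
 * the caller; reusing a root takes an extra reference via kvm_tdp_mmu_get_root().
 */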
302 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
303 {
304 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
305 	struct kvm *kvm = vcpu->kvm;
306 	struct kvm_mmu_page *root;
307 
308 	lockdep_assert_held_write(&kvm->mmu_lock);
309 
310 	/*
311 	 * Check for an existing root before allocating a new one.  Note, the
312 	 * role check prevents consuming an invalid root.
313 	 */
314 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
315 		if (root->role.word == role.word &&
316 		    kvm_tdp_mmu_get_root(root))
317 			goto out;
318 	}
319 
320 	root = tdp_mmu_alloc_sp(vcpu);
321 	tdp_mmu_init_sp(root, NULL, 0, role);
322 
323 	refcount_set(&root->tdp_mmu_root_count, 1);
324 
325 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
326 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
327 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
328 
329 out:
330 	return __pa(root->spt);
331 }
332 
333 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
334 				u64 old_spte, u64 new_spte, int level,
335 				bool shared);
336 
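/*
 * Propagate Accessed information to the primary MMU when a previously
 * accessed leaf SPTE is zapped, loses its Accessed state, or changes PFN.
 */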
337 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
338 {
339 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
340 		return;
341 
342 	if (is_accessed_spte(old_spte) &&
343 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
344 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
345 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
346 }
347 
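/*
 * Mark the GFN dirty in the memslot's dirty bitmap when a 4K SPTE becomes
 * writable, either from a non-writable state or due to a PFN change.
 */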
348 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
349 					  u64 old_spte, u64 new_spte, int level)
350 {
351 	bool pfn_changed;
352 	struct kvm_memory_slot *slot;
353 
354 	if (level > PG_LEVEL_4K)
355 		return;
356 
357 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
358 
359 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
360 	    is_writable_pte(new_spte)) {
361 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
362 		mark_page_dirty_in_slot(kvm, slot, gfn);
363 	}
364 }
365 
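/*
 * Account/unaccount a TDP MMU page table page in the kernel's page table
 * memory stats (via kvm_account_pgtable_pages()) and in KVM's count of TDP
 * MMU pages (kvm->arch.tdp_mmu_pages).
 */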
366 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
367 {
368 	kvm_account_pgtable_pages((void *)sp->spt, +1);
369 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
370 }
371 
372 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
373 {
374 	kvm_account_pgtable_pages((void *)sp->spt, -1);
375 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
376 }
377 
378 /**
379  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
380  *
381  * @kvm: kvm instance
382  * @sp: the page to be removed
383  * @shared: This operation may not be running under the exclusive use of
384  *	    the MMU lock and the operation must synchronize with other
385  *	    threads that might be adding or removing pages.
386  */
387 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
388 			      bool shared)
389 {
390 	tdp_unaccount_mmu_page(kvm, sp);
391 
392 	if (!sp->nx_huge_page_disallowed)
393 		return;
394 
395 	if (shared)
396 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
397 	else
398 		lockdep_assert_held_write(&kvm->mmu_lock);
399 
400 	sp->nx_huge_page_disallowed = false;
401 	untrack_possible_nx_huge_page(kvm, sp);
402 
403 	if (shared)
404 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
405 }
406 
407 /**
408  * handle_removed_pt() - handle a page table removed from the TDP structure
409  *
410  * @kvm: kvm instance
411  * @pt: the page removed from the paging structure
412  * @shared: This operation may not be running under the exclusive use
413  *	    of the MMU lock and the operation must synchronize with other
414  *	    threads that might be modifying SPTEs.
415  *
416  * Given a page table that has been removed from the TDP paging structure,
417  * iterates through the page table to clear SPTEs and free child page tables.
418  *
419  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
420  * protection. Since this thread removed it from the paging structure,
421  * this thread will be responsible for ensuring the page is freed. Hence the
422  * early rcu_dereferences in the function.
423  */
424 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
425 {
426 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
427 	int level = sp->role.level;
428 	gfn_t base_gfn = sp->gfn;
429 	int i;
430 
431 	trace_kvm_mmu_prepare_zap_page(sp);
432 
433 	tdp_mmu_unlink_sp(kvm, sp, shared);
434 
435 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
436 		tdp_ptep_t sptep = pt + i;
437 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
438 		u64 old_spte;
439 
440 		if (shared) {
441 			/*
442 			 * Set the SPTE to a nonpresent value that other
443 			 * threads will not overwrite. If the SPTE was
444 			 * already marked as removed then another thread
445 			 * handling a page fault could overwrite it, so
446 			 * retry the write until the SPTE transitions from
447 			 * some other value to the removed SPTE value.
448 			 */
449 			for (;;) {
450 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
451 				if (!is_removed_spte(old_spte))
452 					break;
453 				cpu_relax();
454 			}
455 		} else {
456 			/*
457 			 * If the SPTE is not MMU-present, there is no backing
458 			 * page associated with the SPTE and so no side effects
459 			 * that need to be recorded, and exclusive ownership of
460 			 * mmu_lock ensures the SPTE can't be made present.
461 			 * Note, zapping MMIO SPTEs is also unnecessary as they
462 			 * are guarded by the memslots generation, not by being
463 			 * unreachable.
464 			 */
465 			old_spte = kvm_tdp_mmu_read_spte(sptep);
466 			if (!is_shadow_present_pte(old_spte))
467 				continue;
468 
469 			/*
470 			 * Use the common helper instead of a raw WRITE_ONCE as
471 			 * the SPTE needs to be updated atomically if it can be
472 			 * modified by a different vCPU outside of mmu_lock.
473 			 * Even though the parent SPTE is !PRESENT, the TLB
474 			 * hasn't yet been flushed, and both Intel and AMD
475 			 * document that A/D assists can use upper-level PxE
476 			 * entries that are cached in the TLB, i.e. the CPU can
477 			 * still access the page and mark it dirty.
478 			 *
479 			 * No retry is needed in the atomic update path as the
480 			 * sole concern is dropping a Dirty bit, i.e. no other
481 			 * task can zap/remove the SPTE as mmu_lock is held for
482 			 * write.  Marking the SPTE as a removed SPTE is not
483 			 * strictly necessary for the same reason, but using
484 			 * the removed SPTE value keeps the shared/exclusive
485 			 * paths consistent and allows the handle_changed_spte()
486 			 * call below to hardcode the new value to REMOVED_SPTE.
487 			 *
488 			 * Note, even though dropping a Dirty bit is the only
489 			 * scenario where a non-atomic update could result in a
490 			 * functional bug, simply checking the Dirty bit isn't
491 			 * sufficient as a fast page fault could read the upper
492 			 * level SPTE before it is zapped, and then make this
493 			 * target SPTE writable, resume the guest, and set the
494 			 * Dirty bit between reading the SPTE above and writing
495 			 * it here.
496 			 */
497 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
498 							  REMOVED_SPTE, level);
499 		}
500 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
501 				    old_spte, REMOVED_SPTE, level, shared);
502 	}
503 
504 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
505 }
506 
507 /**
508  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
509  * @kvm: kvm instance
510  * @as_id: the address space of the paging structure the SPTE was a part of
511  * @gfn: the base GFN that was mapped by the SPTE
512  * @old_spte: The value of the SPTE before the change
513  * @new_spte: The value of the SPTE after the change
514  * @level: the level of the PT the SPTE is part of in the paging structure
515  * @shared: This operation may not be running under the exclusive use of
516  *	    the MMU lock and the operation must synchronize with other
517  *	    threads that might be modifying SPTEs.
518  *
519  * Handle bookkeeping that might result from the modification of a SPTE.
520  * This function must be called for all TDP SPTE modifications.
521  */
522 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
523 				  u64 old_spte, u64 new_spte, int level,
524 				  bool shared)
525 {
526 	bool was_present = is_shadow_present_pte(old_spte);
527 	bool is_present = is_shadow_present_pte(new_spte);
528 	bool was_leaf = was_present && is_last_spte(old_spte, level);
529 	bool is_leaf = is_present && is_last_spte(new_spte, level);
530 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
531 
532 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
533 	WARN_ON(level < PG_LEVEL_4K);
534 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
535 
536 	/*
537 	 * If this warning were to trigger it would indicate that there was a
538 	 * missing MMU notifier or a race with some notifier handler.
539 	 * A present, leaf SPTE should never be directly replaced with another
540 	 * present leaf SPTE pointing to a different PFN. A notifier handler
541 	 * should be zapping the SPTE before the main MM's page table is
542 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
543 	 * thread before replacement.
544 	 */
545 	if (was_leaf && is_leaf && pfn_changed) {
546 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
547 		       "SPTE with another present leaf SPTE mapping a\n"
548 		       "different PFN!\n"
549 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
550 		       as_id, gfn, old_spte, new_spte, level);
551 
552 		/*
553 		 * Crash the host to prevent error propagation and guest data
554 		 * corruption.
555 		 */
556 		BUG();
557 	}
558 
559 	if (old_spte == new_spte)
560 		return;
561 
562 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
563 
564 	if (is_leaf)
565 		check_spte_writable_invariants(new_spte);
566 
567 	/*
568 	 * The only time a SPTE should change from one non-present state to
569 	 * another is when an MMIO entry is installed, modified, or removed.
570 	 * In that case, there is nothing to do here.
571 	 */
572 	if (!was_present && !is_present) {
573 		/*
574 		 * If this change does not involve a MMIO SPTE or removed SPTE,
575 		 * it is unexpected. Log the change, though it should not
576 		 * impact the guest since both the former and current SPTEs
577 		 * are nonpresent.
578 		 */
579 		if (WARN_ON(!is_mmio_spte(old_spte) &&
580 			    !is_mmio_spte(new_spte) &&
581 			    !is_removed_spte(new_spte)))
582 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
583 			       "should not be replaced with another,\n"
584 			       "different nonpresent SPTE, unless one or both\n"
585 			       "are MMIO SPTEs, or the new SPTE is\n"
586 			       "a temporary removed SPTE.\n"
587 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
588 			       as_id, gfn, old_spte, new_spte, level);
589 		return;
590 	}
591 
592 	if (is_leaf != was_leaf)
593 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
594 
595 	if (was_leaf && is_dirty_spte(old_spte) &&
596 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
597 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
598 
599 	/*
600 	 * Recursively handle child PTs if the change removed a subtree from
601 	 * the paging structure.  Note the WARN on the PFN changing without the
602 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
603 	 * pages are kernel allocations and should never be migrated.
604 	 */
605 	if (was_present && !was_leaf &&
606 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
607 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
608 }
609 
610 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
611 				u64 old_spte, u64 new_spte, int level,
612 				bool shared)
613 {
614 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
615 			      shared);
616 	handle_changed_spte_acc_track(old_spte, new_spte, level);
617 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
618 				      new_spte, level);
619 }
620 
621 /*
622  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
623  * and handle the associated bookkeeping.  Do not mark the page dirty
624  * in KVM's dirty bitmaps.
625  *
626  * If setting the SPTE fails because it has changed, iter->old_spte will be
627  * refreshed to the current value of the spte.
628  *
629  * @kvm: kvm instance
630  * @iter: a tdp_iter instance currently on the SPTE that should be set
631  * @new_spte: The value the SPTE should be set to
632  * Return:
633  * * 0      - If the SPTE was set.
634  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
635  *            no side-effects other than setting iter->old_spte to the last
636  *            known value of the spte.
637  */
638 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
639 					  struct tdp_iter *iter,
640 					  u64 new_spte)
641 {
642 	u64 *sptep = rcu_dereference(iter->sptep);
643 
644 	/*
645 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
646 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
647 	 * and pre-checking before inserting a new SPTE is advantageous as it
648 	 * avoids unnecessary work.
649 	 */
650 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
651 
652 	lockdep_assert_held_read(&kvm->mmu_lock);
653 
654 	/*
655 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
656 	 * does not hold the mmu_lock.
657 	 */
658 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
659 		return -EBUSY;
660 
661 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
662 			      new_spte, iter->level, true);
663 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
664 
665 	return 0;
666 }
667 
668 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
669 					  struct tdp_iter *iter)
670 {
671 	int ret;
672 
673 	/*
674 	 * Freeze the SPTE by setting it to a special,
675 	 * non-present value. This will stop other threads from
676 	 * immediately installing a present entry in its place
677 	 * before the TLBs are flushed.
678 	 */
679 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
680 	if (ret)
681 		return ret;
682 
683 	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
684 
685 	/*
686 	 * No other thread can overwrite the removed SPTE as they must either
687 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
688 	 * overwrite the special removed SPTE value. No bookkeeping is needed
689 	 * here since the SPTE is going from non-present to non-present.  Use
690 	 * the raw write helper to avoid an unnecessary check on volatile bits.
691 	 */
692 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
693 
694 	return 0;
695 }
696 
697 
698 /*
699  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
700  * @kvm:	      KVM instance
701  * @as_id:	      Address space ID, i.e. regular vs. SMM
702  * @sptep:	      Pointer to the SPTE
703  * @old_spte:	      The current value of the SPTE
704  * @new_spte:	      The new value that will be set for the SPTE
705  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
706  * @level:	      The level _containing_ the SPTE (its parent PT's level)
707  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
708  *		      of the page. Should be set unless handling an MMU
709  *		      notifier for access tracking. Leaving record_acc_track
710  *		      unset in that case prevents page accesses from being
711  *		      double counted.
712  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
713  *		      appropriate for the change being made. Should be set
714  *		      unless performing certain dirty logging operations.
715  *		      Leaving record_dirty_log unset in that case prevents page
716  *		      writes from being double counted.
717  *
718  * Returns the old SPTE value, which _may_ be different from @old_spte if the
719  * SPTE had volatile bits.
720  */
721 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
722 			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
723 			      bool record_acc_track, bool record_dirty_log)
724 {
725 	lockdep_assert_held_write(&kvm->mmu_lock);
726 
727 	/*
728 	 * No thread should be using this function to set SPTEs to or from the
729 	 * temporary removed SPTE value.
730 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
731 	 * should be used. If operating under the MMU lock in write mode, the
732 	 * use of the removed SPTE should not be necessary.
733 	 */
734 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
735 
736 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
737 
738 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
739 
740 	if (record_acc_track)
741 		handle_changed_spte_acc_track(old_spte, new_spte, level);
742 	if (record_dirty_log)
743 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
744 					      new_spte, level);
745 	return old_spte;
746 }
747 
748 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
749 				     u64 new_spte, bool record_acc_track,
750 				     bool record_dirty_log)
751 {
752 	WARN_ON_ONCE(iter->yielded);
753 
754 	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
755 					    iter->old_spte, new_spte,
756 					    iter->gfn, iter->level,
757 					    record_acc_track, record_dirty_log);
758 }
759 
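/*
 * Convenience wrappers: tdp_mmu_set_spte() records both accessed and dirty
 * state, tdp_mmu_set_spte_no_acc_track() skips accessed tracking, e.g. when
 * aging SPTEs in response to an MMU notifier.
 */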
760 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
761 				    u64 new_spte)
762 {
763 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
764 }
765 
766 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
767 						 struct tdp_iter *iter,
768 						 u64 new_spte)
769 {
770 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
771 }
772 
773 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
774 	for_each_tdp_pte(_iter, _root, _start, _end)
775 
776 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
777 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
778 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
779 		    !is_last_spte(_iter.old_spte, _iter.level))		\
780 			continue;					\
781 		else
782 
783 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
784 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
785 
786 /*
787  * Yield if the MMU lock is contended or this thread needs to return control
788  * to the scheduler.
789  *
790  * If this function should yield and flush is set, it will perform a remote
791  * TLB flush before yielding.
792  *
793  * If this function yields, iter->yielded is set and the caller must skip to
794  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
795  * over the paging structures to allow the iterator to continue its traversal
796  * from the paging structure root.
797  *
798  * Returns true if this function yielded.
799  */
800 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
801 							  struct tdp_iter *iter,
802 							  bool flush, bool shared)
803 {
804 	WARN_ON(iter->yielded);
805 
806 	/* Ensure forward progress has been made before yielding. */
807 	if (iter->next_last_level_gfn == iter->yielded_gfn)
808 		return false;
809 
810 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
811 		if (flush)
812 			kvm_flush_remote_tlbs(kvm);
813 
814 		rcu_read_unlock();
815 
816 		if (shared)
817 			cond_resched_rwlock_read(&kvm->mmu_lock);
818 		else
819 			cond_resched_rwlock_write(&kvm->mmu_lock);
820 
821 		rcu_read_lock();
822 
823 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
824 
825 		iter->yielded = true;
826 	}
827 
828 	return iter->yielded;
829 }
830 
831 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
832 {
833 	/*
834 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
835 	 * a gpa range that would exceed the max gfn, and KVM does not create
836 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
837 	 * the slow emulation path every time.
838 	 */
839 	return kvm_mmu_max_gfn() + 1;
840 }
841 
842 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
843 			       bool shared, int zap_level)
844 {
845 	struct tdp_iter iter;
846 
847 	gfn_t end = tdp_mmu_max_gfn_exclusive();
848 	gfn_t start = 0;
849 
850 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
851 retry:
852 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
853 			continue;
854 
855 		if (!is_shadow_present_pte(iter.old_spte))
856 			continue;
857 
858 		if (iter.level > zap_level)
859 			continue;
860 
861 		if (!shared)
862 			tdp_mmu_set_spte(kvm, &iter, 0);
863 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
864 			goto retry;
865 	}
866 }
867 
868 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
869 			     bool shared)
870 {
871 
872 	/*
873 	 * The root must have an elevated refcount so that it's reachable via
874 	 * mmu_notifier callbacks, which allows this path to yield and drop
875 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
876 	 * must drop all references to relevant pages prior to completing the
877 	 * callback.  Dropping mmu_lock with an unreachable root would result
878 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
879 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
880 	 * dirty/accessed bits to the SPTE's associated struct page.
881 	 */
882 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
883 
884 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
885 
886 	rcu_read_lock();
887 
888 	/*
889 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
890 	 * split the zap into two passes.  On the first pass, zap at the 1gb
891 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
892 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
893 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
894 	 *
895 	 * Because zapping a SP recurses on its children, stepping down to
896 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
897 	 */
898 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
899 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
900 
901 	rcu_read_unlock();
902 }
903 
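/*
 * Zap the SPTE pointing at @sp, i.e. remove @sp and its entire subtree from
 * the paging structure.  Returns false if @sp is a root or its parent SPTE is
 * already non-present; the caller must hold mmu_lock for write.
 */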
904 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
905 {
906 	u64 old_spte;
907 
908 	/*
909 	 * This helper intentionally doesn't allow zapping a root shadow page,
910 	 * which doesn't have a parent page table and thus no associated entry.
911 	 */
912 	if (WARN_ON_ONCE(!sp->ptep))
913 		return false;
914 
915 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
916 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
917 		return false;
918 
919 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
920 			   sp->gfn, sp->role.level + 1, true, true);
921 
922 	return true;
923 }
924 
925 /*
926  * If can_yield is true, will release the MMU lock and reschedule if the
927  * scheduler needs the CPU or there is contention on the MMU lock. If this
928  * function cannot yield, it will not release the MMU lock or reschedule and
929  * the caller must ensure it does not supply too large a GFN range, or the
930  * operation can cause a soft lockup.
931  */
932 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
933 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
934 {
935 	struct tdp_iter iter;
936 
937 	end = min(end, tdp_mmu_max_gfn_exclusive());
938 
939 	lockdep_assert_held_write(&kvm->mmu_lock);
940 
941 	rcu_read_lock();
942 
943 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
944 		if (can_yield &&
945 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
946 			flush = false;
947 			continue;
948 		}
949 
950 		if (!is_shadow_present_pte(iter.old_spte) ||
951 		    !is_last_spte(iter.old_spte, iter.level))
952 			continue;
953 
954 		tdp_mmu_set_spte(kvm, &iter, 0);
955 		flush = true;
956 	}
957 
958 	rcu_read_unlock();
959 
960 	/*
961 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
962 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
963 	 */
964 	return flush;
965 }
966 
967 /*
968  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
969  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
970  * more SPTEs were zapped since the MMU lock was last acquired.
971  */
972 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
973 			   bool can_yield, bool flush)
974 {
975 	struct kvm_mmu_page *root;
976 
977 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
978 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
979 
980 	return flush;
981 }
982 
983 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
984 {
985 	struct kvm_mmu_page *root;
986 	int i;
987 
988 	/*
989 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
990 	 * before returning to the caller.  Zap directly even if the root is
991 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
992 	 * all that expensive and mmu_lock is already held, which means the
993 	 * worker has yielded, i.e. flushing the work instead of zapping here
994 	 * isn't guaranteed to be any faster.
995 	 *
996 	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
997 	 * is being destroyed or the userspace VMM has exited.  In both cases,
998 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
999 	 */
1000 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1001 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1002 			tdp_mmu_zap_root(kvm, root, false);
1003 	}
1004 }
1005 
1006 /*
1007  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1008  * zap" completes.
1009  */
1010 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1011 {
1012 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1013 }
1014 
1015 /*
1016  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1017  * is about to be zapped, e.g. in response to a memslots update.  The actual
1018  * zapping is performed asynchronously, so a reference is taken on all valid roots.
1019  * Using a separate workqueue makes it easy to ensure that the destruction is
1020  * performed before the "fast zap" completes, without keeping a separate list
1021  * of invalidated roots; the list is effectively the list of work items in
1022  * the workqueue.
1023  *
1024  * Take a reference on each valid root and gift it to the asynchronous worker
1025  * that processes the root.  Already-invalid roots were handed to the worker
1026  * when they were invalidated and are skipped here.  Because mmu_lock is held
1027  * for write, no root can have a zero refcount, i.e. the list cannot be stale.
1028  *
1029  * This has essentially the same effect for the TDP MMU
1030  * as updating mmu_valid_gen does for the shadow MMU.
1031  */
1032 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1033 {
1034 	struct kvm_mmu_page *root;
1035 
1036 	lockdep_assert_held_write(&kvm->mmu_lock);
1037 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1038 		if (!root->role.invalid &&
1039 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1040 			root->role.invalid = true;
1041 			tdp_mmu_schedule_zap_root(kvm, root);
1042 		}
1043 	}
1044 }
1045 
1046 /*
1047  * Installs a last-level SPTE to handle a TDP page fault.
1048  * (NPT/EPT violation/misconfiguration)
1049  */
1050 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1051 					  struct kvm_page_fault *fault,
1052 					  struct tdp_iter *iter)
1053 {
1054 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1055 	u64 new_spte;
1056 	int ret = RET_PF_FIXED;
1057 	bool wrprot = false;
1058 
1059 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1060 		return RET_PF_RETRY;
1061 
1062 	if (unlikely(!fault->slot))
1063 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1064 	else
1065 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1066 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1067 					 fault->map_writable, &new_spte);
1068 
1069 	if (new_spte == iter->old_spte)
1070 		ret = RET_PF_SPURIOUS;
1071 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1072 		return RET_PF_RETRY;
1073 	else if (is_shadow_present_pte(iter->old_spte) &&
1074 		 !is_last_spte(iter->old_spte, iter->level))
1075 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1076 
1077 	/*
1078 	 * If the page fault was caused by a write but the page is write
1079 	 * protected, emulation is needed. If the emulation was skipped,
1080 	 * the vCPU would have the same fault again.
1081 	 */
1082 	if (wrprot) {
1083 		if (fault->write)
1084 			ret = RET_PF_EMULATE;
1085 	}
1086 
1087 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1088 	if (unlikely(is_mmio_spte(new_spte))) {
1089 		vcpu->stat.pf_mmio_spte_created++;
1090 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1091 				     new_spte);
1092 		ret = RET_PF_EMULATE;
1093 	} else {
1094 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1095 				       rcu_dereference(iter->sptep));
1096 	}
1097 
1098 	return ret;
1099 }
1100 
1101 /*
1102  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1103  * provided page table.
1104  *
1105  * @kvm: kvm instance
1106  * @iter: a tdp_iter instance currently on the SPTE that should be set
1107  * @sp: The new TDP page table to install.
1108  * @shared: This operation is running under the MMU lock in read mode.
1109  *
1110  * Returns: 0 if the new page table was installed. Non-0 if the page table
1111  *          could not be installed (e.g. the atomic compare-exchange failed).
1112  */
1113 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1114 			   struct kvm_mmu_page *sp, bool shared)
1115 {
1116 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1117 	int ret = 0;
1118 
1119 	if (shared) {
1120 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1121 		if (ret)
1122 			return ret;
1123 	} else {
1124 		tdp_mmu_set_spte(kvm, iter, spte);
1125 	}
1126 
1127 	tdp_account_mmu_page(kvm, sp);
1128 
1129 	return 0;
1130 }
1131 
1132 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1133 				   struct kvm_mmu_page *sp, bool shared);
1134 
1135 /*
1136  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1137  * page tables and SPTEs to translate the faulting guest physical address.
1138  */
1139 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1140 {
1141 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1142 	struct kvm *kvm = vcpu->kvm;
1143 	struct tdp_iter iter;
1144 	struct kvm_mmu_page *sp;
1145 	int ret = RET_PF_RETRY;
1146 
1147 	kvm_mmu_hugepage_adjust(vcpu, fault);
1148 
1149 	trace_kvm_mmu_spte_requested(fault);
1150 
1151 	rcu_read_lock();
1152 
1153 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1154 		int r;
1155 
1156 		if (fault->nx_huge_page_workaround_enabled)
1157 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1158 
1159 		/*
1160 		 * If SPTE has been frozen by another thread, just give up and
1161 		 * retry, avoiding unnecessary page table allocation and free.
1162 		 */
1163 		if (is_removed_spte(iter.old_spte))
1164 			goto retry;
1165 
1166 		if (iter.level == fault->goal_level)
1167 			goto map_target_level;
1168 
1169 		/* Step down into the lower level page table if it exists. */
1170 		if (is_shadow_present_pte(iter.old_spte) &&
1171 		    !is_large_pte(iter.old_spte))
1172 			continue;
1173 
1174 		/*
1175 		 * The SPTE is either non-present or points to a huge page that
1176 		 * needs to be split.
1177 		 */
1178 		sp = tdp_mmu_alloc_sp(vcpu);
1179 		tdp_mmu_init_child_sp(sp, &iter);
1180 
1181 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1182 
1183 		if (is_shadow_present_pte(iter.old_spte))
1184 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1185 		else
1186 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1187 
1188 		/*
1189 		 * Force the guest to retry if installing an upper level SPTE
1190 		 * failed, e.g. because a different task modified the SPTE.
1191 		 */
1192 		if (r) {
1193 			tdp_mmu_free_sp(sp);
1194 			goto retry;
1195 		}
1196 
1197 		if (fault->huge_page_disallowed &&
1198 		    fault->req_level >= iter.level) {
1199 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1200 			if (sp->nx_huge_page_disallowed)
1201 				track_possible_nx_huge_page(kvm, sp);
1202 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1203 		}
1204 	}
1205 
1206 	/*
1207 	 * The walk aborted before reaching the target level, e.g. because the
1208 	 * iterator detected an upper level SPTE was frozen during traversal.
1209 	 */
1210 	WARN_ON_ONCE(iter.level == fault->goal_level);
1211 	goto retry;
1212 
1213 map_target_level:
1214 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1215 
1216 retry:
1217 	rcu_read_unlock();
1218 	return ret;
1219 }
1220 
1221 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1222 				 bool flush)
1223 {
1224 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1225 				     range->end, range->may_block, flush);
1226 }
1227 
1228 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1229 			      struct kvm_gfn_range *range);
1230 
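/*
 * Invoke @handler on every present leaf SPTE in [range->start, range->end)
 * for every root in the range's address space, returning true if any handler
 * invocation returned true.
 */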
1231 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1232 						   struct kvm_gfn_range *range,
1233 						   tdp_handler_t handler)
1234 {
1235 	struct kvm_mmu_page *root;
1236 	struct tdp_iter iter;
1237 	bool ret = false;
1238 
1239 	/*
1240 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1241 	 * into this helper allow blocking; it'd be dead, wasteful code.
1242 	 */
1243 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1244 		rcu_read_lock();
1245 
1246 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1247 			ret |= handler(kvm, &iter, range);
1248 
1249 		rcu_read_unlock();
1250 	}
1251 
1252 	return ret;
1253 }
1254 
1255 /*
1256  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1257  * if any of the GFNs in the range have been accessed.
1258  */
1259 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1260 			  struct kvm_gfn_range *range)
1261 {
1262 	u64 new_spte = 0;
1263 
1264 	/* If we have a non-accessed entry we don't need to change the pte. */
1265 	if (!is_accessed_spte(iter->old_spte))
1266 		return false;
1267 
1268 	new_spte = iter->old_spte;
1269 
1270 	if (spte_ad_enabled(new_spte)) {
1271 		new_spte &= ~shadow_accessed_mask;
1272 	} else {
1273 		/*
1274 		 * Capture the dirty status of the page, so that it doesn't get
1275 		 * lost when the SPTE is marked for access tracking.
1276 		 */
1277 		if (is_writable_pte(new_spte))
1278 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1279 
1280 		new_spte = mark_spte_for_access_track(new_spte);
1281 	}
1282 
1283 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1284 
1285 	return true;
1286 }
1287 
1288 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1289 {
1290 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1291 }
1292 
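/* Return true if the SPTE mapping the GFN has its Accessed state set. */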
1293 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1294 			 struct kvm_gfn_range *range)
1295 {
1296 	return is_accessed_spte(iter->old_spte);
1297 }
1298 
1299 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1300 {
1301 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1302 }
1303 
1304 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1305 			 struct kvm_gfn_range *range)
1306 {
1307 	u64 new_spte;
1308 
1309 	/* Huge pages aren't expected to be modified without first being zapped. */
1310 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1311 
1312 	if (iter->level != PG_LEVEL_4K ||
1313 	    !is_shadow_present_pte(iter->old_spte))
1314 		return false;
1315 
1316 	/*
1317 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1318 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1319 	 * invariant that the PFN of a present leaf SPTE can never change.
1320 	 * See __handle_changed_spte().
1321 	 */
1322 	tdp_mmu_set_spte(kvm, iter, 0);
1323 
1324 	if (!pte_write(range->pte)) {
1325 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1326 								  pte_pfn(range->pte));
1327 
1328 		tdp_mmu_set_spte(kvm, iter, new_spte);
1329 	}
1330 
1331 	return true;
1332 }
1333 
1334 /*
1335  * Handle the changed_pte MMU notifier for the TDP MMU.
1336  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1337  * notifier.
1338  * Returns non-zero if a flush is needed before releasing the MMU lock.
1339  */
1340 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1341 {
1342 	/*
1343 	 * No need to handle the remote TLB flush under RCU protection, the
1344 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1345 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1346 	 */
1347 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1348 }
1349 
1350 /*
1351  * Remove write access from all SPTEs at or above min_level that map GFNs
1352  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1353  * be flushed.
1354  */
1355 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1356 			     gfn_t start, gfn_t end, int min_level)
1357 {
1358 	struct tdp_iter iter;
1359 	u64 new_spte;
1360 	bool spte_set = false;
1361 
1362 	rcu_read_lock();
1363 
1364 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1365 
1366 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1367 retry:
1368 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1369 			continue;
1370 
1371 		if (!is_shadow_present_pte(iter.old_spte) ||
1372 		    !is_last_spte(iter.old_spte, iter.level) ||
1373 		    !(iter.old_spte & PT_WRITABLE_MASK))
1374 			continue;
1375 
1376 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1377 
1378 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1379 			goto retry;
1380 
1381 		spte_set = true;
1382 	}
1383 
1384 	rcu_read_unlock();
1385 	return spte_set;
1386 }
1387 
1388 /*
1389  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1390  * only affect leaf SPTEs down to min_level.
1391  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1392  */
1393 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1394 			     const struct kvm_memory_slot *slot, int min_level)
1395 {
1396 	struct kvm_mmu_page *root;
1397 	bool spte_set = false;
1398 
1399 	lockdep_assert_held_read(&kvm->mmu_lock);
1400 
1401 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1402 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1403 			     slot->base_gfn + slot->npages, min_level);
1404 
1405 	return spte_set;
1406 }
1407 
1408 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1409 {
1410 	struct kvm_mmu_page *sp;
1411 
1412 	gfp |= __GFP_ZERO;
1413 
1414 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1415 	if (!sp)
1416 		return NULL;
1417 
1418 	sp->spt = (void *)__get_free_page(gfp);
1419 	if (!sp->spt) {
1420 		kmem_cache_free(mmu_page_header_cache, sp);
1421 		return NULL;
1422 	}
1423 
1424 	return sp;
1425 }
1426 
1427 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1428 						       struct tdp_iter *iter,
1429 						       bool shared)
1430 {
1431 	struct kvm_mmu_page *sp;
1432 
1433 	/*
1434 	 * Since we are allocating while under the MMU lock we have to be
1435 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1436 	 * reclaim and to avoid making any filesystem callbacks (which can end
1437 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1438 	 *
1439 	 * If this allocation fails we drop the lock and retry with reclaim
1440 	 * allowed.
1441 	 */
1442 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1443 	if (sp)
1444 		return sp;
1445 
1446 	rcu_read_unlock();
1447 
1448 	if (shared)
1449 		read_unlock(&kvm->mmu_lock);
1450 	else
1451 		write_unlock(&kvm->mmu_lock);
1452 
1453 	iter->yielded = true;
1454 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1455 
1456 	if (shared)
1457 		read_lock(&kvm->mmu_lock);
1458 	else
1459 		write_lock(&kvm->mmu_lock);
1460 
1461 	rcu_read_lock();
1462 
1463 	return sp;
1464 }
1465 
1466 /* Note, the caller is responsible for initializing @sp. */
1467 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1468 				   struct kvm_mmu_page *sp, bool shared)
1469 {
1470 	const u64 huge_spte = iter->old_spte;
1471 	const int level = iter->level;
1472 	int ret, i;
1473 
1474 	/*
1475 	 * No need for atomics when writing to sp->spt since the page table has
1476 	 * not been linked in yet and thus is not reachable from any other CPU.
1477 	 */
1478 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1479 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1480 
1481 	/*
1482 	 * Replace the huge spte with a pointer to the populated lower level
1483 	 * page table. Since we are making this change without a TLB flush vCPUs
1484 	 * will see a mix of the split mappings and the original huge mapping,
1485 	 * depending on what's currently in their TLB. This is fine from a
1486 	 * correctness standpoint since the translation will be the same either
1487 	 * way.
1488 	 */
1489 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1490 	if (ret)
1491 		goto out;
1492 
1493 	/*
1494 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1495 	 * are overwriting from the page stats. But we have to manually update
1496 	 * the page stats with the new present child pages.
1497 	 */
1498 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1499 
1500 out:
1501 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1502 	return ret;
1503 }
1504 
1505 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1506 					 struct kvm_mmu_page *root,
1507 					 gfn_t start, gfn_t end,
1508 					 int target_level, bool shared)
1509 {
1510 	struct kvm_mmu_page *sp = NULL;
1511 	struct tdp_iter iter;
1512 	int ret = 0;
1513 
1514 	rcu_read_lock();
1515 
1516 	/*
1517 	 * Traverse the page table splitting all huge pages above the target
1518 	 * level into one lower level. For example, if we encounter a 1GB page
1519 	 * we split it into 512 2MB pages.
1520 	 *
1521 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1522 	 * to visit an SPTE before ever visiting its children, which means we
1523 	 * will correctly recursively split huge pages that are more than one
1524 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1525 	 * and then splitting each of those to 512 4KB pages).
1526 	 */
1527 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1528 retry:
1529 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1530 			continue;
1531 
1532 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1533 			continue;
1534 
1535 		if (!sp) {
1536 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1537 			if (!sp) {
1538 				ret = -ENOMEM;
1539 				trace_kvm_mmu_split_huge_page(iter.gfn,
1540 							      iter.old_spte,
1541 							      iter.level, ret);
1542 				break;
1543 			}
1544 
1545 			if (iter.yielded)
1546 				continue;
1547 		}
1548 
1549 		tdp_mmu_init_child_sp(sp, &iter);
1550 
1551 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1552 			goto retry;
1553 
1554 		sp = NULL;
1555 	}
1556 
1557 	rcu_read_unlock();
1558 
1559 	/*
1560 	 * It's possible to exit the loop having never used the last sp if, for
1561 	 * example, a vCPU doing HugePage NX splitting wins the race and
1562 	 * installs its own sp in place of the last sp we tried to split.
1563 	 */
1564 	if (sp)
1565 		tdp_mmu_free_sp(sp);
1566 
1567 	return ret;
1568 }
1569 
1570 
1571 /*
1572  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1573  */
1574 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1575 				      const struct kvm_memory_slot *slot,
1576 				      gfn_t start, gfn_t end,
1577 				      int target_level, bool shared)
1578 {
1579 	struct kvm_mmu_page *root;
1580 	int r = 0;
1581 
1582 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1583 
1584 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1585 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1586 		if (r) {
1587 			kvm_tdp_mmu_put_root(kvm, root, shared);
1588 			break;
1589 		}
1590 	}
1591 }
1592 
1593 /*
1594  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1595  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1596  * If AD bits are not enabled, this will require clearing the writable bit on
1597  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1598  * be flushed.
1599  */
1600 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1601 				  gfn_t start, gfn_t end)
1602 {
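	/*
	 * Without A/D bits there is no dirty bit to clear, so dirty logging
	 * falls back to write protection: dropping PT_WRITABLE_MASK forces
	 * the next guest write to fault and be recorded as dirty.
	 */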
1603 	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1604 	struct tdp_iter iter;
1605 	bool spte_set = false;
1606 
1607 	rcu_read_lock();
1608 
1609 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1610 retry:
1611 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1612 			continue;
1613 
1614 		if (!is_shadow_present_pte(iter.old_spte))
1615 			continue;
1616 
1617 		MMU_WARN_ON(kvm_ad_enabled() &&
1618 			    spte_ad_need_write_protect(iter.old_spte));
1619 
1620 		if (!(iter.old_spte & dbit))
1621 			continue;
1622 
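		/*
		 * tdp_mmu_set_spte_atomic() can fail if the SPTE changed
		 * under the read lock; it refreshes iter.old_spte on failure,
		 * so the retry re-evaluates the checks above against the new
		 * value.
		 */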
1623 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1624 			goto retry;
1625 
1626 		spte_set = true;
1627 	}
1628 
1629 	rcu_read_unlock();
1630 	return spte_set;
1631 }
1632 
1633 /*
1634  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1635  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1636  * If AD bits are not enabled, this will require clearing the writable bit on
1637  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1638  * be flushed.
1639  */
1640 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1641 				  const struct kvm_memory_slot *slot)
1642 {
1643 	struct kvm_mmu_page *root;
1644 	bool spte_set = false;
1645 
1646 	lockdep_assert_held_read(&kvm->mmu_lock);
1647 
1648 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1649 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1650 				slot->base_gfn + slot->npages);
1651 
1652 	return spte_set;
1653 }
1654 
1655 /*
1656  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1657  * set in mask, starting at gfn. The given memslot is expected to contain all
1658  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1659  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1660  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1661  */
1662 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1663 				  gfn_t gfn, unsigned long mask, bool wrprot)
1664 {
1665 	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1666 						   shadow_dirty_mask;
1667 	struct tdp_iter iter;
1668 
1669 	rcu_read_lock();
1670 
1671 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1672 				    gfn + BITS_PER_LONG) {
1673 		if (!mask)
1674 			break;
1675 
1676 		MMU_WARN_ON(kvm_ad_enabled() &&
1677 			    spte_ad_need_write_protect(iter.old_spte));
1678 
1679 		if (iter.level > PG_LEVEL_4K ||
1680 		    !(mask & (1UL << (iter.gfn - gfn))))
1681 			continue;
1682 
1683 		mask &= ~(1UL << (iter.gfn - gfn));
1684 
1685 		if (!(iter.old_spte & dbit))
1686 			continue;
1687 
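		/*
		 * Unlike the read-lock path above, mmu_lock is held for write
		 * here, so the bits can be cleared directly without a
		 * cmpxchg retry loop.
		 */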
1688 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1689 							iter.old_spte, dbit,
1690 							iter.level);
1691 
1692 		__handle_changed_spte(kvm, iter.as_id, iter.gfn, iter.old_spte,
1693 				      iter.old_spte & ~dbit, iter.level, false);
1694 	}
1695 
1696 	rcu_read_unlock();
1697 }
1698 
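/*
 * Illustrative sketch only, not part of the original file: the mask/gfn
 * contract used by clear_dirty_pt_masked().  Bit i of @mask selects the
 * single 4K page at @gfn + i, so a mask of 0b1010 names exactly gfn + 1 and
 * gfn + 3.  The helper name is hypothetical.
 */
static void __maybe_unused example_walk_dirty_mask(gfn_t gfn, unsigned long mask)
{
	unsigned long bit;

	/* Visit each selected GFN, mirroring the bit handling above. */
	for_each_set_bit(bit, &mask, BITS_PER_LONG)
		pr_debug("dirty-log clear targets gfn 0x%llx\n", gfn + bit);
}
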
1699 /*
1700  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1701  * set in mask, starting at gfn. The given memslot is expected to contain all
1702  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1703  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1704  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1705  */
1706 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1707 				       struct kvm_memory_slot *slot,
1708 				       gfn_t gfn, unsigned long mask,
1709 				       bool wrprot)
1710 {
1711 	struct kvm_mmu_page *root;
1712 
1713 	lockdep_assert_held_write(&kvm->mmu_lock);
1714 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1715 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1716 }
1717 
1718 static void zap_collapsible_spte_range(struct kvm *kvm,
1719 				       struct kvm_mmu_page *root,
1720 				       const struct kvm_memory_slot *slot)
1721 {
1722 	gfn_t start = slot->base_gfn;
1723 	gfn_t end = start + slot->npages;
1724 	struct tdp_iter iter;
1725 	int max_mapping_level;
1726 
1727 	rcu_read_lock();
1728 
1729 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1730 retry:
1731 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1732 			continue;
1733 
1734 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1735 		    !is_shadow_present_pte(iter.old_spte))
1736 			continue;
1737 
1738 		/*
1739 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1740 		 * a large page size, then its parent would have been zapped
1741 		 * instead of stepping down.
1742 		 */
1743 		if (is_last_spte(iter.old_spte, iter.level))
1744 			continue;
1745 
1746 		/*
1747 		 * If iter.gfn resides outside of the slot, i.e. the page for
1748 		 * the current level overlaps but is not contained by the slot,
1749 		 * then the SPTE can't be made huge.  More importantly, trying
1750 		 * to query that info from slot->arch.lpage_info will cause an
1751 		 * out-of-bounds access.
1752 		 */
1753 		if (iter.gfn < start || iter.gfn >= end)
1754 			continue;
1755 
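		/*
		 * Only zap if the slot and backing memory would actually
		 * allow a huge mapping at this level; otherwise keep the
		 * existing page table rather than forcing a pointless
		 * refault.
		 */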
1756 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1757 							      iter.gfn, PG_LEVEL_NUM);
1758 		if (max_mapping_level < iter.level)
1759 			continue;
1760 
1761 		/* Note, a successful atomic zap also does a remote TLB flush. */
1762 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1763 			goto retry;
1764 	}
1765 
1766 	rcu_read_unlock();
1767 }
1768 
1769 /*
1770  * Zap non-leaf SPTEs (and free their associated page tables) which could
1771  * be replaced by huge pages, for GFNs within the slot.
1772  */
1773 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1774 				       const struct kvm_memory_slot *slot)
1775 {
1776 	struct kvm_mmu_page *root;
1777 
1778 	lockdep_assert_held_read(&kvm->mmu_lock);
1779 
1780 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1781 		zap_collapsible_spte_range(kvm, root, slot);
1782 }
1783 
1784 /*
1785  * Removes write access on the last level SPTE mapping this GFN and unsets the
1786  * MMU-writable bit to ensure future writes continue to be intercepted.
1787  * Returns true if an SPTE was set and a TLB flush is needed.
1788  */
1789 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1790 			      gfn_t gfn, int min_level)
1791 {
1792 	struct tdp_iter iter;
1793 	u64 new_spte;
1794 	bool spte_set = false;
1795 
1796 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1797 
1798 	rcu_read_lock();
1799 
1800 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1801 		if (!is_shadow_present_pte(iter.old_spte) ||
1802 		    !is_last_spte(iter.old_spte, iter.level))
1803 			continue;
1804 
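		/*
		 * Clear both the hardware-writable and MMU-writable bits so
		 * that the fast page fault path cannot simply restore write
		 * access; future writes must take the full fault path.
		 */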
1805 		new_spte = iter.old_spte &
1806 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1807 
1808 		if (new_spte == iter.old_spte)
1809 			break;
1810 
1811 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1812 		spte_set = true;
1813 	}
1814 
1815 	rcu_read_unlock();
1816 
1817 	return spte_set;
1818 }
1819 
1820 /*
1821  * Removes write access on the last level SPTE mapping this GFN and unsets the
1822  * MMU-writable bit to ensure future writes continue to be intercepted.
1823  * Returns true if an SPTE was set and a TLB flush is needed.
1824  */
1825 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1826 				   struct kvm_memory_slot *slot, gfn_t gfn,
1827 				   int min_level)
1828 {
1829 	struct kvm_mmu_page *root;
1830 	bool spte_set = false;
1831 
1832 	lockdep_assert_held_write(&kvm->mmu_lock);
1833 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1834 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1835 
1836 	return spte_set;
1837 }
1838 
1839 /*
1840  * Return the level of the lowest level SPTE added to sptes.
1841  * That SPTE may be non-present.
1842  *
1843  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1844  */
1845 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1846 			 int *root_level)
1847 {
1848 	struct tdp_iter iter;
1849 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1850 	gfn_t gfn = addr >> PAGE_SHIFT;
1851 	int leaf = -1;
1852 
1853 	*root_level = mmu->root_role.level;
1854 
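	/*
	 * sptes[] is indexed by level; each SPTE on the walk is recorded at
	 * its level, and the lowest level reached is returned.
	 */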
1855 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1856 		leaf = iter.level;
1857 		sptes[leaf] = iter.old_spte;
1858 	}
1859 
1860 	return leaf;
1861 }
1862 
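/*
 * Illustrative sketch only, not part of the original file: how a lockless
 * walk might be consumed, assuming the vCPU is using the TDP MMU.  The
 * helper name is hypothetical; sptes[] is sized to cover every possible
 * level, matching the indexing used by kvm_tdp_mmu_get_walk().
 */
static int __maybe_unused example_dump_walk(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, leaf, level;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	/* leaf < 0 means no SPTE was visited, e.g. the root was invalid. */
	if (leaf < 0)
		return leaf;

	for (level = root_level; level >= leaf; level--)
		pr_debug("level %d: spte 0x%llx\n", level, sptes[level]);

	return leaf;
}
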
1863 /*
1864  * Returns the last level spte pointer of the shadow page walk for the given
1865  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1866  * walk could be performed, returns NULL and *spte does not contain valid data.
1867  *
1868  * Contract:
1869  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1870  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1871  *
1872  * WARNING: This function is only intended to be called during fast_page_fault.
1873  */
1874 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1875 					u64 *spte)
1876 {
1877 	struct tdp_iter iter;
1878 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1879 	gfn_t gfn = addr >> PAGE_SHIFT;
1880 	tdp_ptep_t sptep = NULL;
1881 
1882 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1883 		*spte = iter.old_spte;
1884 		sptep = iter.sptep;
1885 	}
1886 
1887 	/*
1888 	 * Perform the rcu_dereference to get the raw spte pointer value since
1889 	 * we are passing it up to fast_page_fault, which is shared with the
1890 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1891 	 * annotation.
1892 	 *
1893 	 * This is safe since fast_page_fault obeys the contracts of this
1894 	 * function as well as all TDP MMU contracts around modifying SPTEs
1895 	 * outside of mmu_lock.
1896 	 */
1897 	return rcu_dereference(sptep);
1898 }
1899