xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision e9adcfec)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	struct workqueue_struct *wq;
20 
21 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 		return 0;
23 
24 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 	if (!wq)
26 		return -ENOMEM;
27 
28 	/* This should not be changed for the lifetime of the VM. */
29 	kvm->arch.tdp_mmu_enabled = true;
30 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 	kvm->arch.tdp_mmu_zap_wq = wq;
33 	return 1;
34 }
35 
36 /* Arbitrarily returns true so that this may be used in if statements. */
37 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
38 							     bool shared)
39 {
40 	if (shared)
41 		lockdep_assert_held_read(&kvm->mmu_lock);
42 	else
43 		lockdep_assert_held_write(&kvm->mmu_lock);
44 
45 	return true;
46 }
47 
48 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
49 {
50 	if (!kvm->arch.tdp_mmu_enabled)
51 		return;
52 
53 	/* Also waits for any queued work items.  */
54 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
55 
56 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
57 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
58 
59 	/*
60 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
61 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
62 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
63 	 */
64 	rcu_barrier();
65 }
66 
67 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
68 {
69 	free_page((unsigned long)sp->spt);
70 	kmem_cache_free(mmu_page_header_cache, sp);
71 }
72 
73 /*
74  * This is called through call_rcu in order to free TDP page table memory
75  * safely with respect to other kernel threads that may be operating on
76  * the memory.
77  * By only accessing TDP MMU page table memory in an RCU read critical
78  * section, and freeing it after a grace period, lockless access to that
79  * memory won't use it after it is freed.
80  */
81 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
82 {
83 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
84 					       rcu_head);
85 
86 	tdp_mmu_free_sp(sp);
87 }
88 
89 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
90 			     bool shared);
91 
92 static void tdp_mmu_zap_root_work(struct work_struct *work)
93 {
94 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
95 						 tdp_mmu_async_work);
96 	struct kvm *kvm = root->tdp_mmu_async_data;
97 
98 	read_lock(&kvm->mmu_lock);
99 
100 	/*
101 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
102 	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
103 	 * to a different pCPU.  Note, the local TLB flush on reuse also
104 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
105 	 * intermediate paging structures, that may be zapped, as such entries
106 	 * are associated with the ASID on both VMX and SVM.
107 	 */
108 	tdp_mmu_zap_root(kvm, root, true);
109 
110 	/*
111 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
112 	 * avoiding an infinite loop.  By design, the root is reachable while
113 	 * it's being asynchronously zapped, thus a different task can put its
114 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
115 	 * asynchronously zapped root is unavoidable.
116 	 */
117 	kvm_tdp_mmu_put_root(kvm, root, true);
118 
119 	read_unlock(&kvm->mmu_lock);
120 }
121 
122 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
123 {
124 	root->tdp_mmu_async_data = kvm;
125 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
126 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
127 }
128 
129 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
130 {
131 	union kvm_mmu_page_role role = page->role;
132 	role.invalid = true;
133 
134 	/* No need to use cmpxchg, only the invalid bit can change.  */
135 	role.word = xchg(&page->role.word, role.word);
136 	return role.invalid;
137 }
138 
139 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
140 			  bool shared)
141 {
142 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
143 
144 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
145 		return;
146 
147 	WARN_ON(!root->tdp_mmu_page);
148 
149 	/*
150 	 * The root now has refcount=0.  It is valid, but readers already
151 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
152 	 * rejects it.  This remains true for the rest of the execution
153 	 * of this function, because readers visit valid roots only
154 	 * (except for tdp_mmu_zap_root_work(), which however
155 	 * does not acquire any reference itself).
156 	 *
157 	 * Even though there are flows that need to visit all roots for
158 	 * correctness, they all take mmu_lock for write, so they cannot yet
159 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
160 	 * since the root still has refcount=0.
161 	 *
162 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
163 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
164 	 * So the root temporarily gets an extra reference, going to refcount=1
165 	 * while staying invalid.  Readers still cannot acquire any reference;
166 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
167 	 * they might take an extra reference if they themselves yield.
168 	 * Therefore, when the reference is given back by the worker,
169 	 * there is no guarantee that the refcount is still 1.  If not, whoever
170 	 * puts the last reference will free the page, but they will not have to
171 	 * zap the root because a root cannot go from invalid to valid.
172 	 */
173 	if (!kvm_tdp_root_mark_invalid(root)) {
174 		refcount_set(&root->tdp_mmu_root_count, 1);
175 
176 		/*
177 		 * Zapping the root in a worker is not just "nice to have";
178 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
179 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
180 		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
181 		 * might return with some roots not zapped yet.
182 		 */
183 		tdp_mmu_schedule_zap_root(kvm, root);
184 		return;
185 	}
186 
187 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
188 	list_del_rcu(&root->link);
189 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
190 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
191 }
192 
193 /*
194  * Returns the next root after @prev_root (or the first root if @prev_root is
195  * NULL).  A reference to the returned root is acquired, and the reference to
196  * @prev_root is released (the caller obviously must hold a reference to
197  * @prev_root if it's non-NULL).
198  *
199  * If @only_valid is true, invalid roots are skipped.
200  *
201  * Returns NULL if the end of tdp_mmu_roots was reached.
202  */
203 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
204 					      struct kvm_mmu_page *prev_root,
205 					      bool shared, bool only_valid)
206 {
207 	struct kvm_mmu_page *next_root;
208 
209 	rcu_read_lock();
210 
211 	if (prev_root)
212 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
213 						  &prev_root->link,
214 						  typeof(*prev_root), link);
215 	else
216 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
217 						   typeof(*next_root), link);
218 
219 	while (next_root) {
220 		if ((!only_valid || !next_root->role.invalid) &&
221 		    kvm_tdp_mmu_get_root(next_root))
222 			break;
223 
224 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
225 				&next_root->link, typeof(*next_root), link);
226 	}
227 
228 	rcu_read_unlock();
229 
230 	if (prev_root)
231 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
232 
233 	return next_root;
234 }
235 
236 /*
237  * Note: this iterator gets and puts references to the roots it iterates over.
238  * This makes it safe to release the MMU lock and yield within the loop, but
239  * if exiting the loop early, the caller must drop the reference to the most
240  * recent root. (Unless keeping a live reference is desirable.)
241  *
242  * If shared is set, this function is operating under the MMU lock in read
243  * mode.  Dropping the last reference to a root never requires the lock in
244  * write mode; see kvm_tdp_mmu_put_root().
245  */
/*
 * Core yield-safe iterator.  Roots are obtained from tdp_mmu_next_root(), and
 * the loop body runs only for roots whose address space ID equals @_as_id: the
 * empty "if" branch swallows all other roots, and the trailing "else" takes
 * the statement that follows the macro as the loop body.  The mmu_lock
 * assertion always evaluates to true and is checked on every iteration.
 */
246 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
247 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
248 	     _root;								\
249 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
250 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
251 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
252 		} else
253 
/* Yield-safe walk that skips invalid roots (_only_valid is true). */
254 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
255 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
256 
/*
 * Yield-safe walk over all roots, valid or invalid.  _shared is hardcoded to
 * false, i.e. mmu_lock is asserted to be held for write.
 */
257 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
258 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
259 
260 /*
261  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
262  * the implication being that any flow that holds mmu_lock for read is
263  * inherently yield-friendly and should use the yield-safe variant above.
264  * Holding mmu_lock for write obviates the need for RCU protection as the list
265  * is guaranteed to be stable.
266  */
/*
 * Plain list walk with no reference counting.  The empty "if" branch skips
 * roots whose address space ID differs from @_as_id; the trailing "else"
 * takes the statement that follows the macro as the loop body.
 */
267 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
268 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
269 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
270 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
271 		} else
272 
273 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
274 {
275 	struct kvm_mmu_page *sp;
276 
277 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
278 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
279 
280 	return sp;
281 }
282 
283 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
284 			    gfn_t gfn, union kvm_mmu_page_role role)
285 {
286 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
287 
288 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
289 
290 	sp->role = role;
291 	sp->gfn = gfn;
292 	sp->ptep = sptep;
293 	sp->tdp_mmu_page = true;
294 
295 	trace_kvm_mmu_get_page(sp, true);
296 }
297 
298 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
299 				  struct tdp_iter *iter)
300 {
301 	struct kvm_mmu_page *parent_sp;
302 	union kvm_mmu_page_role role;
303 
304 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
305 
306 	role = parent_sp->role;
307 	role.level--;
308 
309 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
310 }
311 
312 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
313 {
314 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
315 	struct kvm *kvm = vcpu->kvm;
316 	struct kvm_mmu_page *root;
317 
318 	lockdep_assert_held_write(&kvm->mmu_lock);
319 
320 	/*
321 	 * Check for an existing root before allocating a new one.  Note, the
322 	 * role check prevents consuming an invalid root.
323 	 */
324 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
325 		if (root->role.word == role.word &&
326 		    kvm_tdp_mmu_get_root(root))
327 			goto out;
328 	}
329 
330 	root = tdp_mmu_alloc_sp(vcpu);
331 	tdp_mmu_init_sp(root, NULL, 0, role);
332 
333 	refcount_set(&root->tdp_mmu_root_count, 1);
334 
335 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
336 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
337 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
338 
339 out:
340 	return __pa(root->spt);
341 }
342 
343 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
344 				u64 old_spte, u64 new_spte, int level,
345 				bool shared);
346 
347 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
348 {
349 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
350 		return;
351 
352 	if (is_accessed_spte(old_spte) &&
353 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
354 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
355 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
356 }
357 
358 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
359 					  u64 old_spte, u64 new_spte, int level)
360 {
361 	bool pfn_changed;
362 	struct kvm_memory_slot *slot;
363 
364 	if (level > PG_LEVEL_4K)
365 		return;
366 
367 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
368 
369 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
370 	    is_writable_pte(new_spte)) {
371 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
372 		mark_page_dirty_in_slot(kvm, slot, gfn);
373 	}
374 }
375 
376 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
377 {
378 	kvm_account_pgtable_pages((void *)sp->spt, +1);
379 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
380 }
381 
382 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
383 {
384 	kvm_account_pgtable_pages((void *)sp->spt, -1);
385 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
386 }
387 
388 /**
389  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
390  *
391  * @kvm: kvm instance
392  * @sp: the page to be removed
393  * @shared: This operation may not be running under the exclusive use of
394  *	    the MMU lock and the operation must synchronize with other
395  *	    threads that might be adding or removing pages.
396  */
397 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
398 			      bool shared)
399 {
400 	tdp_unaccount_mmu_page(kvm, sp);
401 
402 	if (!sp->nx_huge_page_disallowed)
403 		return;
404 
405 	if (shared)
406 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
407 	else
408 		lockdep_assert_held_write(&kvm->mmu_lock);
409 
410 	sp->nx_huge_page_disallowed = false;
411 	untrack_possible_nx_huge_page(kvm, sp);
412 
413 	if (shared)
414 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
415 }
416 
417 /**
418  * handle_removed_pt() - handle a page table removed from the TDP structure
419  *
420  * @kvm: kvm instance
421  * @pt: the page removed from the paging structure
422  * @shared: This operation may not be running under the exclusive use
423  *	    of the MMU lock and the operation must synchronize with other
424  *	    threads that might be modifying SPTEs.
425  *
426  * Given a page table that has been removed from the TDP paging structure,
427  * iterates through the page table to clear SPTEs and free child page tables.
428  *
429  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
430  * protection. Since this thread removed it from the paging structure,
431  * this thread will be responsible for ensuring the page is freed. Hence the
432  * early rcu_dereferences in the function.
433  */
434 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
435 {
436 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
437 	int level = sp->role.level;
438 	gfn_t base_gfn = sp->gfn;
439 	int i;
440 
441 	trace_kvm_mmu_prepare_zap_page(sp);
442 
443 	tdp_mmu_unlink_sp(kvm, sp, shared);
444 
445 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
446 		tdp_ptep_t sptep = pt + i;
447 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
448 		u64 old_spte;
449 
450 		if (shared) {
451 			/*
452 			 * Set the SPTE to a nonpresent value that other
453 			 * threads will not overwrite. If the SPTE was
454 			 * already marked as removed then another thread
455 			 * handling a page fault could overwrite it, so
456 			 * set the SPTE until it is set from some other
457 			 * value to the removed SPTE value.
458 			 */
459 			for (;;) {
460 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
461 				if (!is_removed_spte(old_spte))
462 					break;
463 				cpu_relax();
464 			}
465 		} else {
466 			/*
467 			 * If the SPTE is not MMU-present, there is no backing
468 			 * page associated with the SPTE and so no side effects
469 			 * that need to be recorded, and exclusive ownership of
470 			 * mmu_lock ensures the SPTE can't be made present.
471 			 * Note, zapping MMIO SPTEs is also unnecessary as they
472 			 * are guarded by the memslots generation, not by being
473 			 * unreachable.
474 			 */
475 			old_spte = kvm_tdp_mmu_read_spte(sptep);
476 			if (!is_shadow_present_pte(old_spte))
477 				continue;
478 
479 			/*
480 			 * Use the common helper instead of a raw WRITE_ONCE as
481 			 * the SPTE needs to be updated atomically if it can be
482 			 * modified by a different vCPU outside of mmu_lock.
483 			 * Even though the parent SPTE is !PRESENT, the TLB
484 			 * hasn't yet been flushed, and both Intel and AMD
485 			 * document that A/D assists can use upper-level PxE
486 			 * entries that are cached in the TLB, i.e. the CPU can
487 			 * still access the page and mark it dirty.
488 			 *
489 			 * No retry is needed in the atomic update path as the
490 			 * sole concern is dropping a Dirty bit, i.e. no other
491 			 * task can zap/remove the SPTE as mmu_lock is held for
492 			 * write.  Marking the SPTE as a removed SPTE is not
493 			 * strictly necessary for the same reason, but using
494 			 * the remove SPTE value keeps the shared/exclusive
495 			 * paths consistent and allows the handle_changed_spte()
496 			 * call below to hardcode the new value to REMOVED_SPTE.
497 			 *
498 			 * Note, even though dropping a Dirty bit is the only
499 			 * scenario where a non-atomic update could result in a
500 			 * functional bug, simply checking the Dirty bit isn't
501 			 * sufficient as a fast page fault could read the upper
502 			 * level SPTE before it is zapped, and then make this
503 			 * target SPTE writable, resume the guest, and set the
504 			 * Dirty bit between reading the SPTE above and writing
505 			 * it here.
506 			 */
507 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
508 							  REMOVED_SPTE, level);
509 		}
510 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
511 				    old_spte, REMOVED_SPTE, level, shared);
512 	}
513 
514 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
515 }
516 
517 /**
518  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
519  * @kvm: kvm instance
520  * @as_id: the address space of the paging structure the SPTE was a part of
521  * @gfn: the base GFN that was mapped by the SPTE
522  * @old_spte: The value of the SPTE before the change
523  * @new_spte: The value of the SPTE after the change
524  * @level: the level of the PT the SPTE is part of in the paging structure
525  * @shared: This operation may not be running under the exclusive use of
526  *	    the MMU lock and the operation must synchronize with other
527  *	    threads that might be modifying SPTEs.
528  *
529  * Handle bookkeeping that might result from the modification of a SPTE.
530  * This function must be called for all TDP SPTE modifications.
531  */
532 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
533 				  u64 old_spte, u64 new_spte, int level,
534 				  bool shared)
535 {
536 	bool was_present = is_shadow_present_pte(old_spte);
537 	bool is_present = is_shadow_present_pte(new_spte);
538 	bool was_leaf = was_present && is_last_spte(old_spte, level);
539 	bool is_leaf = is_present && is_last_spte(new_spte, level);
540 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
541 
542 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
543 	WARN_ON(level < PG_LEVEL_4K);
544 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
545 
546 	/*
547 	 * If this warning were to trigger it would indicate that there was a
548 	 * missing MMU notifier or a race with some notifier handler.
549 	 * A present, leaf SPTE should never be directly replaced with another
550 	 * present leaf SPTE pointing to a different PFN. A notifier handler
551 	 * should be zapping the SPTE before the main MM's page table is
552 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
553 	 * thread before replacement.
554 	 */
555 	if (was_leaf && is_leaf && pfn_changed) {
556 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
557 		       "SPTE with another present leaf SPTE mapping a\n"
558 		       "different PFN!\n"
559 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
560 		       as_id, gfn, old_spte, new_spte, level);
561 
562 		/*
563 		 * Crash the host to prevent error propagation and guest data
564 		 * corruption.
565 		 */
566 		BUG();
567 	}
568 
569 	if (old_spte == new_spte)
570 		return;
571 
572 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
573 
574 	if (is_leaf)
575 		check_spte_writable_invariants(new_spte);
576 
577 	/*
578 	 * The only times a SPTE should be changed from a non-present to
579 	 * non-present state is when an MMIO entry is installed/modified/
580 	 * removed. In that case, there is nothing to do here.
581 	 */
582 	if (!was_present && !is_present) {
583 		/*
584 		 * If this change does not involve a MMIO SPTE or removed SPTE,
585 		 * it is unexpected. Log the change, though it should not
586 		 * impact the guest since both the former and current SPTEs
587 		 * are nonpresent.
588 		 */
589 		if (WARN_ON(!is_mmio_spte(old_spte) &&
590 			    !is_mmio_spte(new_spte) &&
591 			    !is_removed_spte(new_spte)))
592 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
593 			       "should not be replaced with another,\n"
594 			       "different nonpresent SPTE, unless one or both\n"
595 			       "are MMIO SPTEs, or the new SPTE is\n"
596 			       "a temporary removed SPTE.\n"
597 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
598 			       as_id, gfn, old_spte, new_spte, level);
599 		return;
600 	}
601 
602 	if (is_leaf != was_leaf)
603 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
604 
605 	if (was_leaf && is_dirty_spte(old_spte) &&
606 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
607 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
608 
609 	/*
610 	 * Recursively handle child PTs if the change removed a subtree from
611 	 * the paging structure.  Note the WARN on the PFN changing without the
612 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
613 	 * pages are kernel allocations and should never be migrated.
614 	 */
615 	if (was_present && !was_leaf &&
616 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
617 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
618 }
619 
620 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
621 				u64 old_spte, u64 new_spte, int level,
622 				bool shared)
623 {
624 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
625 			      shared);
626 	handle_changed_spte_acc_track(old_spte, new_spte, level);
627 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
628 				      new_spte, level);
629 }
630 
631 /*
632  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
633  * and handle the associated bookkeeping.  Do not mark the page dirty
634  * in KVM's dirty bitmaps.
635  *
636  * If setting the SPTE fails because it has changed, iter->old_spte will be
637  * refreshed to the current value of the spte.
638  *
639  * @kvm: kvm instance
640  * @iter: a tdp_iter instance currently on the SPTE that should be set
641  * @new_spte: The value the SPTE should be set to
642  * Return:
643  * * 0      - If the SPTE was set.
644  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
645  *            no side-effects other than setting iter->old_spte to the last
646  *            known value of the spte.
647  */
648 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
649 					  struct tdp_iter *iter,
650 					  u64 new_spte)
651 {
652 	u64 *sptep = rcu_dereference(iter->sptep);
653 
654 	/*
655 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
656 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
657 	 * and pre-checking before inserting a new SPTE is advantageous as it
658 	 * avoids unnecessary work.
659 	 */
660 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
661 
662 	lockdep_assert_held_read(&kvm->mmu_lock);
663 
664 	/*
665 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
666 	 * does not hold the mmu_lock.
667 	 */
668 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
669 		return -EBUSY;
670 
671 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
672 			      new_spte, iter->level, true);
673 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
674 
675 	return 0;
676 }
677 
678 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
679 					  struct tdp_iter *iter)
680 {
681 	int ret;
682 
683 	/*
684 	 * Freeze the SPTE by setting it to a special,
685 	 * non-present value. This will stop other threads from
686 	 * immediately installing a present entry in its place
687 	 * before the TLBs are flushed.
688 	 */
689 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
690 	if (ret)
691 		return ret;
692 
693 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
694 					   KVM_PAGES_PER_HPAGE(iter->level));
695 
696 	/*
697 	 * No other thread can overwrite the removed SPTE as they must either
698 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
699 	 * overwrite the special removed SPTE value. No bookkeeping is needed
700 	 * here since the SPTE is going from non-present to non-present.  Use
701 	 * the raw write helper to avoid an unnecessary check on volatile bits.
702 	 */
703 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
704 
705 	return 0;
706 }
707 
708 
709 /*
710  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
711  * @kvm:	      KVM instance
712  * @as_id:	      Address space ID, i.e. regular vs. SMM
713  * @sptep:	      Pointer to the SPTE
714  * @old_spte:	      The current value of the SPTE
715  * @new_spte:	      The new value that will be set for the SPTE
716  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
717  * @level:	      The level _containing_ the SPTE (its parent PT's level)
718  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
719  *		      of the page. Should be set unless handling an MMU
720  *		      notifier for access tracking. Leaving record_acc_track
721  *		      unset in that case prevents page accesses from being
722  *		      double counted.
723  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
724  *		      appropriate for the change being made. Should be set
725  *		      unless performing certain dirty logging operations.
726  *		      Leaving record_dirty_log unset in that case prevents page
727  *		      writes from being double counted.
728  *
729  * Returns the old SPTE value, which _may_ be different than @old_spte if the
730  * SPTE had volatile bits.
731  */
732 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
733 			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
734 			      bool record_acc_track, bool record_dirty_log)
735 {
736 	lockdep_assert_held_write(&kvm->mmu_lock);
737 
738 	/*
739 	 * No thread should be using this function to set SPTEs to or from the
740 	 * temporary removed SPTE value.
741 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
742 	 * should be used. If operating under the MMU lock in write mode, the
743 	 * use of the removed SPTE should not be necessary.
744 	 */
745 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
746 
747 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
748 
749 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
750 
751 	if (record_acc_track)
752 		handle_changed_spte_acc_track(old_spte, new_spte, level);
753 	if (record_dirty_log)
754 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
755 					      new_spte, level);
756 	return old_spte;
757 }
758 
759 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
760 				     u64 new_spte, bool record_acc_track,
761 				     bool record_dirty_log)
762 {
763 	WARN_ON_ONCE(iter->yielded);
764 
765 	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
766 					    iter->old_spte, new_spte,
767 					    iter->gfn, iter->level,
768 					    record_acc_track, record_dirty_log);
769 }
770 
771 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
772 				    u64 new_spte)
773 {
774 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
775 }
776 
777 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
778 						 struct tdp_iter *iter,
779 						 u64 new_spte)
780 {
781 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
782 }
783 
784 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
785 						 struct tdp_iter *iter,
786 						 u64 new_spte)
787 {
788 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
789 }
790 
/* Iterate over the SPTEs under @_root; a thin alias for for_each_tdp_pte(). */
791 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
792 	for_each_tdp_pte(_iter, _root, _start, _end)
793 
/*
 * Like tdp_root_for_each_pte(), but the loop body runs only for SPTEs that
 * are shadow-present and last-level for their level; all others are skipped
 * via "continue".  The trailing "else" takes the statement that follows the
 * macro as the loop body.
 */
794 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
795 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
796 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
797 		    !is_last_spte(_iter.old_spte, _iter.level))		\
798 			continue;					\
799 		else
800 
/* Iterate over SPTEs starting from the shadow page behind @_mmu's root.hpa. */
801 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
802 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
803 
804 /*
805  * Yield if the MMU lock is contended or this thread needs to return control
806  * to the scheduler.
807  *
808  * If this function should yield and flush is set, it will perform a remote
809  * TLB flush before yielding.
810  *
811  * If this function yields, iter->yielded is set and the caller must skip to
812  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
813  * over the paging structures to allow the iterator to continue its traversal
814  * from the paging structure root.
815  *
816  * Returns true if this function yielded.
817  */
818 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
819 							  struct tdp_iter *iter,
820 							  bool flush, bool shared)
821 {
822 	WARN_ON(iter->yielded);
823 
824 	/* Ensure forward progress has been made before yielding. */
825 	if (iter->next_last_level_gfn == iter->yielded_gfn)
826 		return false;
827 
828 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
829 		if (flush)
830 			kvm_flush_remote_tlbs(kvm);
831 
832 		rcu_read_unlock();
833 
834 		if (shared)
835 			cond_resched_rwlock_read(&kvm->mmu_lock);
836 		else
837 			cond_resched_rwlock_write(&kvm->mmu_lock);
838 
839 		rcu_read_lock();
840 
841 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
842 
843 		iter->yielded = true;
844 	}
845 
846 	return iter->yielded;
847 }
848 
849 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
850 {
851 	/*
852 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
853 	 * a gpa range that would exceed the max gfn, and KVM does not create
854 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
855 	 * the slow emulation path every time.
856 	 */
857 	return kvm_mmu_max_gfn() + 1;
858 }
859 
860 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
861 			       bool shared, int zap_level)
862 {
863 	struct tdp_iter iter;
864 
865 	gfn_t end = tdp_mmu_max_gfn_exclusive();
866 	gfn_t start = 0;
867 
868 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
869 retry:
870 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
871 			continue;
872 
873 		if (!is_shadow_present_pte(iter.old_spte))
874 			continue;
875 
876 		if (iter.level > zap_level)
877 			continue;
878 
879 		if (!shared)
880 			tdp_mmu_set_spte(kvm, &iter, 0);
881 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
882 			goto retry;
883 	}
884 }
885 
886 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
887 			     bool shared)
888 {
889 
890 	/*
891 	 * The root must have an elevated refcount so that it's reachable via
892 	 * mmu_notifier callbacks, which allows this path to yield and drop
893 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
894 	 * must drop all references to relevant pages prior to completing the
895 	 * callback.  Dropping mmu_lock with an unreachable root would result
896 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
897 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
898 	 * dirty accessed bits to the SPTE's associated struct page.
899 	 */
900 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
901 
902 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
903 
904 	rcu_read_lock();
905 
906 	/*
907 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
908 	 * split the zap into two passes.  On the first pass, zap at the 1gb
909 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
910 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
911 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
912 	 *
913 	 * Because zapping a SP recurses on its children, stepping down to
914 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
915 	 */
916 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
917 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
918 
919 	rcu_read_unlock();
920 }
921 
922 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
923 {
924 	u64 old_spte;
925 
926 	/*
927 	 * This helper intentionally doesn't allow zapping a root shadow page,
928 	 * which doesn't have a parent page table and thus no associated entry.
929 	 */
930 	if (WARN_ON_ONCE(!sp->ptep))
931 		return false;
932 
933 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
934 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
935 		return false;
936 
937 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
938 			   sp->gfn, sp->role.level + 1, true, true);
939 
940 	return true;
941 }
942 
943 /*
944  * If can_yield is true, will release the MMU lock and reschedule if the
945  * scheduler needs the CPU or there is contention on the MMU lock. If this
946  * function cannot yield, it will not release the MMU lock or reschedule and
947  * the caller must ensure it does not supply too large a GFN range, or the
948  * operation can cause a soft lockup.
949  */
950 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
951 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
952 {
953 	struct tdp_iter iter;
954 
955 	end = min(end, tdp_mmu_max_gfn_exclusive());
956 
957 	lockdep_assert_held_write(&kvm->mmu_lock);
958 
959 	rcu_read_lock();
960 
961 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
962 		if (can_yield &&
963 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
964 			flush = false;
965 			continue;
966 		}
967 
968 		if (!is_shadow_present_pte(iter.old_spte) ||
969 		    !is_last_spte(iter.old_spte, iter.level))
970 			continue;
971 
972 		tdp_mmu_set_spte(kvm, &iter, 0);
973 		flush = true;
974 	}
975 
976 	rcu_read_unlock();
977 
978 	/*
979 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
980 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
981 	 */
982 	return flush;
983 }
984 
985 /*
986  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
987  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
988  * more SPTEs were zapped since the MMU lock was last acquired.
989  */
990 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
991 			   bool can_yield, bool flush)
992 {
993 	struct kvm_mmu_page *root;
994 
995 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
996 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
997 
998 	return flush;
999 }
1000 
1001 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
1002 {
1003 	struct kvm_mmu_page *root;
1004 	int i;
1005 
1006 	/*
1007 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
1008 	 * before returning to the caller.  Zap directly even if the root is
1009 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
1010 	 * all that expensive and mmu_lock is already held, which means the
1011 	 * worker has yielded, i.e. flushing the work instead of zapping here
1012 	 * isn't guaranteed to be any faster.
1013 	 *
1014 	 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
1015 	 * is being destroyed or the userspace VMM has exited.  In both cases,
1016 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1017 	 */
1018 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1019 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1020 			tdp_mmu_zap_root(kvm, root, false);
1021 	}
1022 }
1023 
1024 /*
1025  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1026  * zap" completes.
1027  */
1028 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1029 {
1030 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1031 }
1032 
1033 /*
1034  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1035  * is about to be zapped, e.g. in response to a memslots update.  The actual
1036  * zapping is performed asynchronously, so a reference is taken on all roots.
1037  * Using a separate workqueue makes it easy to ensure that the destruction is
1038  * performed before the "fast zap" completes, without keeping a separate list
1039  * of invalidated roots; the list is effectively the list of work items in
1040  * the workqueue.
1041  *
1042  * Get a reference even if the root is already invalid, the asynchronous worker
1043  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1044  * is held for write, it should be impossible to observe a root with zero refcount,
1045  * i.e. the list of roots cannot be stale.
1046  *
1047  * This has essentially the same effect for the TDP MMU
1048  * as updating mmu_valid_gen does for the shadow MMU.
1049  */
1050 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1051 {
1052 	struct kvm_mmu_page *root;
1053 
1054 	lockdep_assert_held_write(&kvm->mmu_lock);
1055 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1056 		if (!root->role.invalid &&
1057 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1058 			root->role.invalid = true;
1059 			tdp_mmu_schedule_zap_root(kvm, root);
1060 		}
1061 	}
1062 }
1063 
1064 /*
1065  * Installs a last-level SPTE to handle a TDP page fault.
1066  * (NPT/EPT violation/misconfiguration)
1067  */
1068 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1069 					  struct kvm_page_fault *fault,
1070 					  struct tdp_iter *iter)
1071 {
1072 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1073 	u64 new_spte;
1074 	int ret = RET_PF_FIXED;
1075 	bool wrprot = false;
1076 
1077 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1078 		return RET_PF_RETRY;
1079 
1080 	if (unlikely(!fault->slot))
1081 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1082 	else
1083 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1084 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1085 					 fault->map_writable, &new_spte);
1086 
1087 	if (new_spte == iter->old_spte)
1088 		ret = RET_PF_SPURIOUS;
1089 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1090 		return RET_PF_RETRY;
1091 	else if (is_shadow_present_pte(iter->old_spte) &&
1092 		 !is_last_spte(iter->old_spte, iter->level))
1093 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1094 						   KVM_PAGES_PER_HPAGE(iter->level + 1));
1095 
1096 	/*
1097 	 * If the page fault was caused by a write but the page is write
1098 	 * protected, emulation is needed. If the emulation was skipped,
1099 	 * the vCPU would have the same fault again.
1100 	 */
1101 	if (wrprot) {
1102 		if (fault->write)
1103 			ret = RET_PF_EMULATE;
1104 	}
1105 
1106 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1107 	if (unlikely(is_mmio_spte(new_spte))) {
1108 		vcpu->stat.pf_mmio_spte_created++;
1109 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1110 				     new_spte);
1111 		ret = RET_PF_EMULATE;
1112 	} else {
1113 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1114 				       rcu_dereference(iter->sptep));
1115 	}
1116 
1117 	return ret;
1118 }
1119 
1120 /*
1121  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1122  * provided page table.
1123  *
1124  * @kvm: kvm instance
1125  * @iter: a tdp_iter instance currently on the SPTE that should be set
1126  * @sp: The new TDP page table to install.
1127  * @shared: This operation is running under the MMU lock in read mode.
1128  *
1129  * Returns: 0 if the new page table was installed. Non-0 if the page table
1130  *          could not be installed (e.g. the atomic compare-exchange failed).
1131  */
1132 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1133 			   struct kvm_mmu_page *sp, bool shared)
1134 {
1135 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1136 	int ret = 0;
1137 
1138 	if (shared) {
1139 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1140 		if (ret)
1141 			return ret;
1142 	} else {
1143 		tdp_mmu_set_spte(kvm, iter, spte);
1144 	}
1145 
1146 	tdp_account_mmu_page(kvm, sp);
1147 
1148 	return 0;
1149 }
1150 
1151 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1152 				   struct kvm_mmu_page *sp, bool shared);
1153 
1154 /*
1155  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1156  * page tables and SPTEs to translate the faulting guest physical address.
1157  */
1158 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1159 {
1160 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1161 	struct kvm *kvm = vcpu->kvm;
1162 	struct tdp_iter iter;
1163 	struct kvm_mmu_page *sp;
1164 	int ret = RET_PF_RETRY;
1165 
1166 	kvm_mmu_hugepage_adjust(vcpu, fault);
1167 
1168 	trace_kvm_mmu_spte_requested(fault);
1169 
1170 	rcu_read_lock();
1171 
1172 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1173 		int r;
1174 
1175 		if (fault->nx_huge_page_workaround_enabled)
1176 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1177 
1178 		/*
1179 		 * If SPTE has been frozen by another thread, just give up and
1180 		 * retry, avoiding unnecessary page table allocation and free.
1181 		 */
1182 		if (is_removed_spte(iter.old_spte))
1183 			goto retry;
1184 
1185 		if (iter.level == fault->goal_level)
1186 			goto map_target_level;
1187 
1188 		/* Step down into the lower level page table if it exists. */
1189 		if (is_shadow_present_pte(iter.old_spte) &&
1190 		    !is_large_pte(iter.old_spte))
1191 			continue;
1192 
1193 		/*
1194 		 * The SPTE is either non-present or points to a huge page that
1195 		 * needs to be split.
1196 		 */
1197 		sp = tdp_mmu_alloc_sp(vcpu);
1198 		tdp_mmu_init_child_sp(sp, &iter);
1199 
1200 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1201 
1202 		if (is_shadow_present_pte(iter.old_spte))
1203 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1204 		else
1205 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1206 
1207 		/*
1208 		 * Force the guest to retry if installing an upper level SPTE
1209 		 * failed, e.g. because a different task modified the SPTE.
1210 		 */
1211 		if (r) {
1212 			tdp_mmu_free_sp(sp);
1213 			goto retry;
1214 		}
1215 
1216 		if (fault->huge_page_disallowed &&
1217 		    fault->req_level >= iter.level) {
1218 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1219 			if (sp->nx_huge_page_disallowed)
1220 				track_possible_nx_huge_page(kvm, sp);
1221 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1222 		}
1223 	}
1224 
1225 	/*
1226 	 * The walk aborted before reaching the target level, e.g. because the
1227 	 * iterator detected an upper level SPTE was frozen during traversal.
1228 	 */
1229 	WARN_ON_ONCE(iter.level == fault->goal_level);
1230 	goto retry;
1231 
1232 map_target_level:
1233 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1234 
1235 retry:
1236 	rcu_read_unlock();
1237 	return ret;
1238 }
1239 
1240 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1241 				 bool flush)
1242 {
1243 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1244 				     range->end, range->may_block, flush);
1245 }
1246 
1247 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1248 			      struct kvm_gfn_range *range);
1249 
1250 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1251 						   struct kvm_gfn_range *range,
1252 						   tdp_handler_t handler)
1253 {
1254 	struct kvm_mmu_page *root;
1255 	struct tdp_iter iter;
1256 	bool ret = false;
1257 
1258 	/*
1259 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1260 	 * into this helper allow blocking; it'd be dead, wasteful code.
1261 	 */
1262 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1263 		rcu_read_lock();
1264 
1265 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1266 			ret |= handler(kvm, &iter, range);
1267 
1268 		rcu_read_unlock();
1269 	}
1270 
1271 	return ret;
1272 }
1273 
1274 /*
1275  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1276  * if any of the GFNs in the range have been accessed.
1277  */
1278 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1279 			  struct kvm_gfn_range *range)
1280 {
1281 	u64 new_spte = 0;
1282 
1283 	/* If we have a non-accessed entry we don't need to change the pte. */
1284 	if (!is_accessed_spte(iter->old_spte))
1285 		return false;
1286 
1287 	new_spte = iter->old_spte;
1288 
1289 	if (spte_ad_enabled(new_spte)) {
1290 		new_spte &= ~shadow_accessed_mask;
1291 	} else {
1292 		/*
1293 		 * Capture the dirty status of the page, so that it doesn't get
1294 		 * lost when the SPTE is marked for access tracking.
1295 		 */
1296 		if (is_writable_pte(new_spte))
1297 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1298 
1299 		new_spte = mark_spte_for_access_track(new_spte);
1300 	}
1301 
1302 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1303 
1304 	return true;
1305 }
1306 
1307 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1308 {
1309 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1310 }
1311 
1312 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1313 			 struct kvm_gfn_range *range)
1314 {
1315 	return is_accessed_spte(iter->old_spte);
1316 }
1317 
1318 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1319 {
1320 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1321 }
1322 
1323 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1324 			 struct kvm_gfn_range *range)
1325 {
1326 	u64 new_spte;
1327 
1328 	/* Huge pages aren't expected to be modified without first being zapped. */
1329 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1330 
1331 	if (iter->level != PG_LEVEL_4K ||
1332 	    !is_shadow_present_pte(iter->old_spte))
1333 		return false;
1334 
1335 	/*
1336 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1337 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1338 	 * invariant that the PFN of a present * leaf SPTE can never change.
1339 	 * See __handle_changed_spte().
1340 	 */
1341 	tdp_mmu_set_spte(kvm, iter, 0);
1342 
1343 	if (!pte_write(range->pte)) {
1344 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1345 								  pte_pfn(range->pte));
1346 
1347 		tdp_mmu_set_spte(kvm, iter, new_spte);
1348 	}
1349 
1350 	return true;
1351 }
1352 
1353 /*
1354  * Handle the changed_pte MMU notifier for the TDP MMU.
1355  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1356  * notifier.
1357  * Returns non-zero if a flush is needed before releasing the MMU lock.
1358  */
1359 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1360 {
1361 	/*
1362 	 * No need to handle the remote TLB flush under RCU protection, the
1363 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1364 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1365 	 */
1366 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1367 }
1368 
1369 /*
1370  * Remove write access from all SPTEs at or above min_level that map GFNs
1371  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1372  * be flushed.
1373  */
1374 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1375 			     gfn_t start, gfn_t end, int min_level)
1376 {
1377 	struct tdp_iter iter;
1378 	u64 new_spte;
1379 	bool spte_set = false;
1380 
1381 	rcu_read_lock();
1382 
1383 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1384 
1385 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1386 retry:
1387 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1388 			continue;
1389 
1390 		if (!is_shadow_present_pte(iter.old_spte) ||
1391 		    !is_last_spte(iter.old_spte, iter.level) ||
1392 		    !(iter.old_spte & PT_WRITABLE_MASK))
1393 			continue;
1394 
1395 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1396 
1397 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1398 			goto retry;
1399 
1400 		spte_set = true;
1401 	}
1402 
1403 	rcu_read_unlock();
1404 	return spte_set;
1405 }
1406 
1407 /*
1408  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1409  * only affect leaf SPTEs down to min_level.
1410  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1411  */
1412 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1413 			     const struct kvm_memory_slot *slot, int min_level)
1414 {
1415 	struct kvm_mmu_page *root;
1416 	bool spte_set = false;
1417 
1418 	lockdep_assert_held_read(&kvm->mmu_lock);
1419 
1420 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1421 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1422 			     slot->base_gfn + slot->npages, min_level);
1423 
1424 	return spte_set;
1425 }
1426 
1427 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1428 {
1429 	struct kvm_mmu_page *sp;
1430 
1431 	gfp |= __GFP_ZERO;
1432 
1433 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1434 	if (!sp)
1435 		return NULL;
1436 
1437 	sp->spt = (void *)__get_free_page(gfp);
1438 	if (!sp->spt) {
1439 		kmem_cache_free(mmu_page_header_cache, sp);
1440 		return NULL;
1441 	}
1442 
1443 	return sp;
1444 }
1445 
1446 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1447 						       struct tdp_iter *iter,
1448 						       bool shared)
1449 {
1450 	struct kvm_mmu_page *sp;
1451 
1452 	/*
1453 	 * Since we are allocating while under the MMU lock we have to be
1454 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1455 	 * reclaim and to avoid making any filesystem callbacks (which can end
1456 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1457 	 *
1458 	 * If this allocation fails we drop the lock and retry with reclaim
1459 	 * allowed.
1460 	 */
1461 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1462 	if (sp)
1463 		return sp;
1464 
1465 	rcu_read_unlock();
1466 
1467 	if (shared)
1468 		read_unlock(&kvm->mmu_lock);
1469 	else
1470 		write_unlock(&kvm->mmu_lock);
1471 
1472 	iter->yielded = true;
1473 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1474 
1475 	if (shared)
1476 		read_lock(&kvm->mmu_lock);
1477 	else
1478 		write_lock(&kvm->mmu_lock);
1479 
1480 	rcu_read_lock();
1481 
1482 	return sp;
1483 }
1484 
1485 /* Note, the caller is responsible for initializing @sp. */
1486 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1487 				   struct kvm_mmu_page *sp, bool shared)
1488 {
1489 	const u64 huge_spte = iter->old_spte;
1490 	const int level = iter->level;
1491 	int ret, i;
1492 
1493 	/*
1494 	 * No need for atomics when writing to sp->spt since the page table has
1495 	 * not been linked in yet and thus is not reachable from any other CPU.
1496 	 */
1497 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1498 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1499 
1500 	/*
1501 	 * Replace the huge spte with a pointer to the populated lower level
1502 	 * page table. Since we are making this change without a TLB flush vCPUs
1503 	 * will see a mix of the split mappings and the original huge mapping,
1504 	 * depending on what's currently in their TLB. This is fine from a
1505 	 * correctness standpoint since the translation will be the same either
1506 	 * way.
1507 	 */
1508 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1509 	if (ret)
1510 		goto out;
1511 
1512 	/*
1513 	 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1514 	 * are overwriting from the page stats. But we have to manually update
1515 	 * the page stats with the new present child pages.
1516 	 */
1517 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1518 
1519 out:
1520 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1521 	return ret;
1522 }
1523 
1524 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1525 					 struct kvm_mmu_page *root,
1526 					 gfn_t start, gfn_t end,
1527 					 int target_level, bool shared)
1528 {
1529 	struct kvm_mmu_page *sp = NULL;
1530 	struct tdp_iter iter;
1531 	int ret = 0;
1532 
1533 	rcu_read_lock();
1534 
1535 	/*
1536 	 * Traverse the page table splitting all huge pages above the target
1537 	 * level into one lower level. For example, if we encounter a 1GB page
1538 	 * we split it into 512 2MB pages.
1539 	 *
1540 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1541 	 * to visit an SPTE before ever visiting its children, which means we
1542 	 * will correctly recursively split huge pages that are more than one
1543 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1544 	 * and then splitting each of those to 512 4KB pages).
1545 	 */
1546 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1547 retry:
1548 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1549 			continue;
1550 
1551 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1552 			continue;
1553 
1554 		if (!sp) {
1555 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1556 			if (!sp) {
1557 				ret = -ENOMEM;
1558 				trace_kvm_mmu_split_huge_page(iter.gfn,
1559 							      iter.old_spte,
1560 							      iter.level, ret);
1561 				break;
1562 			}
1563 
1564 			if (iter.yielded)
1565 				continue;
1566 		}
1567 
1568 		tdp_mmu_init_child_sp(sp, &iter);
1569 
1570 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1571 			goto retry;
1572 
1573 		sp = NULL;
1574 	}
1575 
1576 	rcu_read_unlock();
1577 
1578 	/*
1579 	 * It's possible to exit the loop having never used the last sp if, for
1580 	 * example, a vCPU doing HugePage NX splitting wins the race and
1581 	 * installs its own sp in place of the last sp we tried to split.
1582 	 */
1583 	if (sp)
1584 		tdp_mmu_free_sp(sp);
1585 
1586 	return ret;
1587 }
1588 
1589 
1590 /*
1591  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1592  */
1593 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1594 				      const struct kvm_memory_slot *slot,
1595 				      gfn_t start, gfn_t end,
1596 				      int target_level, bool shared)
1597 {
1598 	struct kvm_mmu_page *root;
1599 	int r = 0;
1600 
1601 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1602 
1603 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1604 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1605 		if (r) {
1606 			kvm_tdp_mmu_put_root(kvm, root, shared);
1607 			break;
1608 		}
1609 	}
1610 }
1611 
1612 /*
1613  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1614  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1615  * If AD bits are not enabled, this will require clearing the writable bit on
1616  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1617  * be flushed.
1618  */
1619 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1620 			   gfn_t start, gfn_t end)
1621 {
1622 	struct tdp_iter iter;
1623 	u64 new_spte;
1624 	bool spte_set = false;
1625 
1626 	rcu_read_lock();
1627 
1628 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1629 retry:
1630 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1631 			continue;
1632 
1633 		if (!is_shadow_present_pte(iter.old_spte))
1634 			continue;
1635 
1636 		if (spte_ad_need_write_protect(iter.old_spte)) {
1637 			if (is_writable_pte(iter.old_spte))
1638 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1639 			else
1640 				continue;
1641 		} else {
1642 			if (iter.old_spte & shadow_dirty_mask)
1643 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1644 			else
1645 				continue;
1646 		}
1647 
1648 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1649 			goto retry;
1650 
1651 		spte_set = true;
1652 	}
1653 
1654 	rcu_read_unlock();
1655 	return spte_set;
1656 }
1657 
1658 /*
1659  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1660  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1661  * If AD bits are not enabled, this will require clearing the writable bit on
1662  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1663  * be flushed.
1664  */
1665 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1666 				  const struct kvm_memory_slot *slot)
1667 {
1668 	struct kvm_mmu_page *root;
1669 	bool spte_set = false;
1670 
1671 	lockdep_assert_held_read(&kvm->mmu_lock);
1672 
1673 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1674 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1675 				slot->base_gfn + slot->npages);
1676 
1677 	return spte_set;
1678 }
1679 
1680 /*
1681  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1682  * set in mask, starting at gfn. The given memslot is expected to contain all
1683  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1684  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1685  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1686  */
1687 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1688 				  gfn_t gfn, unsigned long mask, bool wrprot)
1689 {
1690 	struct tdp_iter iter;
1691 	u64 new_spte;
1692 
1693 	rcu_read_lock();
1694 
1695 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1696 				    gfn + BITS_PER_LONG) {
1697 		if (!mask)
1698 			break;
1699 
1700 		if (iter.level > PG_LEVEL_4K ||
1701 		    !(mask & (1UL << (iter.gfn - gfn))))
1702 			continue;
1703 
1704 		mask &= ~(1UL << (iter.gfn - gfn));
1705 
1706 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1707 			if (is_writable_pte(iter.old_spte))
1708 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1709 			else
1710 				continue;
1711 		} else {
1712 			if (iter.old_spte & shadow_dirty_mask)
1713 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1714 			else
1715 				continue;
1716 		}
1717 
1718 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1719 	}
1720 
1721 	rcu_read_unlock();
1722 }
1723 
1724 /*
1725  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1726  * set in mask, starting at gfn. The given memslot is expected to contain all
1727  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1728  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1729  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1730  */
1731 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1732 				       struct kvm_memory_slot *slot,
1733 				       gfn_t gfn, unsigned long mask,
1734 				       bool wrprot)
1735 {
1736 	struct kvm_mmu_page *root;
1737 
1738 	lockdep_assert_held_write(&kvm->mmu_lock);
1739 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1740 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1741 }
1742 
1743 static void zap_collapsible_spte_range(struct kvm *kvm,
1744 				       struct kvm_mmu_page *root,
1745 				       const struct kvm_memory_slot *slot)
1746 {
1747 	gfn_t start = slot->base_gfn;
1748 	gfn_t end = start + slot->npages;
1749 	struct tdp_iter iter;
1750 	int max_mapping_level;
1751 
1752 	rcu_read_lock();
1753 
1754 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1755 retry:
1756 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1757 			continue;
1758 
1759 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1760 		    !is_shadow_present_pte(iter.old_spte))
1761 			continue;
1762 
1763 		/*
1764 		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1765 		 * a large page size, then its parent would have been zapped
1766 		 * instead of stepping down.
1767 		 */
1768 		if (is_last_spte(iter.old_spte, iter.level))
1769 			continue;
1770 
1771 		/*
1772 		 * If iter.gfn resides outside of the slot, i.e. the page for
1773 		 * the current level overlaps but is not contained by the slot,
1774 		 * then the SPTE can't be made huge.  More importantly, trying
1775 		 * to query that info from slot->arch.lpage_info will cause an
1776 		 * out-of-bounds access.
1777 		 */
1778 		if (iter.gfn < start || iter.gfn >= end)
1779 			continue;
1780 
1781 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1782 							      iter.gfn, PG_LEVEL_NUM);
1783 		if (max_mapping_level < iter.level)
1784 			continue;
1785 
1786 		/* Note, a successful atomic zap also does a remote TLB flush. */
1787 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1788 			goto retry;
1789 	}
1790 
1791 	rcu_read_unlock();
1792 }
1793 
1794 /*
1795  * Zap non-leaf SPTEs (and free their associated page tables) which could
1796  * be replaced by huge pages, for GFNs within the slot.
1797  */
1798 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1799 				       const struct kvm_memory_slot *slot)
1800 {
1801 	struct kvm_mmu_page *root;
1802 
1803 	lockdep_assert_held_read(&kvm->mmu_lock);
1804 
1805 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1806 		zap_collapsible_spte_range(kvm, root, slot);
1807 }
1808 
1809 /*
1810  * Removes write access on the last level SPTE mapping this GFN and unsets the
1811  * MMU-writable bit to ensure future writes continue to be intercepted.
1812  * Returns true if an SPTE was set and a TLB flush is needed.
1813  */
1814 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1815 			      gfn_t gfn, int min_level)
1816 {
1817 	struct tdp_iter iter;
1818 	u64 new_spte;
1819 	bool spte_set = false;
1820 
1821 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1822 
1823 	rcu_read_lock();
1824 
1825 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1826 		if (!is_shadow_present_pte(iter.old_spte) ||
1827 		    !is_last_spte(iter.old_spte, iter.level))
1828 			continue;
1829 
1830 		new_spte = iter.old_spte &
1831 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1832 
1833 		if (new_spte == iter.old_spte)
1834 			break;
1835 
1836 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1837 		spte_set = true;
1838 	}
1839 
1840 	rcu_read_unlock();
1841 
1842 	return spte_set;
1843 }
1844 
1845 /*
1846  * Removes write access on the last level SPTE mapping this GFN and unsets the
1847  * MMU-writable bit to ensure future writes continue to be intercepted.
1848  * Returns true if an SPTE was set and a TLB flush is needed.
1849  */
1850 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1851 				   struct kvm_memory_slot *slot, gfn_t gfn,
1852 				   int min_level)
1853 {
1854 	struct kvm_mmu_page *root;
1855 	bool spte_set = false;
1856 
1857 	lockdep_assert_held_write(&kvm->mmu_lock);
1858 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1859 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1860 
1861 	return spte_set;
1862 }
1863 
1864 /*
1865  * Return the level of the lowest level SPTE added to sptes.
1866  * That SPTE may be non-present.
1867  *
1868  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1869  */
1870 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1871 			 int *root_level)
1872 {
1873 	struct tdp_iter iter;
1874 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1875 	gfn_t gfn = addr >> PAGE_SHIFT;
1876 	int leaf = -1;
1877 
1878 	*root_level = vcpu->arch.mmu->root_role.level;
1879 
1880 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1881 		leaf = iter.level;
1882 		sptes[leaf] = iter.old_spte;
1883 	}
1884 
1885 	return leaf;
1886 }
1887 
1888 /*
1889  * Returns the last level spte pointer of the shadow page walk for the given
1890  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1891  * walk could be performed, returns NULL and *spte does not contain valid data.
1892  *
1893  * Contract:
1894  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1895  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1896  *
1897  * WARNING: This function is only intended to be called during fast_page_fault.
1898  */
1899 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1900 					u64 *spte)
1901 {
1902 	struct tdp_iter iter;
1903 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1904 	gfn_t gfn = addr >> PAGE_SHIFT;
1905 	tdp_ptep_t sptep = NULL;
1906 
1907 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1908 		*spte = iter.old_spte;
1909 		sptep = iter.sptep;
1910 	}
1911 
1912 	/*
1913 	 * Perform the rcu_dereference to get the raw spte pointer value since
1914 	 * we are passing it up to fast_page_fault, which is shared with the
1915 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1916 	 * annotation.
1917 	 *
1918 	 * This is safe since fast_page_fault obeys the contracts of this
1919 	 * function as well as all TDP MMU contracts around modifying SPTEs
1920 	 * outside of mmu_lock.
1921 	 */
1922 	return rcu_dereference(sptep);
1923 }
1924