xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 16b0314a)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
58 			  bool shared);
59 
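/* Free a TDP MMU shadow page: its page table page and its struct kvm_mmu_page. */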
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62 	free_page((unsigned long)sp->spt);
63 	kmem_cache_free(mmu_page_header_cache, sp);
64 }
65 
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
70  * By only accessing TDP MMU page table memory in an RCU read-side critical
71  * section, and freeing it only after a grace period, lockless accessors are
72  * guaranteed not to use the memory after it has been freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 					       rcu_head);
78 
79 	tdp_mmu_free_sp(sp);
80 }
81 
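/*
 * Drop a reference to a TDP MMU root. If this was the last reference, the
 * root is unlinked from the list of roots, its paging structure is zapped,
 * and the root page is freed via an RCU callback.
 */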
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 			  bool shared)
84 {
85 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86 
87 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 		return;
89 
90 	WARN_ON(!root->tdp_mmu_page);
91 
92 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 	list_del_rcu(&root->link);
94 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95 
96 	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97 
98 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100 
101 /*
102  * Finds the next valid root after prev_root (or the first valid root if
103  * prev_root is NULL), takes a reference on it, and returns that next root.
104  * If prev_root is not NULL, this thread should have already taken a
105  * reference on it, and that reference will be dropped. If no valid root is
106  * found, this function will return NULL.
107  */
108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
109 					      struct kvm_mmu_page *prev_root,
110 					      bool shared)
111 {
112 	struct kvm_mmu_page *next_root;
113 
114 	rcu_read_lock();
115 
116 	if (prev_root)
117 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
118 						  &prev_root->link,
119 						  typeof(*prev_root), link);
120 	else
121 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122 						   typeof(*next_root), link);
123 
124 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
125 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
126 				&next_root->link, typeof(*next_root), link);
127 
128 	rcu_read_unlock();
129 
130 	if (prev_root)
131 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
132 
133 	return next_root;
134 }
135 
136 /*
137  * Note: this iterator gets and puts references to the roots it iterates over.
138  * This makes it safe to release the MMU lock and yield within the loop, but
139  * if exiting the loop early, the caller must drop the reference to the most
140  * recent root. (Unless keeping a live reference is desirable.)
141  *
142  * If shared is set, this function is operating under the MMU lock in read
143  * mode. In the unlikely event that this thread must free a root, the lock
144  * will be temporarily dropped and reacquired in write mode.
145  */
146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
147 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
148 	     _root;							\
149 	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
150 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
151 		} else
152 
153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
154 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
155 				lockdep_is_held_type(&_kvm->mmu_lock, 0) ||	\
156 				lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock))	\
157 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
158 		} else
159 
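/*
 * Construct the page role for a TDP MMU page at the given level. All TDP MMU
 * pages are direct mappings with 8-byte PTEs and full access permissions.
 */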
160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
161 						   int level)
162 {
163 	union kvm_mmu_page_role role;
164 
165 	role = vcpu->arch.mmu->mmu_role.base;
166 	role.level = level;
167 	role.direct = true;
168 	role.gpte_is_8_bytes = true;
169 	role.access = ACC_ALL;
170 
171 	return role;
172 }
173 
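/*
 * Allocate a new TDP MMU shadow page and its page table page from the vCPU's
 * memory caches. The caller is responsible for installing the page in the
 * paging structure, and for linking non-root pages via tdp_mmu_link_page().
 */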
174 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
175 					       int level)
176 {
177 	struct kvm_mmu_page *sp;
178 
179 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
180 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
181 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
182 
183 	sp->role.word = page_role_for_level(vcpu, level).word;
184 	sp->gfn = gfn;
185 	sp->tdp_mmu_page = true;
186 
187 	trace_kvm_mmu_get_page(sp, true);
188 
189 	return sp;
190 }
191 
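/*
 * Get the physical address of the TDP MMU root for the vCPU's current MMU
 * role, reusing an existing root with a matching role if one can be found
 * and allocating a new one otherwise.
 */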
192 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
193 {
194 	union kvm_mmu_page_role role;
195 	struct kvm *kvm = vcpu->kvm;
196 	struct kvm_mmu_page *root;
197 
198 	lockdep_assert_held_write(&kvm->mmu_lock);
199 
200 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
201 
202 	/* Check for an existing root before allocating a new one. */
203 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
204 		if (root->role.word == role.word &&
205 		    kvm_tdp_mmu_get_root(kvm, root))
206 			goto out;
207 	}
208 
209 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
210 	refcount_set(&root->tdp_mmu_root_count, 1);
211 
212 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
213 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
214 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
215 
216 out:
217 	return __pa(root->spt);
218 }
219 
220 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
221 				u64 old_spte, u64 new_spte, int level,
222 				bool shared);
223 
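/*
 * Propagate accessed state to the primary MMU when a leaf SPTE that was
 * marked accessed is zapped, loses its accessed bit, or is changed to point
 * at a different PFN.
 */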
224 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
225 {
226 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
227 		return;
228 
229 	if (is_accessed_spte(old_spte) &&
230 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
231 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
232 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
233 }
234 
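/*
 * Mark the GFN dirty in its memslot's dirty bitmap when a 4K SPTE becomes
 * writable, either because a previously read-only SPTE gained write access
 * or because the SPTE now points at a different, writable PFN.
 */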
235 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
236 					  u64 old_spte, u64 new_spte, int level)
237 {
238 	bool pfn_changed;
239 	struct kvm_memory_slot *slot;
240 
241 	if (level > PG_LEVEL_4K)
242 		return;
243 
244 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
245 
246 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
247 	    is_writable_pte(new_spte)) {
248 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
249 		mark_page_dirty_in_slot(kvm, slot, gfn);
250 	}
251 }
252 
253 /**
254  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
255  *
256  * @kvm: kvm instance
257  * @sp: the new page
258  * @account_nx: This page replaces an NX large page and should be marked for
259  *		eventual reclaim.
260  */
261 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
262 			      bool account_nx)
263 {
264 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
265 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
266 	if (account_nx)
267 		account_huge_nx_page(kvm, sp);
268 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
269 }
270 
271 /**
272  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
273  *
274  * @kvm: kvm instance
275  * @sp: the page to be removed
276  * @shared: This operation may not be running under the exclusive use of
277  *	    the MMU lock and the operation must synchronize with other
278  *	    threads that might be adding or removing pages.
279  */
280 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
281 				bool shared)
282 {
283 	if (shared)
284 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
285 	else
286 		lockdep_assert_held_write(&kvm->mmu_lock);
287 
288 	list_del(&sp->link);
289 	if (sp->lpage_disallowed)
290 		unaccount_huge_nx_page(kvm, sp);
291 
292 	if (shared)
293 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
294 }
295 
296 /**
297  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
298  *
299  * @kvm: kvm instance
300  * @pt: the page removed from the paging structure
301  * @shared: This operation may not be running under the exclusive use
302  *	    of the MMU lock and the operation must synchronize with other
303  *	    threads that might be modifying SPTEs.
304  *
305  * Given a page table that has been removed from the TDP paging structure,
306  * iterates through the page table to clear SPTEs and free child page tables.
307  *
308  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
309  * protection. Since this thread removed it from the paging structure,
310  * this thread will be responsible for ensuring the page is freed. Hence the
311  * early rcu_dereferences in the function.
312  */
313 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
314 					bool shared)
315 {
316 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
317 	int level = sp->role.level;
318 	gfn_t base_gfn = sp->gfn;
319 	u64 old_child_spte;
320 	u64 *sptep;
321 	gfn_t gfn;
322 	int i;
323 
324 	trace_kvm_mmu_prepare_zap_page(sp);
325 
326 	tdp_mmu_unlink_page(kvm, sp, shared);
327 
328 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
329 		sptep = rcu_dereference(pt) + i;
330 		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
331 
332 		if (shared) {
333 			/*
334 			 * Set the SPTE to a nonpresent value that other
335 			 * threads will not overwrite. If the SPTE was
336 			 * already marked as removed then another thread
337 			 * handling a page fault could overwrite it, so
338 			 * retry the exchange until the SPTE changes from some
339 			 * other value to the removed SPTE value.
340 			 */
341 			for (;;) {
342 				old_child_spte = xchg(sptep, REMOVED_SPTE);
343 				if (!is_removed_spte(old_child_spte))
344 					break;
345 				cpu_relax();
346 			}
347 		} else {
348 			/*
349 			 * If the SPTE is not MMU-present, there is no backing
350 			 * page associated with the SPTE and so no side effects
351 			 * that need to be recorded, and exclusive ownership of
352 			 * mmu_lock ensures the SPTE can't be made present.
353 			 * Note, zapping MMIO SPTEs is also unnecessary as they
354 			 * are guarded by the memslots generation, not by being
355 			 * unreachable.
356 			 */
357 			old_child_spte = READ_ONCE(*sptep);
358 			if (!is_shadow_present_pte(old_child_spte))
359 				continue;
360 
361 			/*
362 			 * Marking the SPTE as a removed SPTE is not
363 			 * strictly necessary here as the MMU lock will
364 			 * stop other threads from concurrently modifying
365 			 * this SPTE. Using the removed SPTE value keeps
366 			 * the two branches consistent and simplifies
367 			 * the function.
368 			 */
369 			WRITE_ONCE(*sptep, REMOVED_SPTE);
370 		}
371 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
372 				    old_child_spte, REMOVED_SPTE, level,
373 				    shared);
374 	}
375 
376 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
377 					   KVM_PAGES_PER_HPAGE(level + 1));
378 
379 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
380 }
381 
382 /**
383  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
384  * @kvm: kvm instance
385  * @as_id: the address space of the paging structure the SPTE was a part of
386  * @gfn: the base GFN that was mapped by the SPTE
387  * @old_spte: The value of the SPTE before the change
388  * @new_spte: The value of the SPTE after the change
389  * @level: the level of the PT the SPTE is part of in the paging structure
390  * @shared: This operation may not be running under the exclusive use of
391  *	    the MMU lock and the operation must synchronize with other
392  *	    threads that might be modifying SPTEs.
393  *
394  * Handle bookkeeping that might result from the modification of a SPTE.
395  * This function must be called for all TDP SPTE modifications.
396  */
397 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
398 				  u64 old_spte, u64 new_spte, int level,
399 				  bool shared)
400 {
401 	bool was_present = is_shadow_present_pte(old_spte);
402 	bool is_present = is_shadow_present_pte(new_spte);
403 	bool was_leaf = was_present && is_last_spte(old_spte, level);
404 	bool is_leaf = is_present && is_last_spte(new_spte, level);
405 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
406 
407 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
408 	WARN_ON(level < PG_LEVEL_4K);
409 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
410 
411 	/*
412 	 * If this warning were to trigger it would indicate that there was a
413 	 * missing MMU notifier or a race with some notifier handler.
414 	 * A present, leaf SPTE should never be directly replaced with another
415 	 * present leaf SPTE pointing to a different PFN. A notifier handler
416 	 * should be zapping the SPTE before the main MM's page table is
417 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
418 	 * thread before replacement.
419 	 */
420 	if (was_leaf && is_leaf && pfn_changed) {
421 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
422 		       "SPTE with another present leaf SPTE mapping a\n"
423 		       "different PFN!\n"
424 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
425 		       as_id, gfn, old_spte, new_spte, level);
426 
427 		/*
428 		 * Crash the host to prevent error propagation and guest data
429 		 * corruption.
430 		 */
431 		BUG();
432 	}
433 
434 	if (old_spte == new_spte)
435 		return;
436 
437 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
438 
439 	/*
440 	 * The only time a SPTE should be changed from a non-present to
441 	 * non-present state is when an MMIO entry is installed/modified/
442 	 * removed. In that case, there is nothing to do here.
443 	 */
444 	if (!was_present && !is_present) {
445 		/*
446 		 * If this change does not involve a MMIO SPTE or removed SPTE,
447 		 * it is unexpected. Log the change, though it should not
448 		 * impact the guest since both the former and current SPTEs
449 		 * are nonpresent.
450 		 */
451 		if (WARN_ON(!is_mmio_spte(old_spte) &&
452 			    !is_mmio_spte(new_spte) &&
453 			    !is_removed_spte(new_spte)))
454 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
455 			       "should not be replaced with another,\n"
456 			       "different nonpresent SPTE, unless one or both\n"
457 			       "are MMIO SPTEs, or the new SPTE is\n"
458 			       "a temporary removed SPTE.\n"
459 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
460 			       as_id, gfn, old_spte, new_spte, level);
461 		return;
462 	}
463 
464 	if (is_leaf != was_leaf)
465 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
466 
467 	if (was_leaf && is_dirty_spte(old_spte) &&
468 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
469 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
470 
471 	/*
472 	 * Recursively handle child PTs if the change removed a subtree from
473 	 * the paging structure.
474 	 */
475 	if (was_present && !was_leaf && (pfn_changed || !is_present))
476 		handle_removed_tdp_mmu_page(kvm,
477 				spte_to_child_pt(old_spte, level), shared);
478 }
479 
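/*
 * Wrapper around __handle_changed_spte() that also performs the accessed and
 * dirty bookkeeping. Used when neither form of tracking needs to be skipped.
 */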
480 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
481 				u64 old_spte, u64 new_spte, int level,
482 				bool shared)
483 {
484 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
485 			      shared);
486 	handle_changed_spte_acc_track(old_spte, new_spte, level);
487 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
488 				      new_spte, level);
489 }
490 
491 /*
492  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
493  * and handle the associated bookkeeping, but do not mark the page dirty
494  * in KVM's dirty bitmaps.
495  *
496  * @kvm: kvm instance
497  * @iter: a tdp_iter instance currently on the SPTE that should be set
498  * @new_spte: The value the SPTE should be set to
499  * Returns: true if the SPTE was set, false if it was not. If false is returned,
500  *	    this function will have no side-effects.
501  */
502 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
503 							struct tdp_iter *iter,
504 							u64 new_spte)
505 {
506 	lockdep_assert_held_read(&kvm->mmu_lock);
507 
508 	/*
509 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
510 	 * may modify it.
511 	 */
512 	if (is_removed_spte(iter->old_spte))
513 		return false;
514 
515 	/*
516 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
517 	 * does not hold the mmu_lock.
518 	 */
519 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
520 		      new_spte) != iter->old_spte)
521 		return false;
522 
523 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
524 			      new_spte, iter->level, true);
525 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
526 
527 	return true;
528 }
529 
530 /*
531  * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
532  * TDP page fault.
533  *
534  * @vcpu: The vcpu instance that took the TDP page fault.
535  * @iter: a tdp_iter instance currently on the SPTE that should be set
536  * @new_spte: The value the SPTE should be set to
537  *
538  * Returns: true if the SPTE was set, false if it was not. If false is returned,
539  *	    this function will have no side-effects.
540  */
541 static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
542 					       struct tdp_iter *iter,
543 					       u64 new_spte)
544 {
545 	struct kvm *kvm = vcpu->kvm;
546 
547 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
548 		return false;
549 
550 	/*
551 	 * Use kvm_vcpu_gfn_to_memslot() instead of going through
552 	 * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
553 	 */
554 	if (is_writable_pte(new_spte)) {
555 		struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
556 
557 		if (slot && kvm_slot_dirty_track_enabled(slot)) {
558 			/* Enforced by kvm_mmu_hugepage_adjust. */
559 			WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
560 			mark_page_dirty_in_slot(kvm, slot, iter->gfn);
561 		}
562 	}
563 
564 	return true;
565 }
566 
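/*
 * Atomically zap an SPTE while holding the MMU lock in read mode: freeze the
 * SPTE with the special removed value, flush remote TLBs for the range it
 * mapped, then clear it. Returns false if another thread modified or froze
 * the SPTE first.
 */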
567 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
568 					   struct tdp_iter *iter)
569 {
570 	/*
571 	 * Freeze the SPTE by setting it to a special,
572 	 * non-present value. This will stop other threads from
573 	 * immediately installing a present entry in its place
574 	 * before the TLBs are flushed.
575 	 */
576 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
577 		return false;
578 
579 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
580 					   KVM_PAGES_PER_HPAGE(iter->level));
581 
582 	/*
583 	 * No other thread can overwrite the removed SPTE as they
584 	 * must either wait on the MMU lock or use
585 	 * tdp_mmu_set_spte_atomic which will not overwrite the
586 	 * special removed SPTE value. No bookkeeping is needed
587 	 * here since the SPTE is going from non-present
588 	 * to non-present.
589 	 */
590 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
591 
592 	return true;
593 }
594 
595 
596 /*
597  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
598  * @kvm: kvm instance
599  * @iter: a tdp_iter instance currently on the SPTE that should be set
600  * @new_spte: The value the SPTE should be set to
601  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
602  *		      of the page. Should be set unless handling an MMU
603  *		      notifier for access tracking. Leaving record_acc_track
604  *		      unset in that case prevents page accesses from being
605  *		      double counted.
606  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
607  *		      appropriate for the change being made. Should be set
608  *		      unless performing certain dirty logging operations.
609  *		      Leaving record_dirty_log unset in that case prevents page
610  *		      writes from being double counted.
611  */
612 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
613 				      u64 new_spte, bool record_acc_track,
614 				      bool record_dirty_log)
615 {
616 	lockdep_assert_held_write(&kvm->mmu_lock);
617 
618 	/*
619 	 * No thread should be using this function to set SPTEs to the
620 	 * temporary removed SPTE value.
621 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
622 	 * should be used. If operating under the MMU lock in write mode, the
623 	 * use of the removed SPTE should not be necessary.
624 	 */
625 	WARN_ON(is_removed_spte(iter->old_spte));
626 
627 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
628 
629 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
630 			      new_spte, iter->level, false);
631 	if (record_acc_track)
632 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
633 					      iter->level);
634 	if (record_dirty_log)
635 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
636 					      iter->old_spte, new_spte,
637 					      iter->level);
638 }
639 
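/*
 * Convenience wrappers around __tdp_mmu_set_spte(); see the comment above for
 * the meaning of record_acc_track and record_dirty_log.
 */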
640 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
641 				    u64 new_spte)
642 {
643 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
644 }
645 
646 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
647 						 struct tdp_iter *iter,
648 						 u64 new_spte)
649 {
650 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
651 }
652 
653 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
654 						 struct tdp_iter *iter,
655 						 u64 new_spte)
656 {
657 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
658 }
659 
660 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
661 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
662 
663 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
664 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
665 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
666 		    !is_last_spte(_iter.old_spte, _iter.level))		\
667 			continue;					\
668 		else
669 
670 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
671 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
672 			 _mmu->shadow_root_level, _start, _end)
673 
674 /*
675  * Yield if the MMU lock is contended or this thread needs to return control
676  * to the scheduler.
677  *
678  * If this function should yield and flush is set, it will perform a remote
679  * TLB flush before yielding.
680  *
681  * If this function yields, it will also reset the tdp_iter's walk over the
682  * paging structure and the calling function should skip to the next
683  * iteration to allow the iterator to continue its traversal from the
684  * paging structure root.
685  *
686  * Return true if this function yielded and the iterator's traversal was reset.
687  * Return false if a yield was not needed.
688  */
689 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
690 					     struct tdp_iter *iter, bool flush,
691 					     bool shared)
692 {
693 	/* Ensure forward progress has been made before yielding. */
694 	if (iter->next_last_level_gfn == iter->yielded_gfn)
695 		return false;
696 
697 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
698 		rcu_read_unlock();
699 
700 		if (flush)
701 			kvm_flush_remote_tlbs(kvm);
702 
703 		if (shared)
704 			cond_resched_rwlock_read(&kvm->mmu_lock);
705 		else
706 			cond_resched_rwlock_write(&kvm->mmu_lock);
707 
708 		rcu_read_lock();
709 
710 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
711 
712 		tdp_iter_restart(iter);
713 
714 		return true;
715 	}
716 
717 	return false;
718 }
719 
720 /*
721  * Tears down the mappings for the range of gfns, [start, end), and frees the
722  * non-root pages mapping GFNs strictly within that range. Returns true if
723  * SPTEs have been cleared and a TLB flush is needed before releasing the
724  * MMU lock.
725  *
726  * If can_yield is true, will release the MMU lock and reschedule if the
727  * scheduler needs the CPU or there is contention on the MMU lock. If this
728  * function cannot yield, it will not release the MMU lock or reschedule and
729  * the caller must ensure it does not supply too large a GFN range, or the
730  * operation can cause a soft lockup.
731  *
732  * If shared is true, this thread holds the MMU lock in read mode and must
733  * account for the possibility that other threads are modifying the paging
734  * structures concurrently. If shared is false, this thread should hold the
735  * MMU lock in write mode.
736  */
737 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
738 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
739 			  bool shared)
740 {
741 	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
742 	bool zap_all = (start == 0 && end >= max_gfn_host);
743 	struct tdp_iter iter;
744 
745 	/*
746 	 * No need to try to step down in the iterator when zapping all SPTEs,
747 	 * zapping the top-level non-leaf SPTEs will recurse on their children.
748 	 */
749 	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
750 
751 	/*
752 	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
753 	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
754 	 * and so KVM will never install a SPTE for such addresses.
755 	 */
756 	end = min(end, max_gfn_host);
757 
758 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
759 
760 	rcu_read_lock();
761 
762 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
763 				   min_level, start, end) {
764 retry:
765 		if (can_yield &&
766 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
767 			flush = false;
768 			continue;
769 		}
770 
771 		if (!is_shadow_present_pte(iter.old_spte))
772 			continue;
773 
774 		/*
775 		 * If this is a non-last-level SPTE that covers a larger range
776 		 * than should be zapped, continue, and zap the mappings at a
777 		 * lower level, except when zapping all SPTEs.
778 		 */
779 		if (!zap_all &&
780 		    (iter.gfn < start ||
781 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
782 		    !is_last_spte(iter.old_spte, iter.level))
783 			continue;
784 
785 		if (!shared) {
786 			tdp_mmu_set_spte(kvm, &iter, 0);
787 			flush = true;
788 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
789 			/*
790 			 * The iter must explicitly re-read the SPTE because
791 			 * the atomic cmpxchg failed.
792 			 */
793 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
794 			goto retry;
795 		}
796 	}
797 
798 	rcu_read_unlock();
799 	return flush;
800 }
801 
802 /*
803  * Tears down the mappings for the range of gfns, [start, end), and frees the
804  * non-root pages mapping GFNs strictly within that range. Returns true if
805  * SPTEs have been cleared and a TLB flush is needed before releasing the
806  * MMU lock.
807  */
808 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
809 				 gfn_t end, bool can_yield, bool flush)
810 {
811 	struct kvm_mmu_page *root;
812 
813 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
814 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
815 				      false);
816 
817 	return flush;
818 }
819 
820 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
821 {
822 	bool flush = false;
823 	int i;
824 
825 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
826 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
827 
828 	if (flush)
829 		kvm_flush_remote_tlbs(kvm);
830 }
831 
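/*
 * Find the next root after prev_root (or the first root if prev_root is
 * NULL) that is invalid but still has a non-zero refcount, i.e. a root that
 * kvm_tdp_mmu_invalidate_all_roots() took a reference on and that still
 * needs to be zapped.
 */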
832 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
833 						  struct kvm_mmu_page *prev_root)
834 {
835 	struct kvm_mmu_page *next_root;
836 
837 	if (prev_root)
838 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
839 						  &prev_root->link,
840 						  typeof(*prev_root), link);
841 	else
842 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
843 						   typeof(*next_root), link);
844 
845 	while (next_root && !(next_root->role.invalid &&
846 			      refcount_read(&next_root->tdp_mmu_root_count)))
847 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
848 						  &next_root->link,
849 						  typeof(*next_root), link);
850 
851 	return next_root;
852 }
853 
854 /*
855  * Since kvm_tdp_mmu_invalidate_all_roots has acquired a reference to each
856  * invalidated root, they will not be freed until this function drops the
857  * reference. Before dropping that reference, tear down the paging
858  * structure so that whichever thread does drop the last reference
859  * only has to do a trivial amount of work. Since the roots are invalid,
860  * no new SPTEs should be created under them.
861  */
862 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
863 {
864 	struct kvm_mmu_page *next_root;
865 	struct kvm_mmu_page *root;
866 	bool flush = false;
867 
868 	lockdep_assert_held_read(&kvm->mmu_lock);
869 
870 	rcu_read_lock();
871 
872 	root = next_invalidated_root(kvm, NULL);
873 
874 	while (root) {
875 		next_root = next_invalidated_root(kvm, root);
876 
877 		rcu_read_unlock();
878 
879 		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
880 
881 		/*
882 		 * Put the reference acquired in
883 		 * kvm_tdp_mmu_invalidate_roots
884 		 */
885 		kvm_tdp_mmu_put_root(kvm, root, true);
886 
887 		root = next_root;
888 
889 		rcu_read_lock();
890 	}
891 
892 	rcu_read_unlock();
893 
894 	if (flush)
895 		kvm_flush_remote_tlbs(kvm);
896 }
897 
898 /*
899  * Mark each TDP MMU root as invalid so that other threads
900  * will drop their references and allow the root count to
901  * go to 0.
902  *
903  * Also take a reference on all roots so that this thread
904  * can do the bulk of the work required to free the roots
905  * once they are invalidated. Without this reference, a
906  * vCPU thread might drop the last reference to a root and
907  * get stuck with tearing down the entire paging structure.
908  *
909  * Roots which have a zero refcount should be skipped as
910  * they're already being torn down.
911  * Already invalid roots should be referenced again so that
912  * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots
913  * is done with them.
914  *
915  * This has essentially the same effect for the TDP MMU
916  * as updating mmu_valid_gen does for the shadow MMU.
917  */
918 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
919 {
920 	struct kvm_mmu_page *root;
921 
922 	lockdep_assert_held_write(&kvm->mmu_lock);
923 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
924 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
925 			root->role.invalid = true;
926 }
927 
928 /*
929  * Installs a last-level SPTE to handle a TDP page fault.
930  * (NPT/EPT violation/misconfiguration)
931  */
932 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
933 					  int map_writable,
934 					  struct tdp_iter *iter,
935 					  kvm_pfn_t pfn, bool prefault)
936 {
937 	u64 new_spte;
938 	int ret = RET_PF_FIXED;
939 	int make_spte_ret = 0;
940 
941 	if (unlikely(is_noslot_pfn(pfn)))
942 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
943 	else
944 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
945 					 pfn, iter->old_spte, prefault, true,
946 					 map_writable, !shadow_accessed_mask,
947 					 &new_spte);
948 
949 	if (new_spte == iter->old_spte)
950 		ret = RET_PF_SPURIOUS;
951 	else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
952 		return RET_PF_RETRY;
953 
954 	/*
955 	 * If the page fault was caused by a write but the page is write
956 	 * protected, emulation is needed. If the emulation was skipped,
957 	 * the vCPU would have the same fault again.
958 	 */
959 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
960 		if (write)
961 			ret = RET_PF_EMULATE;
962 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
963 	}
964 
965 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
966 	if (unlikely(is_mmio_spte(new_spte))) {
967 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
968 				     new_spte);
969 		ret = RET_PF_EMULATE;
970 	} else {
971 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
972 				       rcu_dereference(iter->sptep));
973 	}
974 
975 	/*
976 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
977 	 * consistent with legacy MMU behavior.
978 	 */
979 	if (ret != RET_PF_SPURIOUS)
980 		vcpu->stat.pf_fixed++;
981 
982 	return ret;
983 }
984 
985 /*
986  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
987  * page tables and SPTEs to translate the faulting guest physical address.
988  */
989 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
990 		    int map_writable, int max_level, kvm_pfn_t pfn,
991 		    bool prefault)
992 {
993 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
994 	bool write = error_code & PFERR_WRITE_MASK;
995 	bool exec = error_code & PFERR_FETCH_MASK;
996 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
997 	struct kvm_mmu *mmu = vcpu->arch.mmu;
998 	struct tdp_iter iter;
999 	struct kvm_mmu_page *sp;
1000 	u64 *child_pt;
1001 	u64 new_spte;
1002 	int ret;
1003 	gfn_t gfn = gpa >> PAGE_SHIFT;
1004 	int level;
1005 	int req_level;
1006 
1007 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
1008 					huge_page_disallowed, &req_level);
1009 
1010 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
1011 
1012 	rcu_read_lock();
1013 
1014 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1015 		if (nx_huge_page_workaround_enabled)
1016 			disallowed_hugepage_adjust(iter.old_spte, gfn,
1017 						   iter.level, &pfn, &level);
1018 
1019 		if (iter.level == level)
1020 			break;
1021 
1022 		/*
1023 		 * If there is an SPTE mapping a large page at a higher level
1024 		 * than the target, that SPTE must be cleared and replaced
1025 		 * with a non-leaf SPTE.
1026 		 */
1027 		if (is_shadow_present_pte(iter.old_spte) &&
1028 		    is_large_pte(iter.old_spte)) {
1029 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1030 				break;
1031 
1032 			/*
1033 			 * The iter must explicitly re-read the spte here
1034 			 * because the new value informs the !present
1035 			 * path below.
1036 			 */
1037 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1038 		}
1039 
1040 		if (!is_shadow_present_pte(iter.old_spte)) {
1041 			/*
1042 			 * If SPTE has been frozen by another thread, just
1043 			 * give up and retry, avoiding unnecessary page table
1044 			 * allocation and free.
1045 			 */
1046 			if (is_removed_spte(iter.old_spte))
1047 				break;
1048 
1049 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1050 			child_pt = sp->spt;
1051 
1052 			new_spte = make_nonleaf_spte(child_pt,
1053 						     !shadow_accessed_mask);
1054 
1055 			if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
1056 				tdp_mmu_link_page(vcpu->kvm, sp,
1057 						  huge_page_disallowed &&
1058 						  req_level >= iter.level);
1059 
1060 				trace_kvm_mmu_get_page(sp, true);
1061 			} else {
1062 				tdp_mmu_free_sp(sp);
1063 				break;
1064 			}
1065 		}
1066 	}
1067 
1068 	if (iter.level != level) {
1069 		rcu_read_unlock();
1070 		return RET_PF_RETRY;
1071 	}
1072 
1073 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1074 					      pfn, prefault);
1075 	rcu_read_unlock();
1076 
1077 	return ret;
1078 }
1079 
1080 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1081 				 bool flush)
1082 {
1083 	struct kvm_mmu_page *root;
1084 
1085 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1086 		flush |= zap_gfn_range(kvm, root, range->start, range->end,
1087 				       range->may_block, flush, false);
1088 
1089 	return flush;
1090 }
1091 
1092 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1093 			      struct kvm_gfn_range *range);
1094 
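/*
 * Invoke the handler on every present leaf SPTE mapping a GFN in
 * [range->start, range->end), across all TDP MMU roots in the range's
 * address space. Returns true if any invocation of the handler returned true.
 */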
1095 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1096 						   struct kvm_gfn_range *range,
1097 						   tdp_handler_t handler)
1098 {
1099 	struct kvm_mmu_page *root;
1100 	struct tdp_iter iter;
1101 	bool ret = false;
1102 
1103 	rcu_read_lock();
1104 
1105 	/*
1106 	 * Don't support rescheduling; none of the MMU notifiers that funnel
1107 	 * into this helper allow blocking, so it'd be dead, wasteful code.
1108 	 */
1109 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1110 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1111 			ret |= handler(kvm, &iter, range);
1112 	}
1113 
1114 	rcu_read_unlock();
1115 
1116 	return ret;
1117 }
1118 
1119 /*
1120  * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1121  * if any of the GFNs in the range have been accessed.
1122  */
1123 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1124 			  struct kvm_gfn_range *range)
1125 {
1126 	u64 new_spte = 0;
1127 
1128 	/* If we have a non-accessed entry we don't need to change the pte. */
1129 	if (!is_accessed_spte(iter->old_spte))
1130 		return false;
1131 
1132 	new_spte = iter->old_spte;
1133 
1134 	if (spte_ad_enabled(new_spte)) {
1135 		new_spte &= ~shadow_accessed_mask;
1136 	} else {
1137 		/*
1138 		 * Capture the dirty status of the page, so that it doesn't get
1139 		 * lost when the SPTE is marked for access tracking.
1140 		 */
1141 		if (is_writable_pte(new_spte))
1142 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1143 
1144 		new_spte = mark_spte_for_access_track(new_spte);
1145 	}
1146 
1147 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1148 
1149 	return true;
1150 }
1151 
1152 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1153 {
1154 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1155 }
1156 
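/* Check whether the GFN's SPTE has been accessed, without clearing the bit. */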
1157 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1158 			 struct kvm_gfn_range *range)
1159 {
1160 	return is_accessed_spte(iter->old_spte);
1161 }
1162 
1163 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1164 {
1165 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1166 }
1167 
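/*
 * Handle the change_pte notifier for a single 4K GFN: zap the existing SPTE
 * and, if the new host PTE is read-only, install a new SPTE pointing at the
 * new PFN.
 */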
1168 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1169 			 struct kvm_gfn_range *range)
1170 {
1171 	u64 new_spte;
1172 
1173 	/* Huge pages aren't expected to be modified without first being zapped. */
1174 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1175 
1176 	if (iter->level != PG_LEVEL_4K ||
1177 	    !is_shadow_present_pte(iter->old_spte))
1178 		return false;
1179 
1180 	/*
1181 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1182 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1183 	 * invariant that the PFN of a present leaf SPTE can never change.
1184 	 * See __handle_changed_spte().
1185 	 */
1186 	tdp_mmu_set_spte(kvm, iter, 0);
1187 
1188 	if (!pte_write(range->pte)) {
1189 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1190 								  pte_pfn(range->pte));
1191 
1192 		tdp_mmu_set_spte(kvm, iter, new_spte);
1193 	}
1194 
1195 	return true;
1196 }
1197 
1198 /*
1199  * Handle the changed_pte MMU notifier for the TDP MMU.
1200  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1201  * notifier.
1202  * Returns non-zero if a flush is needed before releasing the MMU lock.
1203  */
1204 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1205 {
1206 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1207 
1208 	/* FIXME: return 'flush' instead of flushing here. */
1209 	if (flush)
1210 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1211 
1212 	return false;
1213 }
1214 
1215 /*
1216  * Remove write access from all SPTEs at or above min_level that map GFNs
1217  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1218  * be flushed.
1219  */
1220 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1221 			     gfn_t start, gfn_t end, int min_level)
1222 {
1223 	struct tdp_iter iter;
1224 	u64 new_spte;
1225 	bool spte_set = false;
1226 
1227 	rcu_read_lock();
1228 
1229 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1230 
1231 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1232 				   min_level, start, end) {
1233 retry:
1234 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1235 			continue;
1236 
1237 		if (!is_shadow_present_pte(iter.old_spte) ||
1238 		    !is_last_spte(iter.old_spte, iter.level) ||
1239 		    !(iter.old_spte & PT_WRITABLE_MASK))
1240 			continue;
1241 
1242 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1243 
1244 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1245 							  new_spte)) {
1246 			/*
1247 			 * The iter must explicitly re-read the SPTE because
1248 			 * the atomic cmpxchg failed.
1249 			 */
1250 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1251 			goto retry;
1252 		}
1253 		spte_set = true;
1254 	}
1255 
1256 	rcu_read_unlock();
1257 	return spte_set;
1258 }
1259 
1260 /*
1261  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1262  * only affect leaf SPTEs down to min_level.
1263  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1264  */
1265 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1266 			     const struct kvm_memory_slot *slot, int min_level)
1267 {
1268 	struct kvm_mmu_page *root;
1269 	bool spte_set = false;
1270 
1271 	lockdep_assert_held_read(&kvm->mmu_lock);
1272 
1273 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1274 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1275 			     slot->base_gfn + slot->npages, min_level);
1276 
1277 	return spte_set;
1278 }
1279 
1280 /*
1281  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1282  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1283  * If AD bits are not enabled, this will require clearing the writable bit on
1284  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1285  * be flushed.
1286  */
1287 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1288 			   gfn_t start, gfn_t end)
1289 {
1290 	struct tdp_iter iter;
1291 	u64 new_spte;
1292 	bool spte_set = false;
1293 
1294 	rcu_read_lock();
1295 
1296 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1297 retry:
1298 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1299 			continue;
1300 
1301 		if (spte_ad_need_write_protect(iter.old_spte)) {
1302 			if (is_writable_pte(iter.old_spte))
1303 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1304 			else
1305 				continue;
1306 		} else {
1307 			if (iter.old_spte & shadow_dirty_mask)
1308 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1309 			else
1310 				continue;
1311 		}
1312 
1313 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1314 							  new_spte)) {
1315 			/*
1316 			 * The iter must explicitly re-read the SPTE because
1317 			 * the atomic cmpxchg failed.
1318 			 */
1319 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1320 			goto retry;
1321 		}
1322 		spte_set = true;
1323 	}
1324 
1325 	rcu_read_unlock();
1326 	return spte_set;
1327 }
1328 
1329 /*
1330  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1331  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1332  * If AD bits are not enabled, this will require clearing the writable bit on
1333  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1334  * be flushed.
1335  */
1336 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1337 				  const struct kvm_memory_slot *slot)
1338 {
1339 	struct kvm_mmu_page *root;
1340 	bool spte_set = false;
1341 
1342 	lockdep_assert_held_read(&kvm->mmu_lock);
1343 
1344 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1345 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1346 				slot->base_gfn + slot->npages);
1347 
1348 	return spte_set;
1349 }
1350 
1351 /*
1352  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1353  * set in mask, starting at gfn. The given memslot is expected to contain all
1354  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1355  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1356  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1357  */
1358 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1359 				  gfn_t gfn, unsigned long mask, bool wrprot)
1360 {
1361 	struct tdp_iter iter;
1362 	u64 new_spte;
1363 
1364 	rcu_read_lock();
1365 
1366 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1367 				    gfn + BITS_PER_LONG) {
1368 		if (!mask)
1369 			break;
1370 
1371 		if (iter.level > PG_LEVEL_4K ||
1372 		    !(mask & (1UL << (iter.gfn - gfn))))
1373 			continue;
1374 
1375 		mask &= ~(1UL << (iter.gfn - gfn));
1376 
1377 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1378 			if (is_writable_pte(iter.old_spte))
1379 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1380 			else
1381 				continue;
1382 		} else {
1383 			if (iter.old_spte & shadow_dirty_mask)
1384 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1385 			else
1386 				continue;
1387 		}
1388 
1389 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1390 	}
1391 
1392 	rcu_read_unlock();
1393 }
1394 
1395 /*
1396  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1397  * set in mask, starting at gfn. The given memslot is expected to contain all
1398  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1399  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1400  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1401  */
1402 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1403 				       struct kvm_memory_slot *slot,
1404 				       gfn_t gfn, unsigned long mask,
1405 				       bool wrprot)
1406 {
1407 	struct kvm_mmu_page *root;
1408 
1409 	lockdep_assert_held_write(&kvm->mmu_lock);
1410 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1411 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1412 }
1413 
1414 /*
1415  * Clear leaf entries which could be replaced by large mappings, for
1416  * GFNs within the slot.
1417  */
1418 static bool zap_collapsible_spte_range(struct kvm *kvm,
1419 				       struct kvm_mmu_page *root,
1420 				       const struct kvm_memory_slot *slot,
1421 				       bool flush)
1422 {
1423 	gfn_t start = slot->base_gfn;
1424 	gfn_t end = start + slot->npages;
1425 	struct tdp_iter iter;
1426 	kvm_pfn_t pfn;
1427 
1428 	rcu_read_lock();
1429 
1430 	tdp_root_for_each_pte(iter, root, start, end) {
1431 retry:
1432 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1433 			flush = false;
1434 			continue;
1435 		}
1436 
1437 		if (!is_shadow_present_pte(iter.old_spte) ||
1438 		    !is_last_spte(iter.old_spte, iter.level))
1439 			continue;
1440 
1441 		pfn = spte_to_pfn(iter.old_spte);
1442 		if (kvm_is_reserved_pfn(pfn) ||
1443 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1444 							    pfn, PG_LEVEL_NUM))
1445 			continue;
1446 
1447 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1448 			/*
1449 			 * The iter must explicitly re-read the SPTE because
1450 			 * the atomic cmpxchg failed.
1451 			 */
1452 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1453 			goto retry;
1454 		}
1455 		flush = true;
1456 	}
1457 
1458 	rcu_read_unlock();
1459 
1460 	return flush;
1461 }
1462 
1463 /*
1464  * Clear non-leaf entries (and free associated page tables) which could
1465  * be replaced by large mappings, for GFNs within the slot.
1466  */
1467 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1468 				       const struct kvm_memory_slot *slot,
1469 				       bool flush)
1470 {
1471 	struct kvm_mmu_page *root;
1472 
1473 	lockdep_assert_held_read(&kvm->mmu_lock);
1474 
1475 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1476 		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1477 
1478 	return flush;
1479 }
1480 
1481 /*
1482  * Removes write access on the last level SPTE mapping this GFN and unsets the
1483  * MMU-writable bit to ensure future writes continue to be intercepted.
1484  * Returns true if an SPTE was set and a TLB flush is needed.
1485  */
1486 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1487 			      gfn_t gfn, int min_level)
1488 {
1489 	struct tdp_iter iter;
1490 	u64 new_spte;
1491 	bool spte_set = false;
1492 
1493 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1494 
1495 	rcu_read_lock();
1496 
1497 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1498 				   min_level, gfn, gfn + 1) {
1499 		if (!is_shadow_present_pte(iter.old_spte) ||
1500 		    !is_last_spte(iter.old_spte, iter.level))
1501 			continue;
1502 
1503 		if (!is_writable_pte(iter.old_spte))
1504 			break;
1505 
1506 		new_spte = iter.old_spte &
1507 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1508 
1509 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1510 		spte_set = true;
1511 	}
1512 
1513 	rcu_read_unlock();
1514 
1515 	return spte_set;
1516 }
1517 
1518 /*
1519  * Removes write access on the last level SPTE mapping this GFN and unsets the
1520  * MMU-writable bit to ensure future writes continue to be intercepted.
1521  * Returns true if an SPTE was set and a TLB flush is needed.
1522  */
1523 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1524 				   struct kvm_memory_slot *slot, gfn_t gfn,
1525 				   int min_level)
1526 {
1527 	struct kvm_mmu_page *root;
1528 	bool spte_set = false;
1529 
1530 	lockdep_assert_held_write(&kvm->mmu_lock);
1531 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1532 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1533 
1534 	return spte_set;
1535 }
1536 
1537 /*
1538  * Return the level of the lowest level SPTE added to sptes.
1539  * That SPTE may be non-present.
1540  *
1541  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1542  */
1543 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1544 			 int *root_level)
1545 {
1546 	struct tdp_iter iter;
1547 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1548 	gfn_t gfn = addr >> PAGE_SHIFT;
1549 	int leaf = -1;
1550 
1551 	*root_level = vcpu->arch.mmu->shadow_root_level;
1552 
1553 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1554 		leaf = iter.level;
1555 		sptes[leaf] = iter.old_spte;
1556 	}
1557 
1558 	return leaf;
1559 }
1560 
1561 /*
1562  * Returns the last level spte pointer of the shadow page walk for the given
1563  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1564  * walk could be performed, returns NULL and *spte does not contain valid data.
1565  *
1566  * Contract:
1567  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1568  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1569  *
1570  * WARNING: This function is only intended to be called during fast_page_fault.
1571  */
1572 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1573 					u64 *spte)
1574 {
1575 	struct tdp_iter iter;
1576 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1577 	gfn_t gfn = addr >> PAGE_SHIFT;
1578 	tdp_ptep_t sptep = NULL;
1579 
1580 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1581 		*spte = iter.old_spte;
1582 		sptep = iter.sptep;
1583 	}
1584 
1585 	/*
1586 	 * Perform the rcu_dereference to get the raw spte pointer value since
1587 	 * we are passing it up to fast_page_fault, which is shared with the
1588 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1589 	 * annotation.
1590 	 *
1591 	 * This is safe since fast_page_fault obeys the contracts of this
1592 	 * function as well as all TDP MMU contracts around modifying SPTEs
1593 	 * outside of mmu_lock.
1594 	 */
1595 	return rcu_dereference(sptep);
1596 }
1597