xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 67bb66d32905627e29400e2cb7f87a7c4c8cf667)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47 
48 	/*
49 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
50 	 * can run before the VM is torn down.
51 	 */
52 	rcu_barrier();
53 }
54 
55 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
56 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
57 			  bool shared);
58 
59 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
60 {
61 	free_page((unsigned long)sp->spt);
62 	kmem_cache_free(mmu_page_header_cache, sp);
63 }
64 
65 /*
66  * This is called through call_rcu in order to free TDP page table memory
67  * safely with respect to other kernel threads that may be operating on
68  * the memory.
69  * By only accessing TDP MMU page table memory in an RCU read critical
70  * section, and freeing it after a grace period, lockless access to that
71  * memory won't use it after it is freed.
72  */
73 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
74 {
75 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
76 					       rcu_head);
77 
78 	tdp_mmu_free_sp(sp);
79 }
80 
81 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
82 			  bool shared)
83 {
84 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
85 
86 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
87 
88 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
89 		return;
90 
91 	WARN_ON(!root->tdp_mmu_page);
92 
93 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
94 	list_del_rcu(&root->link);
95 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
96 
97 	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
98 
99 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
100 }
101 
102 /*
103  * Finds the next valid root after root (or the first valid root if root
104  * is NULL), takes a reference on it, and returns that next root. If root
105  * is not NULL, this thread should have already taken a reference on it, and
106  * that reference will be dropped. If no valid root is found, this
107  * function will return NULL.
108  */
109 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
110 					      struct kvm_mmu_page *prev_root,
111 					      bool shared)
112 {
113 	struct kvm_mmu_page *next_root;
114 
115 	rcu_read_lock();
116 
117 	if (prev_root)
118 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
119 						  &prev_root->link,
120 						  typeof(*prev_root), link);
121 	else
122 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
123 						   typeof(*next_root), link);
124 
125 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
126 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
127 				&next_root->link, typeof(*next_root), link);
128 
129 	rcu_read_unlock();
130 
131 	if (prev_root)
132 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
133 
134 	return next_root;
135 }
136 
/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * Roots whose address space ID differs from _as_id are skipped.
 *
 * _shared is forwarded to tdp_mmu_next_root(), which passes it on to
 * kvm_tdp_mmu_put_root() when dropping the reference on the previous root.
 * It must be true when the MMU lock is held in read mode and false when it
 * is held in write mode.
 *
 * _as_id is parenthesized in the expansion so that an expression argument
 * cannot re-associate with the comparison.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != (_as_id)) {		\
		} else
153 
/*
 * Iterate over the TDP MMU roots of _kvm whose address space ID equals
 * _as_id, without taking references on them.
 *
 * The last argument of list_for_each_entry_rcu() is the lockdep condition
 * under which the traversal is allowed outside an RCU read-side critical
 * section: mmu_lock held (queried via lockdep_is_held_type(..., 0)) or
 * tdp_mmu_pages_lock held.
 *
 * Every lock expression is derived from the _kvm macro parameter rather
 * than from whatever variable named 'kvm' happens to be in scope at the
 * call site, and macro arguments are parenthesized where they are used in
 * expressions.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
	list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&(_kvm)->mmu_lock, 0) ||	\
				lockdep_is_held(&(_kvm)->arch.tdp_mmu_pages_lock)) \
		if (kvm_mmu_page_as_id(_root) != (_as_id)) {			\
		} else
160 
161 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
162 						   int level)
163 {
164 	union kvm_mmu_page_role role;
165 
166 	role = vcpu->arch.mmu->mmu_role.base;
167 	role.level = level;
168 	role.direct = true;
169 	role.gpte_is_8_bytes = true;
170 	role.access = ACC_ALL;
171 
172 	return role;
173 }
174 
175 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
176 					       int level)
177 {
178 	struct kvm_mmu_page *sp;
179 
180 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
181 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
182 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
183 
184 	sp->role.word = page_role_for_level(vcpu, level).word;
185 	sp->gfn = gfn;
186 	sp->tdp_mmu_page = true;
187 
188 	trace_kvm_mmu_get_page(sp, true);
189 
190 	return sp;
191 }
192 
193 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
194 {
195 	union kvm_mmu_page_role role;
196 	struct kvm *kvm = vcpu->kvm;
197 	struct kvm_mmu_page *root;
198 
199 	lockdep_assert_held_write(&kvm->mmu_lock);
200 
201 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
202 
203 	/* Check for an existing root before allocating a new one. */
204 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
205 		if (root->role.word == role.word &&
206 		    kvm_tdp_mmu_get_root(kvm, root))
207 			goto out;
208 	}
209 
210 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
211 	refcount_set(&root->tdp_mmu_root_count, 1);
212 
213 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
214 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
215 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
216 
217 out:
218 	return __pa(root->spt);
219 }
220 
221 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
222 				u64 old_spte, u64 new_spte, int level,
223 				bool shared);
224 
225 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
226 {
227 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
228 		return;
229 
230 	if (is_accessed_spte(old_spte) &&
231 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
232 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
233 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
234 }
235 
236 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
237 					  u64 old_spte, u64 new_spte, int level)
238 {
239 	bool pfn_changed;
240 	struct kvm_memory_slot *slot;
241 
242 	if (level > PG_LEVEL_4K)
243 		return;
244 
245 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
246 
247 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
248 	    is_writable_pte(new_spte)) {
249 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
250 		mark_page_dirty_in_slot(kvm, slot, gfn);
251 	}
252 }
253 
254 /**
255  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
256  *
257  * @kvm: kvm instance
258  * @sp: the new page
259  * @shared: This operation may not be running under the exclusive use of
260  *	    the MMU lock and the operation must synchronize with other
261  *	    threads that might be adding or removing pages.
262  * @account_nx: This page replaces a NX large page and should be marked for
263  *		eventual reclaim.
264  */
265 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
266 			      bool shared, bool account_nx)
267 {
268 	if (shared)
269 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
270 	else
271 		lockdep_assert_held_write(&kvm->mmu_lock);
272 
273 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
274 	if (account_nx)
275 		account_huge_nx_page(kvm, sp);
276 
277 	if (shared)
278 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
279 }
280 
281 /**
282  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
283  *
284  * @kvm: kvm instance
285  * @sp: the page to be removed
286  * @shared: This operation may not be running under the exclusive use of
287  *	    the MMU lock and the operation must synchronize with other
288  *	    threads that might be adding or removing pages.
289  */
290 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
291 				bool shared)
292 {
293 	if (shared)
294 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
295 	else
296 		lockdep_assert_held_write(&kvm->mmu_lock);
297 
298 	list_del(&sp->link);
299 	if (sp->lpage_disallowed)
300 		unaccount_huge_nx_page(kvm, sp);
301 
302 	if (shared)
303 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
304 }
305 
306 /**
307  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
308  *
309  * @kvm: kvm instance
310  * @pt: the page removed from the paging structure
311  * @shared: This operation may not be running under the exclusive use
312  *	    of the MMU lock and the operation must synchronize with other
313  *	    threads that might be modifying SPTEs.
314  *
315  * Given a page table that has been removed from the TDP paging structure,
316  * iterates through the page table to clear SPTEs and free child page tables.
317  *
318  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
319  * protection. Since this thread removed it from the paging structure,
320  * this thread will be responsible for ensuring the page is freed. Hence the
321  * early rcu_dereferences in the function.
322  */
323 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
324 					bool shared)
325 {
326 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
327 	int level = sp->role.level;
328 	gfn_t base_gfn = sp->gfn;
329 	u64 old_child_spte;
330 	u64 *sptep;
331 	gfn_t gfn;
332 	int i;
333 
334 	trace_kvm_mmu_prepare_zap_page(sp);
335 
336 	tdp_mmu_unlink_page(kvm, sp, shared);
337 
338 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
339 		sptep = rcu_dereference(pt) + i;
340 		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
341 
342 		if (shared) {
343 			/*
344 			 * Set the SPTE to a nonpresent value that other
345 			 * threads will not overwrite. If the SPTE was
346 			 * already marked as removed then another thread
347 			 * handling a page fault could overwrite it, so
348 			 * set the SPTE until it is set from some other
349 			 * value to the removed SPTE value.
350 			 */
351 			for (;;) {
352 				old_child_spte = xchg(sptep, REMOVED_SPTE);
353 				if (!is_removed_spte(old_child_spte))
354 					break;
355 				cpu_relax();
356 			}
357 		} else {
358 			/*
359 			 * If the SPTE is not MMU-present, there is no backing
360 			 * page associated with the SPTE and so no side effects
361 			 * that need to be recorded, and exclusive ownership of
362 			 * mmu_lock ensures the SPTE can't be made present.
363 			 * Note, zapping MMIO SPTEs is also unnecessary as they
364 			 * are guarded by the memslots generation, not by being
365 			 * unreachable.
366 			 */
367 			old_child_spte = READ_ONCE(*sptep);
368 			if (!is_shadow_present_pte(old_child_spte))
369 				continue;
370 
371 			/*
372 			 * Marking the SPTE as a removed SPTE is not
373 			 * strictly necessary here as the MMU lock will
374 			 * stop other threads from concurrently modifying
375 			 * this SPTE. Using the removed SPTE value keeps
376 			 * the two branches consistent and simplifies
377 			 * the function.
378 			 */
379 			WRITE_ONCE(*sptep, REMOVED_SPTE);
380 		}
381 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
382 				    old_child_spte, REMOVED_SPTE, level,
383 				    shared);
384 	}
385 
386 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
387 					   KVM_PAGES_PER_HPAGE(level + 1));
388 
389 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
390 }
391 
392 /**
393  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
394  * @kvm: kvm instance
395  * @as_id: the address space of the paging structure the SPTE was a part of
396  * @gfn: the base GFN that was mapped by the SPTE
397  * @old_spte: The value of the SPTE before the change
398  * @new_spte: The value of the SPTE after the change
399  * @level: the level of the PT the SPTE is part of in the paging structure
400  * @shared: This operation may not be running under the exclusive use of
401  *	    the MMU lock and the operation must synchronize with other
402  *	    threads that might be modifying SPTEs.
403  *
404  * Handle bookkeeping that might result from the modification of a SPTE.
405  * This function must be called for all TDP SPTE modifications.
406  */
407 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
408 				  u64 old_spte, u64 new_spte, int level,
409 				  bool shared)
410 {
411 	bool was_present = is_shadow_present_pte(old_spte);
412 	bool is_present = is_shadow_present_pte(new_spte);
413 	bool was_leaf = was_present && is_last_spte(old_spte, level);
414 	bool is_leaf = is_present && is_last_spte(new_spte, level);
415 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
416 
417 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
418 	WARN_ON(level < PG_LEVEL_4K);
419 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
420 
421 	/*
422 	 * If this warning were to trigger it would indicate that there was a
423 	 * missing MMU notifier or a race with some notifier handler.
424 	 * A present, leaf SPTE should never be directly replaced with another
425 	 * present leaf SPTE pointing to a different PFN. A notifier handler
426 	 * should be zapping the SPTE before the main MM's page table is
427 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
428 	 * thread before replacement.
429 	 */
430 	if (was_leaf && is_leaf && pfn_changed) {
431 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
432 		       "SPTE with another present leaf SPTE mapping a\n"
433 		       "different PFN!\n"
434 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
435 		       as_id, gfn, old_spte, new_spte, level);
436 
437 		/*
438 		 * Crash the host to prevent error propagation and guest data
439 		 * corruption.
440 		 */
441 		BUG();
442 	}
443 
444 	if (old_spte == new_spte)
445 		return;
446 
447 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
448 
449 	if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
450 		if (is_large_pte(old_spte))
451 			atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
452 		else
453 			atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
454 	}
455 
456 	/*
457 	 * The only times a SPTE should be changed from a non-present to
458 	 * non-present state is when an MMIO entry is installed/modified/
459 	 * removed. In that case, there is nothing to do here.
460 	 */
461 	if (!was_present && !is_present) {
462 		/*
463 		 * If this change does not involve a MMIO SPTE or removed SPTE,
464 		 * it is unexpected. Log the change, though it should not
465 		 * impact the guest since both the former and current SPTEs
466 		 * are nonpresent.
467 		 */
468 		if (WARN_ON(!is_mmio_spte(old_spte) &&
469 			    !is_mmio_spte(new_spte) &&
470 			    !is_removed_spte(new_spte)))
471 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
472 			       "should not be replaced with another,\n"
473 			       "different nonpresent SPTE, unless one or both\n"
474 			       "are MMIO SPTEs, or the new SPTE is\n"
475 			       "a temporary removed SPTE.\n"
476 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
477 			       as_id, gfn, old_spte, new_spte, level);
478 		return;
479 	}
480 
481 
482 	if (was_leaf && is_dirty_spte(old_spte) &&
483 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
484 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
485 
486 	/*
487 	 * Recursively handle child PTs if the change removed a subtree from
488 	 * the paging structure.
489 	 */
490 	if (was_present && !was_leaf && (pfn_changed || !is_present))
491 		handle_removed_tdp_mmu_page(kvm,
492 				spte_to_child_pt(old_spte, level), shared);
493 }
494 
495 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
496 				u64 old_spte, u64 new_spte, int level,
497 				bool shared)
498 {
499 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
500 			      shared);
501 	handle_changed_spte_acc_track(old_spte, new_spte, level);
502 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
503 				      new_spte, level);
504 }
505 
506 /*
507  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
508  * and handle the associated bookkeeping, but do not mark the page dirty
509  * in KVM's dirty bitmaps.
510  *
511  * @kvm: kvm instance
512  * @iter: a tdp_iter instance currently on the SPTE that should be set
513  * @new_spte: The value the SPTE should be set to
514  * Returns: true if the SPTE was set, false if it was not. If false is returned,
515  *	    this function will have no side-effects.
516  */
517 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
518 							struct tdp_iter *iter,
519 							u64 new_spte)
520 {
521 	lockdep_assert_held_read(&kvm->mmu_lock);
522 
523 	/*
524 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
525 	 * may modify it.
526 	 */
527 	if (is_removed_spte(iter->old_spte))
528 		return false;
529 
530 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
531 		      new_spte) != iter->old_spte)
532 		return false;
533 
534 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
535 			      new_spte, iter->level, true);
536 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
537 
538 	return true;
539 }
540 
541 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
542 					   struct tdp_iter *iter,
543 					   u64 new_spte)
544 {
545 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
546 		return false;
547 
548 	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
549 				      iter->old_spte, new_spte, iter->level);
550 	return true;
551 }
552 
553 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
554 					   struct tdp_iter *iter)
555 {
556 	/*
557 	 * Freeze the SPTE by setting it to a special,
558 	 * non-present value. This will stop other threads from
559 	 * immediately installing a present entry in its place
560 	 * before the TLBs are flushed.
561 	 */
562 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
563 		return false;
564 
565 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
566 					   KVM_PAGES_PER_HPAGE(iter->level));
567 
568 	/*
569 	 * No other thread can overwrite the removed SPTE as they
570 	 * must either wait on the MMU lock or use
571 	 * tdp_mmu_set_spte_atomic which will not overwrite the
572 	 * special removed SPTE value. No bookkeeping is needed
573 	 * here since the SPTE is going from non-present
574 	 * to non-present.
575 	 */
576 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
577 
578 	return true;
579 }
580 
581 
582 /*
583  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
584  * @kvm: kvm instance
585  * @iter: a tdp_iter instance currently on the SPTE that should be set
586  * @new_spte: The value the SPTE should be set to
587  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
588  *		      of the page. Should be set unless handling an MMU
589  *		      notifier for access tracking. Leaving record_acc_track
590  *		      unset in that case prevents page accesses from being
591  *		      double counted.
592  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
593  *		      appropriate for the change being made. Should be set
594  *		      unless performing certain dirty logging operations.
595  *		      Leaving record_dirty_log unset in that case prevents page
596  *		      writes from being double counted.
597  */
598 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
599 				      u64 new_spte, bool record_acc_track,
600 				      bool record_dirty_log)
601 {
602 	lockdep_assert_held_write(&kvm->mmu_lock);
603 
604 	/*
605 	 * No thread should be using this function to set SPTEs to the
606 	 * temporary removed SPTE value.
607 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
608 	 * should be used. If operating under the MMU lock in write mode, the
609 	 * use of the removed SPTE should not be necessary.
610 	 */
611 	WARN_ON(is_removed_spte(iter->old_spte));
612 
613 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
614 
615 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
616 			      new_spte, iter->level, false);
617 	if (record_acc_track)
618 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
619 					      iter->level);
620 	if (record_dirty_log)
621 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
622 					      iter->old_spte, new_spte,
623 					      iter->level);
624 }
625 
626 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
627 				    u64 new_spte)
628 {
629 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
630 }
631 
632 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
633 						 struct tdp_iter *iter,
634 						 u64 new_spte)
635 {
636 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
637 }
638 
639 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
640 						 struct tdp_iter *iter,
641 						 u64 new_spte)
642 {
643 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
644 }
645 
/*
 * Walk the SPTEs under _root for GFNs in [_start, _end), starting from the
 * root's page table at the root's level. _root is parenthesized so that an
 * expression argument is dereferenced as a whole.
 */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, (_root)->spt, (_root)->role.level, _start, _end)
648 
/*
 * Like tdp_root_for_each_pte(), but the loop body only runs for SPTEs
 * that are present and are last-level entries for their level; all other
 * SPTEs are skipped. _iter is parenthesized before member access.
 */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte((_iter).old_spte) ||		\
		    !is_last_spte((_iter).old_spte, (_iter).level))	\
			continue;					\
		else
655 
/*
 * Walk the SPTEs for GFNs in [_start, _end) starting from the root page
 * table of _mmu (the virtual address of _mmu->root_hpa) at the MMU's
 * shadow root level. _mmu is parenthesized so that an expression argument
 * is dereferenced as a whole.
 */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va((_mmu)->root_hpa),		\
			 (_mmu)->shadow_root_level, _start, _end)
659 
660 /*
661  * Yield if the MMU lock is contended or this thread needs to return control
662  * to the scheduler.
663  *
664  * If this function should yield and flush is set, it will perform a remote
665  * TLB flush before yielding.
666  *
667  * If this function yields, it will also reset the tdp_iter's walk over the
668  * paging structure and the calling function should skip to the next
669  * iteration to allow the iterator to continue its traversal from the
670  * paging structure root.
671  *
672  * Return true if this function yielded and the iterator's traversal was reset.
673  * Return false if a yield was not needed.
674  */
675 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
676 					     struct tdp_iter *iter, bool flush,
677 					     bool shared)
678 {
679 	/* Ensure forward progress has been made before yielding. */
680 	if (iter->next_last_level_gfn == iter->yielded_gfn)
681 		return false;
682 
683 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
684 		rcu_read_unlock();
685 
686 		if (flush)
687 			kvm_flush_remote_tlbs(kvm);
688 
689 		if (shared)
690 			cond_resched_rwlock_read(&kvm->mmu_lock);
691 		else
692 			cond_resched_rwlock_write(&kvm->mmu_lock);
693 
694 		rcu_read_lock();
695 
696 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
697 
698 		tdp_iter_restart(iter);
699 
700 		return true;
701 	}
702 
703 	return false;
704 }
705 
706 /*
707  * Tears down the mappings for the range of gfns, [start, end), and frees the
708  * non-root pages mapping GFNs strictly within that range. Returns true if
709  * SPTEs have been cleared and a TLB flush is needed before releasing the
710  * MMU lock.
711  *
712  * If can_yield is true, will release the MMU lock and reschedule if the
713  * scheduler needs the CPU or there is contention on the MMU lock. If this
714  * function cannot yield, it will not release the MMU lock or reschedule and
715  * the caller must ensure it does not supply too large a GFN range, or the
716  * operation can cause a soft lockup.
717  *
718  * If shared is true, this thread holds the MMU lock in read mode and must
719  * account for the possibility that other threads are modifying the paging
720  * structures concurrently. If shared is false, this thread should hold the
721  * MMU lock in write mode.
722  */
723 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
724 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
725 			  bool shared)
726 {
727 	struct tdp_iter iter;
728 
729 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
730 
731 	rcu_read_lock();
732 
733 	tdp_root_for_each_pte(iter, root, start, end) {
734 retry:
735 		if (can_yield &&
736 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
737 			flush = false;
738 			continue;
739 		}
740 
741 		if (!is_shadow_present_pte(iter.old_spte))
742 			continue;
743 
744 		/*
745 		 * If this is a non-last-level SPTE that covers a larger range
746 		 * than should be zapped, continue, and zap the mappings at a
747 		 * lower level.
748 		 */
749 		if ((iter.gfn < start ||
750 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
751 		    !is_last_spte(iter.old_spte, iter.level))
752 			continue;
753 
754 		if (!shared) {
755 			tdp_mmu_set_spte(kvm, &iter, 0);
756 			flush = true;
757 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
758 			/*
759 			 * The iter must explicitly re-read the SPTE because
760 			 * the atomic cmpxchg failed.
761 			 */
762 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
763 			goto retry;
764 		}
765 	}
766 
767 	rcu_read_unlock();
768 	return flush;
769 }
770 
771 /*
772  * Tears down the mappings for the range of gfns, [start, end), and frees the
773  * non-root pages mapping GFNs strictly within that range. Returns true if
774  * SPTEs have been cleared and a TLB flush is needed before releasing the
775  * MMU lock.
776  *
777  * If shared is true, this thread holds the MMU lock in read mode and must
778  * account for the possibility that other threads are modifying the paging
779  * structures concurrently. If shared is false, this thread should hold the
780  * MMU in write mode.
781  */
782 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
783 				 gfn_t end, bool can_yield, bool flush,
784 				 bool shared)
785 {
786 	struct kvm_mmu_page *root;
787 
788 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
789 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
790 				      shared);
791 
792 	return flush;
793 }
794 
795 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
796 {
797 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
798 	bool flush = false;
799 	int i;
800 
801 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
802 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
803 						  flush, false);
804 
805 	if (flush)
806 		kvm_flush_remote_tlbs(kvm);
807 }
808 
809 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
810 						  struct kvm_mmu_page *prev_root)
811 {
812 	struct kvm_mmu_page *next_root;
813 
814 	if (prev_root)
815 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
816 						  &prev_root->link,
817 						  typeof(*prev_root), link);
818 	else
819 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
820 						   typeof(*next_root), link);
821 
822 	while (next_root && !(next_root->role.invalid &&
823 			      refcount_read(&next_root->tdp_mmu_root_count)))
824 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
825 						  &next_root->link,
826 						  typeof(*next_root), link);
827 
828 	return next_root;
829 }
830 
831 /*
832  * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
833  * invalidated root, they will not be freed until this function drops the
834  * reference. Before dropping that reference, tear down the paging
835  * structure so that whichever thread does drop the last reference
836  * only has to do a trivial amount of work. Since the roots are invalid,
837  * no new SPTEs should be created under them.
838  */
839 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
840 {
841 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
842 	struct kvm_mmu_page *next_root;
843 	struct kvm_mmu_page *root;
844 	bool flush = false;
845 
846 	lockdep_assert_held_read(&kvm->mmu_lock);
847 
848 	rcu_read_lock();
849 
850 	root = next_invalidated_root(kvm, NULL);
851 
852 	while (root) {
853 		next_root = next_invalidated_root(kvm, root);
854 
855 		rcu_read_unlock();
856 
857 		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
858 				      true);
859 
860 		/*
861 		 * Put the reference acquired in
862 		 * kvm_tdp_mmu_invalidate_roots
863 		 */
864 		kvm_tdp_mmu_put_root(kvm, root, true);
865 
866 		root = next_root;
867 
868 		rcu_read_lock();
869 	}
870 
871 	rcu_read_unlock();
872 
873 	if (flush)
874 		kvm_flush_remote_tlbs(kvm);
875 }
876 
877 /*
878  * Mark each TDP MMU root as invalid so that other threads
879  * will drop their references and allow the root count to
880  * go to 0.
881  *
882  * Also take a reference on all roots so that this thread
883  * can do the bulk of the work required to free the roots
884  * once they are invalidated. Without this reference, a
885  * vCPU thread might drop the last reference to a root and
886  * get stuck with tearing down the entire paging structure.
887  *
888  * Roots which have a zero refcount should be skipped as
889  * they're already being torn down.
890  * Already invalid roots should be referenced again so that
891  * they aren't freed before kvm_tdp_mmu_zap_all_fast is
892  * done with them.
893  *
894  * This has essentially the same effect for the TDP MMU
895  * as updating mmu_valid_gen does for the shadow MMU.
896  */
897 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
898 {
899 	struct kvm_mmu_page *root;
900 
901 	lockdep_assert_held_write(&kvm->mmu_lock);
902 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
903 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
904 			root->role.invalid = true;
905 }
906 
907 /*
908  * Installs a last-level SPTE to handle a TDP page fault.
909  * (NPT/EPT violation/misconfiguration)
910  */
911 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
912 					  int map_writable,
913 					  struct tdp_iter *iter,
914 					  kvm_pfn_t pfn, bool prefault)
915 {
916 	u64 new_spte;
917 	int ret = RET_PF_FIXED;
918 	int make_spte_ret = 0;
919 
920 	if (unlikely(is_noslot_pfn(pfn)))
921 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
922 	else
923 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
924 					 pfn, iter->old_spte, prefault, true,
925 					 map_writable, !shadow_accessed_mask,
926 					 &new_spte);
927 
928 	if (new_spte == iter->old_spte)
929 		ret = RET_PF_SPURIOUS;
930 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
931 		return RET_PF_RETRY;
932 
933 	/*
934 	 * If the page fault was caused by a write but the page is write
935 	 * protected, emulation is needed. If the emulation was skipped,
936 	 * the vCPU would have the same fault again.
937 	 */
938 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
939 		if (write)
940 			ret = RET_PF_EMULATE;
941 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
942 	}
943 
944 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
945 	if (unlikely(is_mmio_spte(new_spte))) {
946 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
947 				     new_spte);
948 		ret = RET_PF_EMULATE;
949 	} else {
950 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
951 				       rcu_dereference(iter->sptep));
952 	}
953 
954 	/*
955 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
956 	 * consistent with legacy MMU behavior.
957 	 */
958 	if (ret != RET_PF_SPURIOUS)
959 		vcpu->stat.pf_fixed++;
960 
961 	return ret;
962 }
963 
964 /*
965  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
966  * page tables and SPTEs to translate the faulting guest physical address.
967  */
968 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
969 		    int map_writable, int max_level, kvm_pfn_t pfn,
970 		    bool prefault)
971 {
972 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
973 	bool write = error_code & PFERR_WRITE_MASK;
974 	bool exec = error_code & PFERR_FETCH_MASK;
975 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
976 	struct kvm_mmu *mmu = vcpu->arch.mmu;
977 	struct tdp_iter iter;
978 	struct kvm_mmu_page *sp;
979 	u64 *child_pt;
980 	u64 new_spte;
981 	int ret;
982 	gfn_t gfn = gpa >> PAGE_SHIFT;
983 	int level;
984 	int req_level;
985 
986 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
987 					huge_page_disallowed, &req_level);
988 
989 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
990 
991 	rcu_read_lock();
992 
993 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
994 		if (nx_huge_page_workaround_enabled)
995 			disallowed_hugepage_adjust(iter.old_spte, gfn,
996 						   iter.level, &pfn, &level);
997 
998 		if (iter.level == level)
999 			break;
1000 
1001 		/*
1002 		 * If there is an SPTE mapping a large page at a higher level
1003 		 * than the target, that SPTE must be cleared and replaced
1004 		 * with a non-leaf SPTE.
1005 		 */
1006 		if (is_shadow_present_pte(iter.old_spte) &&
1007 		    is_large_pte(iter.old_spte)) {
1008 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1009 				break;
1010 
1011 			/*
1012 			 * The iter must explicitly re-read the spte here
1013 			 * because the new value informs the !present
1014 			 * path below.
1015 			 */
1016 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1017 		}
1018 
1019 		if (!is_shadow_present_pte(iter.old_spte)) {
1020 			/*
1021 			 * If SPTE has been frozen by another thread, just
1022 			 * give up and retry, avoiding unnecessary page table
1023 			 * allocation and free.
1024 			 */
1025 			if (is_removed_spte(iter.old_spte))
1026 				break;
1027 
1028 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1029 			child_pt = sp->spt;
1030 
1031 			new_spte = make_nonleaf_spte(child_pt,
1032 						     !shadow_accessed_mask);
1033 
1034 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
1035 						    new_spte)) {
1036 				tdp_mmu_link_page(vcpu->kvm, sp, true,
1037 						  huge_page_disallowed &&
1038 						  req_level >= iter.level);
1039 
1040 				trace_kvm_mmu_get_page(sp, true);
1041 			} else {
1042 				tdp_mmu_free_sp(sp);
1043 				break;
1044 			}
1045 		}
1046 	}
1047 
1048 	if (iter.level != level) {
1049 		rcu_read_unlock();
1050 		return RET_PF_RETRY;
1051 	}
1052 
1053 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1054 					      pfn, prefault);
1055 	rcu_read_unlock();
1056 
1057 	return ret;
1058 }
1059 
1060 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1061 				 bool flush)
1062 {
1063 	struct kvm_mmu_page *root;
1064 
1065 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1066 		flush |= zap_gfn_range(kvm, root, range->start, range->end,
1067 				       range->may_block, flush, false);
1068 
1069 	return flush;
1070 }
1071 
1072 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1073 			      struct kvm_gfn_range *range);
1074 
1075 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1076 						   struct kvm_gfn_range *range,
1077 						   tdp_handler_t handler)
1078 {
1079 	struct kvm_mmu_page *root;
1080 	struct tdp_iter iter;
1081 	bool ret = false;
1082 
1083 	rcu_read_lock();
1084 
1085 	/*
1086 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1087 	 * into this helper allow blocking; it'd be dead, wasteful code.
1088 	 */
1089 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1090 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1091 			ret |= handler(kvm, &iter, range);
1092 	}
1093 
1094 	rcu_read_unlock();
1095 
1096 	return ret;
1097 }
1098 
1099 /*
1100  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1101  * if any of the GFNs in the range have been accessed.
1102  */
1103 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1104 			  struct kvm_gfn_range *range)
1105 {
1106 	u64 new_spte = 0;
1107 
1108 	/* If we have a non-accessed entry we don't need to change the pte. */
1109 	if (!is_accessed_spte(iter->old_spte))
1110 		return false;
1111 
1112 	new_spte = iter->old_spte;
1113 
1114 	if (spte_ad_enabled(new_spte)) {
1115 		new_spte &= ~shadow_accessed_mask;
1116 	} else {
1117 		/*
1118 		 * Capture the dirty status of the page, so that it doesn't get
1119 		 * lost when the SPTE is marked for access tracking.
1120 		 */
1121 		if (is_writable_pte(new_spte))
1122 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1123 
1124 		new_spte = mark_spte_for_access_track(new_spte);
1125 	}
1126 
1127 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1128 
1129 	return true;
1130 }
1131 
1132 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1133 {
1134 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1135 }
1136 
1137 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1138 			 struct kvm_gfn_range *range)
1139 {
1140 	return is_accessed_spte(iter->old_spte);
1141 }
1142 
1143 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1144 {
1145 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1146 }
1147 
1148 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1149 			 struct kvm_gfn_range *range)
1150 {
1151 	u64 new_spte;
1152 
1153 	/* Huge pages aren't expected to be modified without first being zapped. */
1154 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1155 
1156 	if (iter->level != PG_LEVEL_4K ||
1157 	    !is_shadow_present_pte(iter->old_spte))
1158 		return false;
1159 
1160 	/*
1161 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1162 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1163 	 * invariant that the PFN of a present * leaf SPTE can never change.
1164 	 * See __handle_changed_spte().
1165 	 */
1166 	tdp_mmu_set_spte(kvm, iter, 0);
1167 
1168 	if (!pte_write(range->pte)) {
1169 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1170 								  pte_pfn(range->pte));
1171 
1172 		tdp_mmu_set_spte(kvm, iter, new_spte);
1173 	}
1174 
1175 	return true;
1176 }
1177 
1178 /*
1179  * Handle the changed_pte MMU notifier for the TDP MMU.
1180  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1181  * notifier.
1182  * Returns non-zero if a flush is needed before releasing the MMU lock.
1183  */
1184 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1185 {
1186 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1187 
1188 	/* FIXME: return 'flush' instead of flushing here. */
1189 	if (flush)
1190 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1191 
1192 	return false;
1193 }
1194 
1195 /*
1196  * Remove write access from all SPTEs at or above min_level that map GFNs
1197  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1198  * be flushed.
1199  */
1200 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1201 			     gfn_t start, gfn_t end, int min_level)
1202 {
1203 	struct tdp_iter iter;
1204 	u64 new_spte;
1205 	bool spte_set = false;
1206 
1207 	rcu_read_lock();
1208 
1209 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1210 
1211 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1212 				   min_level, start, end) {
1213 retry:
1214 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1215 			continue;
1216 
1217 		if (!is_shadow_present_pte(iter.old_spte) ||
1218 		    !is_last_spte(iter.old_spte, iter.level) ||
1219 		    !(iter.old_spte & PT_WRITABLE_MASK))
1220 			continue;
1221 
1222 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1223 
1224 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1225 							  new_spte)) {
1226 			/*
1227 			 * The iter must explicitly re-read the SPTE because
1228 			 * the atomic cmpxchg failed.
1229 			 */
1230 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1231 			goto retry;
1232 		}
1233 		spte_set = true;
1234 	}
1235 
1236 	rcu_read_unlock();
1237 	return spte_set;
1238 }
1239 
1240 /*
1241  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1242  * only affect leaf SPTEs down to min_level.
1243  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1244  */
1245 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1246 			     int min_level)
1247 {
1248 	struct kvm_mmu_page *root;
1249 	bool spte_set = false;
1250 
1251 	lockdep_assert_held_read(&kvm->mmu_lock);
1252 
1253 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1254 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1255 			     slot->base_gfn + slot->npages, min_level);
1256 
1257 	return spte_set;
1258 }
1259 
1260 /*
1261  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1262  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1263  * If AD bits are not enabled, this will require clearing the writable bit on
1264  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1265  * be flushed.
1266  */
1267 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1268 			   gfn_t start, gfn_t end)
1269 {
1270 	struct tdp_iter iter;
1271 	u64 new_spte;
1272 	bool spte_set = false;
1273 
1274 	rcu_read_lock();
1275 
1276 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1277 retry:
1278 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1279 			continue;
1280 
1281 		if (spte_ad_need_write_protect(iter.old_spte)) {
1282 			if (is_writable_pte(iter.old_spte))
1283 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1284 			else
1285 				continue;
1286 		} else {
1287 			if (iter.old_spte & shadow_dirty_mask)
1288 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1289 			else
1290 				continue;
1291 		}
1292 
1293 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1294 							  new_spte)) {
1295 			/*
1296 			 * The iter must explicitly re-read the SPTE because
1297 			 * the atomic cmpxchg failed.
1298 			 */
1299 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1300 			goto retry;
1301 		}
1302 		spte_set = true;
1303 	}
1304 
1305 	rcu_read_unlock();
1306 	return spte_set;
1307 }
1308 
1309 /*
1310  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1311  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1312  * If AD bits are not enabled, this will require clearing the writable bit on
1313  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1314  * be flushed.
1315  */
1316 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1317 {
1318 	struct kvm_mmu_page *root;
1319 	bool spte_set = false;
1320 
1321 	lockdep_assert_held_read(&kvm->mmu_lock);
1322 
1323 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1324 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1325 				slot->base_gfn + slot->npages);
1326 
1327 	return spte_set;
1328 }
1329 
1330 /*
1331  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1332  * set in mask, starting at gfn. The given memslot is expected to contain all
1333  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1334  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1335  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1336  */
1337 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1338 				  gfn_t gfn, unsigned long mask, bool wrprot)
1339 {
1340 	struct tdp_iter iter;
1341 	u64 new_spte;
1342 
1343 	rcu_read_lock();
1344 
1345 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1346 				    gfn + BITS_PER_LONG) {
1347 		if (!mask)
1348 			break;
1349 
1350 		if (iter.level > PG_LEVEL_4K ||
1351 		    !(mask & (1UL << (iter.gfn - gfn))))
1352 			continue;
1353 
1354 		mask &= ~(1UL << (iter.gfn - gfn));
1355 
1356 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1357 			if (is_writable_pte(iter.old_spte))
1358 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1359 			else
1360 				continue;
1361 		} else {
1362 			if (iter.old_spte & shadow_dirty_mask)
1363 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1364 			else
1365 				continue;
1366 		}
1367 
1368 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1369 	}
1370 
1371 	rcu_read_unlock();
1372 }
1373 
1374 /*
1375  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1376  * set in mask, starting at gfn. The given memslot is expected to contain all
1377  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1378  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1379  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1380  */
1381 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1382 				       struct kvm_memory_slot *slot,
1383 				       gfn_t gfn, unsigned long mask,
1384 				       bool wrprot)
1385 {
1386 	struct kvm_mmu_page *root;
1387 
1388 	lockdep_assert_held_write(&kvm->mmu_lock);
1389 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1390 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1391 }
1392 
1393 /*
1394  * Clear leaf entries which could be replaced by large mappings, for
1395  * GFNs within the slot.
1396  */
1397 static bool zap_collapsible_spte_range(struct kvm *kvm,
1398 				       struct kvm_mmu_page *root,
1399 				       const struct kvm_memory_slot *slot,
1400 				       bool flush)
1401 {
1402 	gfn_t start = slot->base_gfn;
1403 	gfn_t end = start + slot->npages;
1404 	struct tdp_iter iter;
1405 	kvm_pfn_t pfn;
1406 
1407 	rcu_read_lock();
1408 
1409 	tdp_root_for_each_pte(iter, root, start, end) {
1410 retry:
1411 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1412 			flush = false;
1413 			continue;
1414 		}
1415 
1416 		if (!is_shadow_present_pte(iter.old_spte) ||
1417 		    !is_last_spte(iter.old_spte, iter.level))
1418 			continue;
1419 
1420 		pfn = spte_to_pfn(iter.old_spte);
1421 		if (kvm_is_reserved_pfn(pfn) ||
1422 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1423 							    pfn, PG_LEVEL_NUM))
1424 			continue;
1425 
1426 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1427 			/*
1428 			 * The iter must explicitly re-read the SPTE because
1429 			 * the atomic cmpxchg failed.
1430 			 */
1431 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1432 			goto retry;
1433 		}
1434 		flush = true;
1435 	}
1436 
1437 	rcu_read_unlock();
1438 
1439 	return flush;
1440 }
1441 
1442 /*
1443  * Clear non-leaf entries (and free associated page tables) which could
1444  * be replaced by large mappings, for GFNs within the slot.
1445  */
1446 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1447 				       const struct kvm_memory_slot *slot,
1448 				       bool flush)
1449 {
1450 	struct kvm_mmu_page *root;
1451 
1452 	lockdep_assert_held_read(&kvm->mmu_lock);
1453 
1454 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1455 		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1456 
1457 	return flush;
1458 }
1459 
1460 /*
1461  * Removes write access on the last level SPTE mapping this GFN and unsets the
1462  * MMU-writable bit to ensure future writes continue to be intercepted.
1463  * Returns true if an SPTE was set and a TLB flush is needed.
1464  */
1465 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1466 			      gfn_t gfn, int min_level)
1467 {
1468 	struct tdp_iter iter;
1469 	u64 new_spte;
1470 	bool spte_set = false;
1471 
1472 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1473 
1474 	rcu_read_lock();
1475 
1476 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1477 				   min_level, gfn, gfn + 1) {
1478 		if (!is_shadow_present_pte(iter.old_spte) ||
1479 		    !is_last_spte(iter.old_spte, iter.level))
1480 			continue;
1481 
1482 		if (!is_writable_pte(iter.old_spte))
1483 			break;
1484 
1485 		new_spte = iter.old_spte &
1486 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1487 
1488 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1489 		spte_set = true;
1490 	}
1491 
1492 	rcu_read_unlock();
1493 
1494 	return spte_set;
1495 }
1496 
1497 /*
1498  * Removes write access on the last level SPTE mapping this GFN and unsets the
1499  * MMU-writable bit to ensure future writes continue to be intercepted.
1500  * Returns true if an SPTE was set and a TLB flush is needed.
1501  */
1502 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1503 				   struct kvm_memory_slot *slot, gfn_t gfn,
1504 				   int min_level)
1505 {
1506 	struct kvm_mmu_page *root;
1507 	bool spte_set = false;
1508 
1509 	lockdep_assert_held_write(&kvm->mmu_lock);
1510 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1511 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1512 
1513 	return spte_set;
1514 }
1515 
1516 /*
1517  * Return the level of the lowest level SPTE added to sptes.
1518  * That SPTE may be non-present.
1519  */
1520 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1521 			 int *root_level)
1522 {
1523 	struct tdp_iter iter;
1524 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1525 	gfn_t gfn = addr >> PAGE_SHIFT;
1526 	int leaf = -1;
1527 
1528 	*root_level = vcpu->arch.mmu->shadow_root_level;
1529 
1530 	rcu_read_lock();
1531 
1532 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1533 		leaf = iter.level;
1534 		sptes[leaf] = iter.old_spte;
1535 	}
1536 
1537 	rcu_read_unlock();
1538 
1539 	return leaf;
1540 }
1541