xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 8938c48f)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
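/*
 * Usage note (illustrative): this parameter is exposed by the "kvm" module,
 * e.g. "modprobe kvm tdp_mmu=Y" or via /sys/module/kvm/parameters/tdp_mmu.
 * Because it is sampled with READ_ONCE() at VM creation time (see
 * kvm_mmu_init_tdp_mmu() below), toggling it at runtime only affects VMs
 * created afterwards.
 */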
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29 
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32 	if (!kvm->arch.tdp_mmu_enabled)
33 		return;
34 
35 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36 
37 	/*
38 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 	 * can run before the VM is torn down.
40 	 */
41 	rcu_barrier();
42 }
43 
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46 	if (kvm_mmu_put_root(kvm, root))
47 		kvm_tdp_mmu_free_root(kvm, root);
48 }
49 
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51 					   struct kvm_mmu_page *root)
52 {
53 	lockdep_assert_held_write(&kvm->mmu_lock);
54 
55 	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56 		return false;
57 
58 	kvm_mmu_get_root(kvm, root);
59 	return true;
61 }
62 
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64 						     struct kvm_mmu_page *root)
65 {
66 	struct kvm_mmu_page *next_root;
67 
68 	next_root = list_next_entry(root, link);
69 	tdp_mmu_put_root(kvm, root);
70 	return next_root;
71 }
72 
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)				\
80 	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
81 				      typeof(*_root), link);		\
82 	     tdp_mmu_next_root_valid(_kvm, _root);			\
83 	     _root = tdp_mmu_next_root(_kvm, _root))
84 
85 #define for_each_tdp_mmu_root(_kvm, _root)				\
86 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
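/*
 * Illustrative usage sketch (not a caller in this file): when breaking out of
 * for_each_tdp_mmu_root_yield_safe() early, the reference taken by
 * tdp_mmu_next_root_valid() must be dropped explicitly, e.g.:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root) {
 *		if (done_with_roots) {		// hypothetical condition
 *			tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *		...
 *	}
 *
 * for_each_tdp_mmu_root() takes no references and so must not be used across
 * points where the MMU lock may be dropped.
 */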
87 
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89 			  gfn_t start, gfn_t end, bool can_yield);
90 
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94 
95 	lockdep_assert_held_write(&kvm->mmu_lock);
96 
97 	WARN_ON(root->root_count);
98 	WARN_ON(!root->tdp_mmu_page);
99 
100 	list_del(&root->link);
101 
102 	zap_gfn_range(kvm, root, 0, max_gfn, false);
103 
104 	free_page((unsigned long)root->spt);
105 	kmem_cache_free(mmu_page_header_cache, root);
106 }
107 
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109 						   int level)
110 {
111 	union kvm_mmu_page_role role;
112 
113 	role = vcpu->arch.mmu->mmu_role.base;
114 	role.level = level;
115 	role.direct = true;
116 	role.gpte_is_8_bytes = true;
117 	role.access = ACC_ALL;
118 
119 	return role;
120 }
121 
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123 					       int level)
124 {
125 	struct kvm_mmu_page *sp;
126 
127 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130 
131 	sp->role.word = page_role_for_level(vcpu, level).word;
132 	sp->gfn = gfn;
133 	sp->tdp_mmu_page = true;
134 
135 	trace_kvm_mmu_get_page(sp, true);
136 
137 	return sp;
138 }
139 
140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
141 {
142 	union kvm_mmu_page_role role;
143 	struct kvm *kvm = vcpu->kvm;
144 	struct kvm_mmu_page *root;
145 
146 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
147 
148 	write_lock(&kvm->mmu_lock);
149 
150 	/* Check for an existing root before allocating a new one. */
151 	for_each_tdp_mmu_root(kvm, root) {
152 		if (root->role.word == role.word) {
153 			kvm_mmu_get_root(kvm, root);
154 			write_unlock(&kvm->mmu_lock);
155 			return root;
156 		}
157 	}
158 
159 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
160 	root->root_count = 1;
161 
162 	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
163 
164 	write_unlock(&kvm->mmu_lock);
165 
166 	return root;
167 }
168 
169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
170 {
171 	struct kvm_mmu_page *root;
172 
173 	root = get_tdp_mmu_vcpu_root(vcpu);
174 	if (!root)
175 		return INVALID_PAGE;
176 
177 	return __pa(root->spt);
178 }
179 
180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
181 {
182 	free_page((unsigned long)sp->spt);
183 	kmem_cache_free(mmu_page_header_cache, sp);
184 }
185 
186 /*
187  * This is called through call_rcu in order to free TDP page table memory
188  * safely with respect to other kernel threads that may be operating on
189  * the memory.
190  * Because TDP MMU page table memory is only accessed inside RCU read-side
191  * critical sections and is freed only after a grace period, lockless
192  * walkers are guaranteed not to use the memory after it has been freed.
193  */
194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
195 {
196 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
197 					       rcu_head);
198 
199 	tdp_mmu_free_sp(sp);
200 }
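/*
 * Reader-side sketch (assumed pattern, mirroring the walkers later in this
 * file): any lockless walk of TDP MMU page tables brackets its accesses with
 * rcu_read_lock()/rcu_read_unlock(), e.g.:
 *
 *	rcu_read_lock();
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		... read iter.old_spte, possibly cmpxchg the SPTE ...
 *	}
 *	rcu_read_unlock();
 *
 * call_rcu() above then guarantees the page table pages are not freed until
 * all readers that might still see them have finished.
 */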
201 
202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
203 				u64 old_spte, u64 new_spte, int level,
204 				bool shared);
205 
206 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
207 {
208 	return sp->role.smm ? 1 : 0;
209 }
210 
211 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
212 {
213 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
214 
215 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
216 		return;
217 
218 	if (is_accessed_spte(old_spte) &&
219 	    (!is_accessed_spte(new_spte) || pfn_changed))
220 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
221 }
222 
223 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
224 					  u64 old_spte, u64 new_spte, int level)
225 {
226 	bool pfn_changed;
227 	struct kvm_memory_slot *slot;
228 
229 	if (level > PG_LEVEL_4K)
230 		return;
231 
232 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
233 
234 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
235 	    is_writable_pte(new_spte)) {
236 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
237 		mark_page_dirty_in_slot(kvm, slot, gfn);
238 	}
239 }
240 
241 /**
242  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
243  *
244  * @kvm: kvm instance
245  * @sp: the new page
246  * @shared: This operation may not be running under the exclusive use of
247  *	    the MMU lock and the operation must synchronize with other
248  *	    threads that might be adding or removing pages.
249  * @account_nx: This page replaces a NX large page and should be marked for
250  *		eventual reclaim.
251  */
252 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
253 			      bool shared, bool account_nx)
254 {
255 	if (shared)
256 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
257 	else
258 		lockdep_assert_held_write(&kvm->mmu_lock);
259 
260 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
261 	if (account_nx)
262 		account_huge_nx_page(kvm, sp);
263 
264 	if (shared)
265 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
266 }
267 
268 /**
269  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
270  *
271  * @kvm: kvm instance
272  * @sp: the page to be removed
273  * @shared: This operation may not be running under the exclusive use of
274  *	    the MMU lock and the operation must synchronize with other
275  *	    threads that might be adding or removing pages.
276  */
277 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
278 				bool shared)
279 {
280 	if (shared)
281 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282 	else
283 		lockdep_assert_held_write(&kvm->mmu_lock);
284 
285 	list_del(&sp->link);
286 	if (sp->lpage_disallowed)
287 		unaccount_huge_nx_page(kvm, sp);
288 
289 	if (shared)
290 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
291 }
292 
293 /**
294  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
295  *
296  * @kvm: kvm instance
297  * @pt: the page removed from the paging structure
298  * @shared: This operation may not be running under the exclusive use
299  *	    of the MMU lock and the operation must synchronize with other
300  *	    threads that might be modifying SPTEs.
301  *
302  * Given a page table that has been removed from the TDP paging structure,
303  * iterates through the page table to clear SPTEs and free child page tables.
304  */
305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
306 					bool shared)
307 {
308 	struct kvm_mmu_page *sp = sptep_to_sp(pt);
309 	int level = sp->role.level;
310 	gfn_t base_gfn = sp->gfn;
311 	u64 old_child_spte;
312 	u64 *sptep;
313 	gfn_t gfn;
314 	int i;
315 
316 	trace_kvm_mmu_prepare_zap_page(sp);
317 
318 	tdp_mmu_unlink_page(kvm, sp, shared);
319 
320 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
321 		sptep = pt + i;
322 		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
323 
324 		if (shared) {
325 			/*
326 			 * Set the SPTE to a nonpresent value that other
327 			 * threads will not overwrite. If the SPTE was
328 			 * already marked as removed then another thread
329 			 * handling a page fault could overwrite it, so
330 			 * keep retrying the exchange until the value read
331 			 * back is something other than the removed SPTE value.
332 			 */
333 			for (;;) {
334 				old_child_spte = xchg(sptep, REMOVED_SPTE);
335 				if (!is_removed_spte(old_child_spte))
336 					break;
337 				cpu_relax();
338 			}
339 		} else {
340 			old_child_spte = READ_ONCE(*sptep);
341 
342 			/*
343 			 * Marking the SPTE as a removed SPTE is not
344 			 * strictly necessary here as the MMU lock will
345 			 * stop other threads from concurrently modifying
346 			 * this SPTE. Using the removed SPTE value keeps
347 			 * the two branches consistent and simplifies
348 			 * the function.
349 			 */
350 			WRITE_ONCE(*sptep, REMOVED_SPTE);
351 		}
352 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
353 				    old_child_spte, REMOVED_SPTE, level - 1,
354 				    shared);
355 	}
356 
357 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
358 					   KVM_PAGES_PER_HPAGE(level));
359 
360 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
361 }
362 
363 /**
364  * handle_changed_spte - handle bookkeeping associated with an SPTE change
365  * @kvm: kvm instance
366  * @as_id: the address space of the paging structure the SPTE was a part of
367  * @gfn: the base GFN that was mapped by the SPTE
368  * @old_spte: The value of the SPTE before the change
369  * @new_spte: The value of the SPTE after the change
370  * @level: the level of the PT the SPTE is part of in the paging structure
371  * @shared: This operation may not be running under the exclusive use of
372  *	    the MMU lock and the operation must synchronize with other
373  *	    threads that might be modifying SPTEs.
374  *
375  * Handle bookkeeping that might result from the modification of a SPTE.
376  * This function must be called for all TDP SPTE modifications.
377  */
378 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
379 				  u64 old_spte, u64 new_spte, int level,
380 				  bool shared)
381 {
382 	bool was_present = is_shadow_present_pte(old_spte);
383 	bool is_present = is_shadow_present_pte(new_spte);
384 	bool was_leaf = was_present && is_last_spte(old_spte, level);
385 	bool is_leaf = is_present && is_last_spte(new_spte, level);
386 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
387 
388 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
389 	WARN_ON(level < PG_LEVEL_4K);
390 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
391 
392 	/*
393 	 * If this warning were to trigger it would indicate that there was a
394 	 * missing MMU notifier or a race with some notifier handler.
395 	 * A present leaf SPTE should never be directly replaced with another
396 	 * present leaf SPTE pointing to a different PFN. A notifier handler
397 	 * should be zapping the SPTE before the main MM's page table is
398 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
399 	 * thread before replacement.
400 	 */
401 	if (was_leaf && is_leaf && pfn_changed) {
402 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
403 		       "SPTE with another present leaf SPTE mapping a\n"
404 		       "different PFN!\n"
405 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
406 		       as_id, gfn, old_spte, new_spte, level);
407 
408 		/*
409 		 * Crash the host to prevent error propagation and guest data
410 		 * corruption.
411 		 */
412 		BUG();
413 	}
414 
415 	if (old_spte == new_spte)
416 		return;
417 
418 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
419 
420 	/*
421 	 * The only time a SPTE should be changed from a non-present to a
422 	 * non-present state is when an MMIO entry is installed/modified/
423 	 * removed. In that case, there is nothing to do here.
424 	 */
425 	if (!was_present && !is_present) {
426 		/*
427 		 * If this change does not involve a MMIO SPTE or removed SPTE,
428 		 * it is unexpected. Log the change, though it should not
429 		 * impact the guest since both the former and current SPTEs
430 		 * are nonpresent.
431 		 */
432 		if (WARN_ON(!is_mmio_spte(old_spte) &&
433 			    !is_mmio_spte(new_spte) &&
434 			    !is_removed_spte(new_spte)))
435 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
436 			       "should not be replaced with another,\n"
437 			       "different nonpresent SPTE, unless one or both\n"
438 			       "are MMIO SPTEs, or the new SPTE is\n"
439 			       "a temporary removed SPTE.\n"
440 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
441 			       as_id, gfn, old_spte, new_spte, level);
442 		return;
443 	}
444 
446 	if (was_leaf && is_dirty_spte(old_spte) &&
447 	    (!is_dirty_spte(new_spte) || pfn_changed))
448 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
449 
450 	/*
451 	 * Recursively handle child PTs if the change removed a subtree from
452 	 * the paging structure.
453 	 */
454 	if (was_present && !was_leaf && (pfn_changed || !is_present))
455 		handle_removed_tdp_mmu_page(kvm,
456 				spte_to_child_pt(old_spte, level), shared);
457 }
458 
459 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
460 				u64 old_spte, u64 new_spte, int level,
461 				bool shared)
462 {
463 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
464 			      shared);
465 	handle_changed_spte_acc_track(old_spte, new_spte, level);
466 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
467 				      new_spte, level);
468 }
469 
470 /*
471  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
472  * associated bookkeeping
473  *
474  * @kvm: kvm instance
475  * @iter: a tdp_iter instance currently on the SPTE that should be set
476  * @new_spte: The value the SPTE should be set to
477  * Returns: true if the SPTE was set, false if it was not. If false is returned,
478  *	    this function will have no side-effects.
479  */
480 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
481 					   struct tdp_iter *iter,
482 					   u64 new_spte)
483 {
484 	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
485 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
486 	int as_id = kvm_mmu_page_as_id(root);
487 
488 	lockdep_assert_held_read(&kvm->mmu_lock);
489 
490 	/*
491 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
492 	 * may modify it.
493 	 */
494 	if (iter->old_spte == REMOVED_SPTE)
495 		return false;
496 
497 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
498 		      new_spte) != iter->old_spte)
499 		return false;
500 
501 	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
502 			    iter->level, true);
503 
504 	return true;
505 }
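/*
 * Usage sketch (illustrative): callers running under the MMU read lock
 * typically treat a false return as "lost the race" and either retry the
 * walk or bail out and let the vCPU re-fault, e.g.:
 *
 *	if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 *
 * See tdp_mmu_map_handle_target_level() and kvm_tdp_mmu_map() below for the
 * real callers.
 */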
506 
507 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
508 					   struct tdp_iter *iter)
509 {
510 	/*
511 	 * Freeze the SPTE by setting it to a special,
512 	 * non-present value. This will stop other threads from
513 	 * immediately installing a present entry in its place
514 	 * before the TLBs are flushed.
515 	 */
516 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
517 		return false;
518 
519 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
520 					   KVM_PAGES_PER_HPAGE(iter->level));
521 
522 	/*
523 	 * No other thread can overwrite the removed SPTE, as any such
524 	 * thread must either wait on the MMU lock or use
525 	 * tdp_mmu_set_spte_atomic(), which will not overwrite the
526 	 * special removed SPTE value. No bookkeeping is needed
527 	 * here since the SPTE is going from non-present
528 	 * to non-present.
529 	 */
530 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
531 
532 	return true;
533 }
534 
536 /*
537  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
538  * @kvm: kvm instance
539  * @iter: a tdp_iter instance currently on the SPTE that should be set
540  * @new_spte: The value the SPTE should be set to
541  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
542  *		      of the page. Should be set unless handling an MMU
543  *		      notifier for access tracking. Leaving record_acc_track
544  *		      unset in that case prevents page accesses from being
545  *		      double counted.
546  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
547  *		      appropriate for the change being made. Should be set
548  *		      unless performing certain dirty logging operations.
549  *		      Leaving record_dirty_log unset in that case prevents page
550  *		      writes from being double counted.
551  */
552 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
553 				      u64 new_spte, bool record_acc_track,
554 				      bool record_dirty_log)
555 {
556 	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
557 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
558 	int as_id = kvm_mmu_page_as_id(root);
559 
560 	lockdep_assert_held_write(&kvm->mmu_lock);
561 
562 	/*
563 	 * No thread should be using this function to set SPTEs to the
564 	 * temporary removed SPTE value.
565 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
566 	 * should be used. If operating under the MMU lock in write mode, the
567 	 * use of the removed SPTE should not be necessary.
568 	 */
569 	WARN_ON(iter->old_spte == REMOVED_SPTE);
570 
571 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
572 
573 	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
574 			      iter->level, false);
575 	if (record_acc_track)
576 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
577 					      iter->level);
578 	if (record_dirty_log)
579 		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
580 					      iter->old_spte, new_spte,
581 					      iter->level);
582 }
583 
584 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
585 				    u64 new_spte)
586 {
587 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
588 }
589 
590 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
591 						 struct tdp_iter *iter,
592 						 u64 new_spte)
593 {
594 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
595 }
596 
597 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
598 						 struct tdp_iter *iter,
599 						 u64 new_spte)
600 {
601 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
602 }
603 
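/*
 * Iteration helpers: tdp_root_for_each_pte() walks every SPTE of a root in
 * the GFN range [_start, _end), tdp_root_for_each_leaf_pte() restricts the
 * walk to present leaf SPTEs, and tdp_mmu_for_each_pte() walks the paging
 * structure currently loaded for a vCPU's MMU (mmu->root_hpa).
 */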
604 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
605 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
606 
607 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
608 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
609 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
610 		    !is_last_spte(_iter.old_spte, _iter.level))		\
611 			continue;					\
612 		else
613 
614 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
615 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
616 			 _mmu->shadow_root_level, _start, _end)
617 
618 /*
619  * Yield if the MMU lock is contended or this thread needs to return control
620  * to the scheduler.
621  *
622  * If this function should yield and flush is set, it will perform a remote
623  * TLB flush before yielding.
624  *
625  * If this function yields, it will also reset the tdp_iter's walk over the
626  * paging structure and the calling function should skip to the next
627  * iteration to allow the iterator to continue its traversal from the
628  * paging structure root.
629  *
630  * Return true if this function yielded and the iterator's traversal was reset.
631  * Return false if a yield was not needed.
632  */
633 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
634 					     struct tdp_iter *iter, bool flush)
635 {
636 	/* Ensure forward progress has been made before yielding. */
637 	if (iter->next_last_level_gfn == iter->yielded_gfn)
638 		return false;
639 
640 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
641 		rcu_read_unlock();
642 
643 		if (flush)
644 			kvm_flush_remote_tlbs(kvm);
645 
646 		cond_resched_rwlock_write(&kvm->mmu_lock);
647 		rcu_read_lock();
648 
649 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
650 
651 		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
652 			       iter->root_level, iter->min_level,
653 			       iter->next_last_level_gfn);
654 
655 		return true;
656 	}
657 
658 	return false;
659 }
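/*
 * Typical caller pattern (sketch): because a yield resets the iterator, the
 * caller restarts the loop body and clears any "flush pending" state that
 * the yield has already handled, e.g.:
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */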
660 
661 /*
662  * Tears down the mappings for the range of gfns, [start, end), and frees the
663  * non-root pages mapping GFNs strictly within that range. Returns true if
664  * SPTEs have been cleared and a TLB flush is needed before releasing the
665  * MMU lock.
666  * If can_yield is true, will release the MMU lock and reschedule if the
667  * scheduler needs the CPU or there is contention on the MMU lock. If this
668  * function cannot yield, it will not release the MMU lock or reschedule and
669  * the caller must ensure it does not supply too large a GFN range, or the
670  * operation can cause a soft lockup.
671  */
672 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
673 			  gfn_t start, gfn_t end, bool can_yield)
674 {
675 	struct tdp_iter iter;
676 	bool flush_needed = false;
677 
678 	rcu_read_lock();
679 
680 	tdp_root_for_each_pte(iter, root, start, end) {
681 		if (can_yield &&
682 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
683 			flush_needed = false;
684 			continue;
685 		}
686 
687 		if (!is_shadow_present_pte(iter.old_spte))
688 			continue;
689 
690 		/*
691 		 * If this is a non-last-level SPTE that covers a larger range
692 		 * than should be zapped, continue, and zap the mappings at a
693 		 * lower level.
694 		 */
695 		if ((iter.gfn < start ||
696 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
697 		    !is_last_spte(iter.old_spte, iter.level))
698 			continue;
699 
700 		tdp_mmu_set_spte(kvm, &iter, 0);
701 		flush_needed = true;
702 	}
703 
704 	rcu_read_unlock();
705 	return flush_needed;
706 }
707 
708 /*
709  * Tears down the mappings for the range of gfns, [start, end), and frees the
710  * non-root pages mapping GFNs strictly within that range. Returns true if
711  * SPTEs have been cleared and a TLB flush is needed before releasing the
712  * MMU lock.
713  */
714 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
715 {
716 	struct kvm_mmu_page *root;
717 	bool flush = false;
718 
719 	for_each_tdp_mmu_root_yield_safe(kvm, root)
720 		flush |= zap_gfn_range(kvm, root, start, end, true);
721 
722 	return flush;
723 }
724 
725 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
726 {
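	/*
	 * max_gfn covers every possible GFN: with, e.g., shadow_phys_bits == 52
	 * and PAGE_SHIFT == 12 (illustrative values), max_gfn == 1ULL << 40,
	 * i.e. one past the last 4k frame of a 52-bit physical address space.
	 */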
727 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
728 	bool flush;
729 
730 	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
731 	if (flush)
732 		kvm_flush_remote_tlbs(kvm);
733 }
734 
735 /*
736  * Installs a last-level SPTE to handle a TDP page fault.
737  * (NPT/EPT violation/misconfiguration)
738  */
739 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
740 					  int map_writable,
741 					  struct tdp_iter *iter,
742 					  kvm_pfn_t pfn, bool prefault)
743 {
744 	u64 new_spte;
745 	int ret = 0;
746 	int make_spte_ret = 0;
747 
748 	if (unlikely(is_noslot_pfn(pfn)))
749 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
750 	else
751 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
752 					 pfn, iter->old_spte, prefault, true,
753 					 map_writable, !shadow_accessed_mask,
754 					 &new_spte);
755 
756 	if (new_spte == iter->old_spte)
757 		ret = RET_PF_SPURIOUS;
758 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
759 		return RET_PF_RETRY;
760 
761 	/*
762 	 * If the page fault was caused by a write but the page is write
763 	 * protected, emulation is needed. If the emulation was skipped,
764 	 * the vCPU would have the same fault again.
765 	 */
766 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
767 		if (write)
768 			ret = RET_PF_EMULATE;
769 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
770 	}
771 
772 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
773 	if (unlikely(is_mmio_spte(new_spte))) {
774 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
775 				     new_spte);
776 		ret = RET_PF_EMULATE;
777 	} else
778 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
779 				       rcu_dereference(iter->sptep));
780 
783 	if (!prefault)
784 		vcpu->stat.pf_fixed++;
785 
786 	return ret;
787 }
788 
789 /*
790  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
791  * page tables and SPTEs to translate the faulting guest physical address.
792  */
793 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
794 		    int map_writable, int max_level, kvm_pfn_t pfn,
795 		    bool prefault)
796 {
797 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
798 	bool write = error_code & PFERR_WRITE_MASK;
799 	bool exec = error_code & PFERR_FETCH_MASK;
800 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
801 	struct kvm_mmu *mmu = vcpu->arch.mmu;
802 	struct tdp_iter iter;
803 	struct kvm_mmu_page *sp;
804 	u64 *child_pt;
805 	u64 new_spte;
806 	int ret;
807 	gfn_t gfn = gpa >> PAGE_SHIFT;
808 	int level;
809 	int req_level;
810 
811 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
812 		return RET_PF_RETRY;
813 	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
814 		return RET_PF_RETRY;
815 
816 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
817 					huge_page_disallowed, &req_level);
818 
819 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
820 
821 	rcu_read_lock();
822 
823 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
824 		if (nx_huge_page_workaround_enabled)
825 			disallowed_hugepage_adjust(iter.old_spte, gfn,
826 						   iter.level, &pfn, &level);
827 
828 		if (iter.level == level)
829 			break;
830 
831 		/*
832 		 * If there is an SPTE mapping a large page at a higher level
833 		 * than the target, that SPTE must be cleared and replaced
834 		 * with a non-leaf SPTE.
835 		 */
836 		if (is_shadow_present_pte(iter.old_spte) &&
837 		    is_large_pte(iter.old_spte)) {
838 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
839 				break;
840 
841 			/*
842 			 * The iter must explicitly re-read the spte here
843 			 * because the new value informs the !present
844 			 * path below.
845 			 */
846 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
847 		}
848 
849 		if (!is_shadow_present_pte(iter.old_spte)) {
850 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
851 			child_pt = sp->spt;
852 
853 			new_spte = make_nonleaf_spte(child_pt,
854 						     !shadow_accessed_mask);
855 
856 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
857 						    new_spte)) {
858 				tdp_mmu_link_page(vcpu->kvm, sp, true,
859 						  huge_page_disallowed &&
860 						  req_level >= iter.level);
861 
862 				trace_kvm_mmu_get_page(sp, true);
863 			} else {
864 				tdp_mmu_free_sp(sp);
865 				break;
866 			}
867 		}
868 	}
869 
870 	if (iter.level != level) {
871 		rcu_read_unlock();
872 		return RET_PF_RETRY;
873 	}
874 
875 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
876 					      pfn, prefault);
877 	rcu_read_unlock();
878 
879 	return ret;
880 }
881 
882 static __always_inline int
883 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
884 			     unsigned long start,
885 			     unsigned long end,
886 			     unsigned long data,
887 			     int (*handler)(struct kvm *kvm,
888 					    struct kvm_memory_slot *slot,
889 					    struct kvm_mmu_page *root,
890 					    gfn_t start,
891 					    gfn_t end,
892 					    unsigned long data))
893 {
894 	struct kvm_memslots *slots;
895 	struct kvm_memory_slot *memslot;
896 	struct kvm_mmu_page *root;
897 	int ret = 0;
898 	int as_id;
899 
900 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
901 		as_id = kvm_mmu_page_as_id(root);
902 		slots = __kvm_memslots(kvm, as_id);
903 		kvm_for_each_memslot(memslot, slots) {
904 			unsigned long hva_start, hva_end;
905 			gfn_t gfn_start, gfn_end;
906 
907 			hva_start = max(start, memslot->userspace_addr);
908 			hva_end = min(end, memslot->userspace_addr +
909 				      (memslot->npages << PAGE_SHIFT));
910 			if (hva_start >= hva_end)
911 				continue;
912 			/*
913 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
914 			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
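			 * Illustrative example (hypothetical values): if
			 * hva_start is 3 pages past the page-aligned
			 * memslot->userspace_addr and hva_end is 7 pages past
			 * it, then gfn_start == memslot->base_gfn + 3 and
			 * gfn_end == memslot->base_gfn + 7.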
915 			 */
916 			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
917 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
918 
919 			ret |= handler(kvm, memslot, root, gfn_start,
920 				       gfn_end, data);
921 		}
922 	}
923 
924 	return ret;
925 }
926 
927 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
928 				     struct kvm_memory_slot *slot,
929 				     struct kvm_mmu_page *root, gfn_t start,
930 				     gfn_t end, unsigned long unused)
931 {
932 	return zap_gfn_range(kvm, root, start, end, false);
933 }
934 
935 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
936 			      unsigned long end)
937 {
938 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
939 					    zap_gfn_range_hva_wrapper);
940 }
941 
942 /*
943  * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and
944  * return non-zero if any of the GFNs in the range have been accessed.
945  */
946 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
947 			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
948 			 unsigned long unused)
949 {
950 	struct tdp_iter iter;
951 	int young = 0;
952 	u64 new_spte = 0;
953 
954 	rcu_read_lock();
955 
956 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
957 		/*
958 		 * If we have a non-accessed entry we don't need to change the
959 		 * pte.
960 		 */
961 		if (!is_accessed_spte(iter.old_spte))
962 			continue;
963 
964 		new_spte = iter.old_spte;
965 
966 		if (spte_ad_enabled(new_spte)) {
967 			clear_bit((ffs(shadow_accessed_mask) - 1),
968 				  (unsigned long *)&new_spte);
969 		} else {
970 			/*
971 			 * Capture the dirty status of the page, so that it doesn't get
972 			 * lost when the SPTE is marked for access tracking.
973 			 */
974 			if (is_writable_pte(new_spte))
975 				kvm_set_pfn_dirty(spte_to_pfn(new_spte));
976 
977 			new_spte = mark_spte_for_access_track(new_spte);
978 		}
979 		new_spte &= ~shadow_dirty_mask;
980 
981 		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
982 		young = 1;
983 
984 		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
985 	}
986 
987 	rcu_read_unlock();
988 
989 	return young;
990 }
991 
992 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
993 			      unsigned long end)
994 {
995 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
996 					    age_gfn_range);
997 }
998 
999 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1000 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1001 			unsigned long unused2)
1002 {
1003 	struct tdp_iter iter;
1004 
1005 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1006 		if (is_accessed_spte(iter.old_spte))
1007 			return 1;
1008 
1009 	return 0;
1010 }
1011 
1012 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1013 {
1014 	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1015 					    test_age_gfn);
1016 }
1017 
1018 /*
1019  * Handle the changed_pte MMU notifier for the TDP MMU.
1020  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1021  * notifier.
1022  * Flushes the TLBs itself where needed and always returns 0.
1023  */
1024 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1025 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1026 			unsigned long data)
1027 {
1028 	struct tdp_iter iter;
1029 	pte_t *ptep = (pte_t *)data;
1030 	kvm_pfn_t new_pfn;
1031 	u64 new_spte;
1032 	int need_flush = 0;
1033 
1034 	rcu_read_lock();
1035 
1036 	WARN_ON(pte_huge(*ptep));
1037 
1038 	new_pfn = pte_pfn(*ptep);
1039 
1040 	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1041 		if (iter.level != PG_LEVEL_4K)
1042 			continue;
1043 
1044 		if (!is_shadow_present_pte(iter.old_spte))
1045 			break;
1046 
1047 		tdp_mmu_set_spte(kvm, &iter, 0);
1048 
1049 		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1050 
1051 		if (!pte_write(*ptep)) {
1052 			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1053 					iter.old_spte, new_pfn);
1054 
1055 			tdp_mmu_set_spte(kvm, &iter, new_spte);
1056 		}
1057 
1058 		need_flush = 1;
1059 	}
1060 
1061 	if (need_flush)
1062 		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1063 
1064 	rcu_read_unlock();
1065 
1066 	return 0;
1067 }
1068 
1069 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1070 			     pte_t *host_ptep)
1071 {
1072 	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1073 					    (unsigned long)host_ptep,
1074 					    set_tdp_spte);
1075 }
1076 
1077 /*
1078  * Remove write access from all the SPTEs mapping GFNs [start, end). Will
1079  * only affect leaf SPTEs down to min_level.
1080  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1081  */
1082 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1083 			     gfn_t start, gfn_t end, int min_level)
1084 {
1085 	struct tdp_iter iter;
1086 	u64 new_spte;
1087 	bool spte_set = false;
1088 
1089 	rcu_read_lock();
1090 
1091 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1092 
1093 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1094 				   min_level, start, end) {
1095 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1096 			continue;
1097 
1098 		if (!is_shadow_present_pte(iter.old_spte) ||
1099 		    !is_last_spte(iter.old_spte, iter.level) ||
1100 		    !(iter.old_spte & PT_WRITABLE_MASK))
1101 			continue;
1102 
1103 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1104 
1105 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1106 		spte_set = true;
1107 	}
1108 
1109 	rcu_read_unlock();
1110 	return spte_set;
1111 }
1112 
1113 /*
1114  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1115  * only affect leaf SPTEs down to min_level.
1116  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1117  */
1118 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1119 			     int min_level)
1120 {
1121 	struct kvm_mmu_page *root;
1122 	int root_as_id;
1123 	bool spte_set = false;
1124 
1125 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1126 		root_as_id = kvm_mmu_page_as_id(root);
1127 		if (root_as_id != slot->as_id)
1128 			continue;
1129 
1130 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1131 			     slot->base_gfn + slot->npages, min_level);
1132 	}
1133 
1134 	return spte_set;
1135 }
1136 
1137 /*
1138  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1139  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1140  * If AD bits are not enabled, this will require clearing the writable bit on
1141  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1142  * be flushed.
1143  */
1144 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1145 			   gfn_t start, gfn_t end)
1146 {
1147 	struct tdp_iter iter;
1148 	u64 new_spte;
1149 	bool spte_set = false;
1150 
1151 	rcu_read_lock();
1152 
1153 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1154 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1155 			continue;
1156 
1157 		if (spte_ad_need_write_protect(iter.old_spte)) {
1158 			if (is_writable_pte(iter.old_spte))
1159 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1160 			else
1161 				continue;
1162 		} else {
1163 			if (iter.old_spte & shadow_dirty_mask)
1164 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1165 			else
1166 				continue;
1167 		}
1168 
1169 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1170 		spte_set = true;
1171 	}
1172 
1173 	rcu_read_unlock();
1174 	return spte_set;
1175 }
1176 
1177 /*
1178  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1179  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1180  * If AD bits are not enabled, this will require clearing the writable bit on
1181  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1182  * be flushed.
1183  */
1184 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1185 {
1186 	struct kvm_mmu_page *root;
1187 	int root_as_id;
1188 	bool spte_set = false;
1189 
1190 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1191 		root_as_id = kvm_mmu_page_as_id(root);
1192 		if (root_as_id != slot->as_id)
1193 			continue;
1194 
1195 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1196 				slot->base_gfn + slot->npages);
1197 	}
1198 
1199 	return spte_set;
1200 }
1201 
1202 /*
1203  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1204  * set in mask, starting at gfn. The given memslot is expected to contain all
1205  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1206  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1207  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1208  */
1209 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1210 				  gfn_t gfn, unsigned long mask, bool wrprot)
1211 {
1212 	struct tdp_iter iter;
1213 	u64 new_spte;
1214 
1215 	rcu_read_lock();
1216 
1217 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1218 				    gfn + BITS_PER_LONG) {
1219 		if (!mask)
1220 			break;
1221 
1222 		if (iter.level > PG_LEVEL_4K ||
1223 		    !(mask & (1UL << (iter.gfn - gfn))))
1224 			continue;
1225 
1226 		mask &= ~(1UL << (iter.gfn - gfn));
1227 
1228 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1229 			if (is_writable_pte(iter.old_spte))
1230 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1231 			else
1232 				continue;
1233 		} else {
1234 			if (iter.old_spte & shadow_dirty_mask)
1235 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1236 			else
1237 				continue;
1238 		}
1239 
1240 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1241 	}
1242 
1243 	rcu_read_unlock();
1244 }
1245 
1246 /*
1247  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1248  * set in mask, starting at gfn. The given memslot is expected to contain all
1249  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1250  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1251  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1252  */
1253 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1254 				       struct kvm_memory_slot *slot,
1255 				       gfn_t gfn, unsigned long mask,
1256 				       bool wrprot)
1257 {
1258 	struct kvm_mmu_page *root;
1259 	int root_as_id;
1260 
1261 	lockdep_assert_held_write(&kvm->mmu_lock);
1262 	for_each_tdp_mmu_root(kvm, root) {
1263 		root_as_id = kvm_mmu_page_as_id(root);
1264 		if (root_as_id != slot->as_id)
1265 			continue;
1266 
1267 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1268 	}
1269 }
1270 
1271 /*
1272  * Clear leaf entries which could be replaced by large mappings, for
1273  * GFNs within the slot.
1274  */
1275 static void zap_collapsible_spte_range(struct kvm *kvm,
1276 				       struct kvm_mmu_page *root,
1277 				       struct kvm_memory_slot *slot)
1278 {
1279 	gfn_t start = slot->base_gfn;
1280 	gfn_t end = start + slot->npages;
1281 	struct tdp_iter iter;
1282 	kvm_pfn_t pfn;
1283 	bool spte_set = false;
1284 
1285 	rcu_read_lock();
1286 
1287 	tdp_root_for_each_pte(iter, root, start, end) {
1288 		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1289 			spte_set = false;
1290 			continue;
1291 		}
1292 
1293 		if (!is_shadow_present_pte(iter.old_spte) ||
1294 		    !is_last_spte(iter.old_spte, iter.level))
1295 			continue;
1296 
1297 		pfn = spte_to_pfn(iter.old_spte);
1298 		if (kvm_is_reserved_pfn(pfn) ||
1299 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1300 							    pfn, PG_LEVEL_NUM))
1301 			continue;
1302 
1303 		tdp_mmu_set_spte(kvm, &iter, 0);
1304 
1305 		spte_set = true;
1306 	}
1307 
1308 	rcu_read_unlock();
1309 	if (spte_set)
1310 		kvm_flush_remote_tlbs(kvm);
1311 }
1312 
1313 /*
1314  * Zap the leaf entries which could be replaced by large mappings, for GFNs
1315  * within the slot, so that the range can later be mapped with huge pages.
1316  */
1317 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1318 				       struct kvm_memory_slot *slot)
1319 {
1320 	struct kvm_mmu_page *root;
1321 	int root_as_id;
1322 
1323 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1324 		root_as_id = kvm_mmu_page_as_id(root);
1325 		if (root_as_id != slot->as_id)
1326 			continue;
1327 
1328 		zap_collapsible_spte_range(kvm, root, slot);
1329 	}
1330 }
1331 
1332 /*
1333  * Removes write access on the last level SPTE mapping this GFN and unsets the
1334  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1335  * Returns true if an SPTE was set and a TLB flush is needed.
1336  */
1337 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1338 			      gfn_t gfn)
1339 {
1340 	struct tdp_iter iter;
1341 	u64 new_spte;
1342 	bool spte_set = false;
1343 
1344 	rcu_read_lock();
1345 
1346 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1347 		if (!is_writable_pte(iter.old_spte))
1348 			break;
1349 
1350 		new_spte = iter.old_spte &
1351 			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1352 
1353 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1354 		spte_set = true;
1355 	}
1356 
1357 	rcu_read_unlock();
1358 
1359 	return spte_set;
1360 }
1361 
1362 /*
1363  * Removes write access on the last level SPTE mapping this GFN and unsets the
1364  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1365  * Returns true if an SPTE was set and a TLB flush is needed.
1366  */
1367 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1368 				   struct kvm_memory_slot *slot, gfn_t gfn)
1369 {
1370 	struct kvm_mmu_page *root;
1371 	int root_as_id;
1372 	bool spte_set = false;
1373 
1374 	lockdep_assert_held_write(&kvm->mmu_lock);
1375 	for_each_tdp_mmu_root(kvm, root) {
1376 		root_as_id = kvm_mmu_page_as_id(root);
1377 		if (root_as_id != slot->as_id)
1378 			continue;
1379 
1380 		spte_set |= write_protect_gfn(kvm, root, gfn);
1381 	}
1382 	return spte_set;
1383 }
1384 
1385 /*
1386  * Return the level of the lowest level SPTE added to sptes.
1387  * That SPTE may be non-present.
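 * Note (descriptive): sptes[] is indexed by level, so the caller is assumed
 * to provide an array with at least *root_level + 1 entries; entries above
 * the returned leaf level hold the non-leaf SPTEs seen during the walk.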
1388  */
1389 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1390 			 int *root_level)
1391 {
1392 	struct tdp_iter iter;
1393 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1394 	gfn_t gfn = addr >> PAGE_SHIFT;
1395 	int leaf = -1;
1396 
1397 	*root_level = vcpu->arch.mmu->shadow_root_level;
1398 
1399 	rcu_read_lock();
1400 
1401 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1402 		leaf = iter.level;
1403 		sptes[leaf] = iter.old_spte;
1404 	}
1405 
1406 	rcu_read_unlock();
1407 
1408 	return leaf;
1409 }
1410