xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 8b0adbe3e38dbe5aae9edf6f5159ffdca7cfbdf1)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
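/*
 * Module parameter that controls whether KVM uses the TDP MMU. Hardware TDP
 * support (tdp_enabled) is also required; see kvm_mmu_init_tdp_mmu() below.
 */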
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29 
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32 	if (!kvm->arch.tdp_mmu_enabled)
33 		return;
34 
35 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36 
37 	/*
38 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 	 * can run before the VM is torn down.
40 	 */
41 	rcu_barrier();
42 }
43 
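/*
 * Drop a reference to the given TDP MMU root and free the root if that was
 * the last reference.
 */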
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46 	if (kvm_mmu_put_root(kvm, root))
47 		kvm_tdp_mmu_free_root(kvm, root);
48 }
49 
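/*
 * Returns false once the iteration has walked past the last root in the list.
 * Otherwise takes a reference on the root so it remains valid while the MMU
 * lock is dropped, and returns true.
 */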
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51 					   struct kvm_mmu_page *root)
52 {
53 	lockdep_assert_held_write(&kvm->mmu_lock);
54 
55 	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56 		return false;
57 
58 	kvm_mmu_get_root(kvm, root);
59 	return true;
60 
61 }
62 
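/*
 * Advance to the next root in the list and drop the reference taken on the
 * current root by tdp_mmu_next_root_valid().
 */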
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64 						     struct kvm_mmu_page *root)
65 {
66 	struct kvm_mmu_page *next_root;
67 
68 	next_root = list_next_entry(root, link);
69 	tdp_mmu_put_root(kvm, root);
70 	return next_root;
71 }
72 
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)				\
80 	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
81 				      typeof(*_root), link);		\
82 	     tdp_mmu_next_root_valid(_kvm, _root);			\
83 	     _root = tdp_mmu_next_root(_kvm, _root))
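/*
 * Typical usage of the iterator above (mirrors kvm_tdp_mmu_zap_gfn_range()
 * later in this file):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 */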
84 
85 #define for_each_tdp_mmu_root(_kvm, _root)				\
86 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
87 
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89 			  gfn_t start, gfn_t end, bool can_yield);
90 
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94 
95 	lockdep_assert_held_write(&kvm->mmu_lock);
96 
97 	WARN_ON(root->root_count);
98 	WARN_ON(!root->tdp_mmu_page);
99 
100 	list_del(&root->link);
101 
102 	zap_gfn_range(kvm, root, 0, max_gfn, false);
103 
104 	free_page((unsigned long)root->spt);
105 	kmem_cache_free(mmu_page_header_cache, root);
106 }
107 
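/*
 * Compute the page role for a TDP MMU page table at the given level. TDP MMU
 * pages are always direct, use 8-byte PTEs, and grant full access.
 */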
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109 						   int level)
110 {
111 	union kvm_mmu_page_role role;
112 
113 	role = vcpu->arch.mmu->mmu_role.base;
114 	role.level = level;
115 	role.direct = true;
116 	role.gpte_is_8_bytes = true;
117 	role.access = ACC_ALL;
118 
119 	return role;
120 }
121 
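/*
 * Allocate a struct kvm_mmu_page and its page table page from the vCPU's
 * memory caches and initialize them for the given gfn and level.
 */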
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123 					       int level)
124 {
125 	struct kvm_mmu_page *sp;
126 
127 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130 
131 	sp->role.word = page_role_for_level(vcpu, level).word;
132 	sp->gfn = gfn;
133 	sp->tdp_mmu_page = true;
134 
135 	trace_kvm_mmu_get_page(sp, true);
136 
137 	return sp;
138 }
139 
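/*
 * Search for an existing root with a matching role and take a reference on
 * it, or allocate a new root and add it to the list, under the MMU write
 * lock.
 */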
140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
141 {
142 	union kvm_mmu_page_role role;
143 	struct kvm *kvm = vcpu->kvm;
144 	struct kvm_mmu_page *root;
145 
146 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
147 
148 	write_lock(&kvm->mmu_lock);
149 
150 	/* Check for an existing root before allocating a new one. */
151 	for_each_tdp_mmu_root(kvm, root) {
152 		if (root->role.word == role.word) {
153 			kvm_mmu_get_root(kvm, root);
154 			write_unlock(&kvm->mmu_lock);
155 			return root;
156 		}
157 	}
158 
159 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
160 	root->root_count = 1;
161 
162 	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
163 
164 	write_unlock(&kvm->mmu_lock);
165 
166 	return root;
167 }
168 
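/*
 * Get (or create) the TDP MMU root for this vCPU and return its physical
 * address.
 */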
169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
170 {
171 	struct kvm_mmu_page *root;
172 
173 	root = get_tdp_mmu_vcpu_root(vcpu);
174 	if (!root)
175 		return INVALID_PAGE;
176 
177 	return __pa(root->spt);
178 }
179 
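/* Free a TDP MMU shadow page: its page table page and struct kvm_mmu_page. */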
180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
181 {
182 	free_page((unsigned long)sp->spt);
183 	kmem_cache_free(mmu_page_header_cache, sp);
184 }
185 
186 /*
187  * This is called through call_rcu in order to free TDP page table memory
188  * safely with respect to other kernel threads that may be operating on
189  * the memory.
190  * Because TDP MMU page table memory is only accessed within RCU read-side
191  * critical sections, and is only freed after an RCU grace period, lockless
192  * walkers can never use the memory after it has been freed.
193  */
194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
195 {
196 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
197 					       rcu_head);
198 
199 	tdp_mmu_free_sp(sp);
200 }
201 
202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
203 				u64 old_spte, u64 new_spte, int level,
204 				bool shared);
205 
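/*
 * Propagate the accessed state of a present leaf SPTE to the primary MM when
 * the SPTE loses its accessed bit or changes PFN.
 */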
206 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
207 {
208 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
209 
210 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
211 		return;
212 
213 	if (is_accessed_spte(old_spte) &&
214 	    (!is_accessed_spte(new_spte) || pfn_changed))
215 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
216 }
217 
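/*
 * Mark the GFN dirty in the memslot's dirty bitmap when a 4K SPTE becomes
 * writable, as the guest can then write to the page without faulting.
 */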
218 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
219 					  u64 old_spte, u64 new_spte, int level)
220 {
221 	bool pfn_changed;
222 	struct kvm_memory_slot *slot;
223 
224 	if (level > PG_LEVEL_4K)
225 		return;
226 
227 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
228 
229 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
230 	    is_writable_pte(new_spte)) {
231 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
232 		mark_page_dirty_in_slot(kvm, slot, gfn);
233 	}
234 }
235 
236 /**
237  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
238  *
239  * @kvm: kvm instance
240  * @sp: the new page
241  * @shared: This operation may not be running under the exclusive use of
242  *	    the MMU lock and the operation must synchronize with other
243  *	    threads that might be adding or removing pages.
244  * @account_nx: This page replaces a NX large page and should be marked for
245  *		eventual reclaim.
246  */
247 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
248 			      bool shared, bool account_nx)
249 {
250 	if (shared)
251 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
252 	else
253 		lockdep_assert_held_write(&kvm->mmu_lock);
254 
255 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
256 	if (account_nx)
257 		account_huge_nx_page(kvm, sp);
258 
259 	if (shared)
260 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
261 }
262 
263 /**
264  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
265  *
266  * @kvm: kvm instance
267  * @sp: the page to be removed
268  * @shared: This operation may not be running under the exclusive use of
269  *	    the MMU lock and the operation must synchronize with other
270  *	    threads that might be adding or removing pages.
271  */
272 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
273 				bool shared)
274 {
275 	if (shared)
276 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
277 	else
278 		lockdep_assert_held_write(&kvm->mmu_lock);
279 
280 	list_del(&sp->link);
281 	if (sp->lpage_disallowed)
282 		unaccount_huge_nx_page(kvm, sp);
283 
284 	if (shared)
285 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
286 }
287 
288 /**
289  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
290  *
291  * @kvm: kvm instance
292  * @pt: the page removed from the paging structure
293  * @shared: This operation may not be running under the exclusive use
294  *	    of the MMU lock and the operation must synchronize with other
295  *	    threads that might be modifying SPTEs.
296  *
297  * Given a page table that has been removed from the TDP paging structure,
298  * iterates through the page table to clear SPTEs and free child page tables.
299  *
300  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
301  * protection. Since this thread removed it from the paging structure,
302  * this thread will be responsible for ensuring the page is freed. Hence the
303  * early rcu_dereferences in the function.
304  */
305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
306 					bool shared)
307 {
308 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
309 	int level = sp->role.level;
310 	gfn_t base_gfn = sp->gfn;
311 	u64 old_child_spte;
312 	u64 *sptep;
313 	gfn_t gfn;
314 	int i;
315 
316 	trace_kvm_mmu_prepare_zap_page(sp);
317 
318 	tdp_mmu_unlink_page(kvm, sp, shared);
319 
320 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
321 		sptep = rcu_dereference(pt) + i;
322 		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
323 
324 		if (shared) {
325 			/*
326 			 * Set the SPTE to a nonpresent value that other
327 			 * threads will not overwrite. If the SPTE was
328 			 * already marked as removed then another thread
329 			 * handling a page fault could overwrite it, so
330 			 * retry the exchange until the SPTE transitions from
331 			 * some other value to the removed SPTE value.
332 			 */
333 			for (;;) {
334 				old_child_spte = xchg(sptep, REMOVED_SPTE);
335 				if (!is_removed_spte(old_child_spte))
336 					break;
337 				cpu_relax();
338 			}
339 		} else {
340 			/*
341 			 * If the SPTE is not MMU-present, there is no backing
342 			 * page associated with the SPTE and so no side effects
343 			 * that need to be recorded, and exclusive ownership of
344 			 * mmu_lock ensures the SPTE can't be made present.
345 			 * Note, zapping MMIO SPTEs is also unnecessary as they
346 			 * are guarded by the memslots generation, not by being
347 			 * unreachable.
348 			 */
349 			old_child_spte = READ_ONCE(*sptep);
350 			if (!is_shadow_present_pte(old_child_spte))
351 				continue;
352 
353 			/*
354 			 * Marking the SPTE as a removed SPTE is not
355 			 * strictly necessary here as the MMU lock will
356 			 * stop other threads from concurrently modifying
357 			 * this SPTE. Using the removed SPTE value keeps
358 			 * the two branches consistent and simplifies
359 			 * the function.
360 			 */
361 			WRITE_ONCE(*sptep, REMOVED_SPTE);
362 		}
363 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
364 				    old_child_spte, REMOVED_SPTE, level - 1,
365 				    shared);
366 	}
367 
368 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
369 					   KVM_PAGES_PER_HPAGE(level));
370 
371 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
372 }
373 
374 /**
375  * handle_changed_spte - handle bookkeeping associated with an SPTE change
376  * @kvm: kvm instance
377  * @as_id: the address space of the paging structure the SPTE was a part of
378  * @gfn: the base GFN that was mapped by the SPTE
379  * @old_spte: The value of the SPTE before the change
380  * @new_spte: The value of the SPTE after the change
381  * @level: the level of the PT the SPTE is part of in the paging structure
382  * @shared: This operation may not be running under the exclusive use of
383  *	    the MMU lock and the operation must synchronize with other
384  *	    threads that might be modifying SPTEs.
385  *
386  * Handle bookkeeping that might result from the modification of a SPTE.
387  * This function must be called for all TDP SPTE modifications.
388  */
389 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
390 				  u64 old_spte, u64 new_spte, int level,
391 				  bool shared)
392 {
393 	bool was_present = is_shadow_present_pte(old_spte);
394 	bool is_present = is_shadow_present_pte(new_spte);
395 	bool was_leaf = was_present && is_last_spte(old_spte, level);
396 	bool is_leaf = is_present && is_last_spte(new_spte, level);
397 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
398 
399 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
400 	WARN_ON(level < PG_LEVEL_4K);
401 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
402 
403 	/*
404 	 * If this warning were to trigger it would indicate that there was a
405 	 * missing MMU notifier or a race with some notifier handler.
406 	 * A present, leaf SPTE should never be directly replaced with another
407 	 * present leaf SPTE pointing to a different PFN. A notifier handler
408 	 * should be zapping the SPTE before the main MM's page table is
409 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
410 	 * thread before replacement.
411 	 */
412 	if (was_leaf && is_leaf && pfn_changed) {
413 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
414 		       "SPTE with another present leaf SPTE mapping a\n"
415 		       "different PFN!\n"
416 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
417 		       as_id, gfn, old_spte, new_spte, level);
418 
419 		/*
420 		 * Crash the host to prevent error propagation and guest data
421 		 * corruption.
422 		 */
423 		BUG();
424 	}
425 
426 	if (old_spte == new_spte)
427 		return;
428 
429 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
430 
431 	/*
432 	 * The only time a SPTE should be changed from a non-present to
433 	 * non-present state is when an MMIO entry is installed/modified/
434 	 * removed. In that case, there is nothing to do here.
435 	 */
436 	if (!was_present && !is_present) {
437 		/*
438 		 * If this change does not involve a MMIO SPTE or removed SPTE,
439 		 * it is unexpected. Log the change, though it should not
440 		 * impact the guest since both the former and current SPTEs
441 		 * are nonpresent.
442 		 */
443 		if (WARN_ON(!is_mmio_spte(old_spte) &&
444 			    !is_mmio_spte(new_spte) &&
445 			    !is_removed_spte(new_spte)))
446 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
447 			       "should not be replaced with another,\n"
448 			       "different nonpresent SPTE, unless one or both\n"
449 			       "are MMIO SPTEs, or the new SPTE is\n"
450 			       "a temporary removed SPTE.\n"
451 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
452 			       as_id, gfn, old_spte, new_spte, level);
453 		return;
454 	}
455 
456 
457 	if (was_leaf && is_dirty_spte(old_spte) &&
458 	    (!is_dirty_spte(new_spte) || pfn_changed))
459 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
460 
461 	/*
462 	 * Recursively handle child PTs if the change removed a subtree from
463 	 * the paging structure.
464 	 */
465 	if (was_present && !was_leaf && (pfn_changed || !is_present))
466 		handle_removed_tdp_mmu_page(kvm,
467 				spte_to_child_pt(old_spte, level), shared);
468 }
469 
470 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
471 				u64 old_spte, u64 new_spte, int level,
472 				bool shared)
473 {
474 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
475 			      shared);
476 	handle_changed_spte_acc_track(old_spte, new_spte, level);
477 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
478 				      new_spte, level);
479 }
480 
481 /*
482  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
483  * associated bookkeeping
484  *
485  * @kvm: kvm instance
486  * @iter: a tdp_iter instance currently on the SPTE that should be set
487  * @new_spte: The value the SPTE should be set to
488  * Returns: true if the SPTE was set, false if it was not. If false is returned,
489  *	    this function will have no side-effects.
490  */
491 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
492 					   struct tdp_iter *iter,
493 					   u64 new_spte)
494 {
495 	lockdep_assert_held_read(&kvm->mmu_lock);
496 
497 	/*
498 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
499 	 * may modify it.
500 	 */
501 	if (iter->old_spte == REMOVED_SPTE)
502 		return false;
503 
504 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
505 		      new_spte) != iter->old_spte)
506 		return false;
507 
508 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
509 			    new_spte, iter->level, true);
510 
511 	return true;
512 }
513 
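/*
 * tdp_mmu_zap_spte_atomic - Atomically clear an SPTE, flushing the TLBs for
 * the affected range before any other thread can install a new mapping.
 * Returns false if the SPTE could not be frozen, e.g. because another thread
 * changed or removed it first.
 */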
514 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
515 					   struct tdp_iter *iter)
516 {
517 	/*
518 	 * Freeze the SPTE by setting it to a special,
519 	 * non-present value. This will stop other threads from
520 	 * immediately installing a present entry in its place
521 	 * before the TLBs are flushed.
522 	 */
523 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
524 		return false;
525 
526 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
527 					   KVM_PAGES_PER_HPAGE(iter->level));
528 
529 	/*
530 	 * No other thread can overwrite the removed SPTE as they
531 	 * must either wait on the MMU lock or use
532 	 * tdp_mmu_set_spte_atomic() which will not overwrite the
533 	 * special removed SPTE value. No bookkeeping is needed
534 	 * here since the SPTE is going from non-present
535 	 * to non-present.
536 	 */
537 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
538 
539 	return true;
540 }
541 
542 
543 /*
544  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
545  * @kvm: kvm instance
546  * @iter: a tdp_iter instance currently on the SPTE that should be set
547  * @new_spte: The value the SPTE should be set to
548  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
549  *		      of the page. Should be set unless handling an MMU
550  *		      notifier for access tracking. Leaving record_acc_track
551  *		      unset in that case prevents page accesses from being
552  *		      double counted.
553  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
554  *		      appropriate for the change being made. Should be set
555  *		      unless performing certain dirty logging operations.
556  *		      Leaving record_dirty_log unset in that case prevents page
557  *		      writes from being double counted.
558  */
559 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
560 				      u64 new_spte, bool record_acc_track,
561 				      bool record_dirty_log)
562 {
563 	lockdep_assert_held_write(&kvm->mmu_lock);
564 
565 	/*
566 	 * No thread should be using this function to set SPTEs to the
567 	 * temporary removed SPTE value.
568 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
569 	 * should be used. If operating under the MMU lock in write mode, the
570 	 * use of the removed SPTE should not be necessary.
571 	 */
572 	WARN_ON(iter->old_spte == REMOVED_SPTE);
573 
574 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
575 
576 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
577 			      new_spte, iter->level, false);
578 	if (record_acc_track)
579 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
580 					      iter->level);
581 	if (record_dirty_log)
582 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
583 					      iter->old_spte, new_spte,
584 					      iter->level);
585 }
586 
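/*
 * Wrappers around __tdp_mmu_set_spte(). The _no_acc_track and _no_dirty_log
 * variants skip the corresponding bookkeeping, as described above
 * __tdp_mmu_set_spte().
 */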
587 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
588 				    u64 new_spte)
589 {
590 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
591 }
592 
593 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
594 						 struct tdp_iter *iter,
595 						 u64 new_spte)
596 {
597 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
598 }
599 
600 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
601 						 struct tdp_iter *iter,
602 						 u64 new_spte)
603 {
604 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
605 }
606 
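/*
 * Iterators over the SPTEs covering GFNs [_start, _end) under a given root,
 * or, for tdp_mmu_for_each_pte(), under the vCPU's current root.
 */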
607 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
608 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
609 
610 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
611 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
612 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
613 		    !is_last_spte(_iter.old_spte, _iter.level))		\
614 			continue;					\
615 		else
616 
617 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
618 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
619 			 _mmu->shadow_root_level, _start, _end)
620 
621 /*
622  * Yield if the MMU lock is contended or this thread needs to return control
623  * to the scheduler.
624  *
625  * If this function should yield and flush is set, it will perform a remote
626  * TLB flush before yielding.
627  *
628  * If this function yields, it will also reset the tdp_iter's walk over the
629  * paging structure and the calling function should skip to the next
630  * iteration to allow the iterator to continue its traversal from the
631  * paging structure root.
632  *
633  * Return true if this function yielded and the iterator's traversal was reset.
634  * Return false if a yield was not needed.
635  */
636 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
637 					     struct tdp_iter *iter, bool flush)
638 {
639 	/* Ensure forward progress has been made before yielding. */
640 	if (iter->next_last_level_gfn == iter->yielded_gfn)
641 		return false;
642 
643 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
644 		rcu_read_unlock();
645 
646 		if (flush)
647 			kvm_flush_remote_tlbs(kvm);
648 
649 		cond_resched_rwlock_write(&kvm->mmu_lock);
650 		rcu_read_lock();
651 
652 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
653 
654 		tdp_iter_restart(iter);
655 
656 		return true;
657 	}
658 
659 	return false;
660 }
661 
662 /*
663  * Tears down the mappings for the range of gfns, [start, end), and frees the
664  * non-root pages mapping GFNs strictly within that range. Returns true if
665  * SPTEs have been cleared and a TLB flush is needed before releasing the
666  * MMU lock.
667  * If can_yield is true, will release the MMU lock and reschedule if the
668  * scheduler needs the CPU or there is contention on the MMU lock. If this
669  * function cannot yield, it will not release the MMU lock or reschedule and
670  * the caller must ensure it does not supply too large a GFN range, or the
671  * operation can cause a soft lockup.
672  */
673 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
674 			  gfn_t start, gfn_t end, bool can_yield)
675 {
676 	struct tdp_iter iter;
677 	bool flush_needed = false;
678 
679 	rcu_read_lock();
680 
681 	tdp_root_for_each_pte(iter, root, start, end) {
682 		if (can_yield &&
683 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
684 			flush_needed = false;
685 			continue;
686 		}
687 
688 		if (!is_shadow_present_pte(iter.old_spte))
689 			continue;
690 
691 		/*
692 		 * If this is a non-last-level SPTE that covers a larger range
693 		 * than should be zapped, continue, and zap the mappings at a
694 		 * lower level.
695 		 */
696 		if ((iter.gfn < start ||
697 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
698 		    !is_last_spte(iter.old_spte, iter.level))
699 			continue;
700 
701 		tdp_mmu_set_spte(kvm, &iter, 0);
702 		flush_needed = true;
703 	}
704 
705 	rcu_read_unlock();
706 	return flush_needed;
707 }
708 
709 /*
710  * Tears down the mappings for the range of gfns, [start, end), and frees the
711  * non-root pages mapping GFNs strictly within that range. Returns true if
712  * SPTEs have been cleared and a TLB flush is needed before releasing the
713  * MMU lock.
714  */
715 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
716 {
717 	struct kvm_mmu_page *root;
718 	bool flush = false;
719 
720 	for_each_tdp_mmu_root_yield_safe(kvm, root)
721 		flush |= zap_gfn_range(kvm, root, start, end, true);
722 
723 	return flush;
724 }
725 
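/* Zap every GFN mapped by the TDP MMU and flush the TLBs if anything was zapped. */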
726 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
727 {
728 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
729 	bool flush;
730 
731 	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
732 	if (flush)
733 		kvm_flush_remote_tlbs(kvm);
734 }
735 
736 /*
737  * Installs a last-level SPTE to handle a TDP page fault.
738  * (NPT/EPT violation/misconfiguration)
739  */
740 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
741 					  int map_writable,
742 					  struct tdp_iter *iter,
743 					  kvm_pfn_t pfn, bool prefault)
744 {
745 	u64 new_spte;
746 	int ret = 0;
747 	int make_spte_ret = 0;
748 
749 	if (unlikely(is_noslot_pfn(pfn)))
750 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
751 	else
752 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
753 					 pfn, iter->old_spte, prefault, true,
754 					 map_writable, !shadow_accessed_mask,
755 					 &new_spte);
756 
757 	if (new_spte == iter->old_spte)
758 		ret = RET_PF_SPURIOUS;
759 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
760 		return RET_PF_RETRY;
761 
762 	/*
763 	 * If the page fault was caused by a write but the page is write
764 	 * protected, emulation is needed. If emulation were skipped,
765 	 * the vCPU would simply take the same fault again.
766 	 */
767 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
768 		if (write)
769 			ret = RET_PF_EMULATE;
770 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
771 	}
772 
773 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
774 	if (unlikely(is_mmio_spte(new_spte))) {
775 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
776 				     new_spte);
777 		ret = RET_PF_EMULATE;
778 	} else
779 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
780 				       rcu_dereference(iter->sptep));
781 
784 	if (!prefault)
785 		vcpu->stat.pf_fixed++;
786 
787 	return ret;
788 }
789 
790 /*
791  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
792  * page tables and SPTEs to translate the faulting guest physical address.
793  */
794 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
795 		    int map_writable, int max_level, kvm_pfn_t pfn,
796 		    bool prefault)
797 {
798 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
799 	bool write = error_code & PFERR_WRITE_MASK;
800 	bool exec = error_code & PFERR_FETCH_MASK;
801 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
802 	struct kvm_mmu *mmu = vcpu->arch.mmu;
803 	struct tdp_iter iter;
804 	struct kvm_mmu_page *sp;
805 	u64 *child_pt;
806 	u64 new_spte;
807 	int ret;
808 	gfn_t gfn = gpa >> PAGE_SHIFT;
809 	int level;
810 	int req_level;
811 
812 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
813 		return RET_PF_RETRY;
814 	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
815 		return RET_PF_RETRY;
816 
817 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
818 					huge_page_disallowed, &req_level);
819 
820 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
821 
822 	rcu_read_lock();
823 
824 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
825 		if (nx_huge_page_workaround_enabled)
826 			disallowed_hugepage_adjust(iter.old_spte, gfn,
827 						   iter.level, &pfn, &level);
828 
829 		if (iter.level == level)
830 			break;
831 
832 		/*
833 		 * If there is an SPTE mapping a large page at a higher level
834 		 * than the target, that SPTE must be cleared and replaced
835 		 * with a non-leaf SPTE.
836 		 */
837 		if (is_shadow_present_pte(iter.old_spte) &&
838 		    is_large_pte(iter.old_spte)) {
839 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
840 				break;
841 
842 			/*
843 			 * The iter must explicitly re-read the spte here
844 			 * because the new value informs the !present
845 			 * path below.
846 			 */
847 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
848 		}
849 
850 		if (!is_shadow_present_pte(iter.old_spte)) {
851 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
852 			child_pt = sp->spt;
853 
854 			new_spte = make_nonleaf_spte(child_pt,
855 						     !shadow_accessed_mask);
856 
857 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
858 						    new_spte)) {
859 				tdp_mmu_link_page(vcpu->kvm, sp, true,
860 						  huge_page_disallowed &&
861 						  req_level >= iter.level);
862 
863 				trace_kvm_mmu_get_page(sp, true);
864 			} else {
865 				tdp_mmu_free_sp(sp);
866 				break;
867 			}
868 		}
869 	}
870 
871 	if (iter.level != level) {
872 		rcu_read_unlock();
873 		return RET_PF_RETRY;
874 	}
875 
876 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
877 					      pfn, prefault);
878 	rcu_read_unlock();
879 
880 	return ret;
881 }
882 
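/*
 * Invoke the given handler on each TDP MMU root, for each memslot GFN range
 * that intersects the HVA range [start, end). The handlers' return values are
 * OR'd together.
 */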
883 static __always_inline int
884 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
885 			     unsigned long start,
886 			     unsigned long end,
887 			     unsigned long data,
888 			     int (*handler)(struct kvm *kvm,
889 					    struct kvm_memory_slot *slot,
890 					    struct kvm_mmu_page *root,
891 					    gfn_t start,
892 					    gfn_t end,
893 					    unsigned long data))
894 {
895 	struct kvm_memslots *slots;
896 	struct kvm_memory_slot *memslot;
897 	struct kvm_mmu_page *root;
898 	int ret = 0;
899 	int as_id;
900 
901 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
902 		as_id = kvm_mmu_page_as_id(root);
903 		slots = __kvm_memslots(kvm, as_id);
904 		kvm_for_each_memslot(memslot, slots) {
905 			unsigned long hva_start, hva_end;
906 			gfn_t gfn_start, gfn_end;
907 
908 			hva_start = max(start, memslot->userspace_addr);
909 			hva_end = min(end, memslot->userspace_addr +
910 				      (memslot->npages << PAGE_SHIFT));
911 			if (hva_start >= hva_end)
912 				continue;
913 			/*
914 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
915 			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
916 			 */
917 			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
918 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
919 
920 			ret |= handler(kvm, memslot, root, gfn_start,
921 				       gfn_end, data);
922 		}
923 	}
924 
925 	return ret;
926 }
927 
928 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
929 				     struct kvm_memory_slot *slot,
930 				     struct kvm_mmu_page *root, gfn_t start,
931 				     gfn_t end, unsigned long unused)
932 {
933 	return zap_gfn_range(kvm, root, start, end, false);
934 }
935 
936 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
937 			      unsigned long end)
938 {
939 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
940 					    zap_gfn_range_hva_wrapper);
941 }
942 
943 /*
944  * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return non-zero
945  * if any of the GFNs in the range have been accessed.
946  */
947 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
948 			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
949 			 unsigned long unused)
950 {
951 	struct tdp_iter iter;
952 	int young = 0;
953 	u64 new_spte = 0;
954 
955 	rcu_read_lock();
956 
957 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
958 		/*
959 		 * If we have a non-accessed entry we don't need to change the
960 		 * pte.
961 		 */
962 		if (!is_accessed_spte(iter.old_spte))
963 			continue;
964 
965 		new_spte = iter.old_spte;
966 
967 		if (spte_ad_enabled(new_spte)) {
968 			clear_bit((ffs(shadow_accessed_mask) - 1),
969 				  (unsigned long *)&new_spte);
970 		} else {
971 			/*
972 			 * Capture the dirty status of the page, so that it doesn't get
973 			 * lost when the SPTE is marked for access tracking.
974 			 */
975 			if (is_writable_pte(new_spte))
976 				kvm_set_pfn_dirty(spte_to_pfn(new_spte));
977 
978 			new_spte = mark_spte_for_access_track(new_spte);
979 		}
980 		new_spte &= ~shadow_dirty_mask;
981 
982 		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
983 		young = 1;
984 
985 		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
986 	}
987 
988 	rcu_read_unlock();
989 
990 	return young;
991 }
992 
993 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
994 			      unsigned long end)
995 {
996 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
997 					    age_gfn_range);
998 }
999 
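/* Return 1 if any last-level SPTE mapping the gfn has been marked accessed. */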
1000 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1001 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1002 			unsigned long unused2)
1003 {
1004 	struct tdp_iter iter;
1005 
1006 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1007 		if (is_accessed_spte(iter.old_spte))
1008 			return 1;
1009 
1010 	return 0;
1011 }
1012 
1013 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1014 {
1015 	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1016 					    test_age_gfn);
1017 }
1018 
1019 /*
1020  * Handle the changed_pte MMU notifier for the TDP MMU.
1021  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1022  * notifier.
1023  * Any needed TLB flush is performed here, so the handler always returns 0.
1024  */
1025 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1026 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1027 			unsigned long data)
1028 {
1029 	struct tdp_iter iter;
1030 	pte_t *ptep = (pte_t *)data;
1031 	kvm_pfn_t new_pfn;
1032 	u64 new_spte;
1033 	int need_flush = 0;
1034 
1035 	rcu_read_lock();
1036 
1037 	WARN_ON(pte_huge(*ptep));
1038 
1039 	new_pfn = pte_pfn(*ptep);
1040 
1041 	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1042 		if (iter.level != PG_LEVEL_4K)
1043 			continue;
1044 
1045 		if (!is_shadow_present_pte(iter.old_spte))
1046 			break;
1047 
1048 		tdp_mmu_set_spte(kvm, &iter, 0);
1049 
1050 		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1051 
1052 		if (!pte_write(*ptep)) {
1053 			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1054 					iter.old_spte, new_pfn);
1055 
1056 			tdp_mmu_set_spte(kvm, &iter, new_spte);
1057 		}
1058 
1059 		need_flush = 1;
1060 	}
1061 
1062 	if (need_flush)
1063 		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1064 
1065 	rcu_read_unlock();
1066 
1067 	return 0;
1068 }
1069 
1070 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1071 			     pte_t *host_ptep)
1072 {
1073 	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1074 					    (unsigned long)host_ptep,
1075 					    set_tdp_spte);
1076 }
1077 
1078 /*
1079  * Remove write access from all the SPTEs mapping GFNs [start, end). Will
1080  * only affect leaf SPTEs down to min_level.
1081  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1082  */
1083 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1084 			     gfn_t start, gfn_t end, int min_level)
1085 {
1086 	struct tdp_iter iter;
1087 	u64 new_spte;
1088 	bool spte_set = false;
1089 
1090 	rcu_read_lock();
1091 
1092 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1093 
1094 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1095 				   min_level, start, end) {
1096 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1097 			continue;
1098 
1099 		if (!is_shadow_present_pte(iter.old_spte) ||
1100 		    !is_last_spte(iter.old_spte, iter.level) ||
1101 		    !(iter.old_spte & PT_WRITABLE_MASK))
1102 			continue;
1103 
1104 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1105 
1106 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1107 		spte_set = true;
1108 	}
1109 
1110 	rcu_read_unlock();
1111 	return spte_set;
1112 }
1113 
1114 /*
1115  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1116  * only affect leaf SPTEs down to min_level.
1117  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1118  */
1119 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1120 			     int min_level)
1121 {
1122 	struct kvm_mmu_page *root;
1123 	int root_as_id;
1124 	bool spte_set = false;
1125 
1126 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1127 		root_as_id = kvm_mmu_page_as_id(root);
1128 		if (root_as_id != slot->as_id)
1129 			continue;
1130 
1131 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1132 			     slot->base_gfn + slot->npages, min_level);
1133 	}
1134 
1135 	return spte_set;
1136 }
1137 
1138 /*
1139  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1140  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1141  * If AD bits are not enabled, this will require clearing the writable bit on
1142  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1143  * be flushed.
1144  */
1145 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1146 			   gfn_t start, gfn_t end)
1147 {
1148 	struct tdp_iter iter;
1149 	u64 new_spte;
1150 	bool spte_set = false;
1151 
1152 	rcu_read_lock();
1153 
1154 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1155 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1156 			continue;
1157 
1158 		if (spte_ad_need_write_protect(iter.old_spte)) {
1159 			if (is_writable_pte(iter.old_spte))
1160 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1161 			else
1162 				continue;
1163 		} else {
1164 			if (iter.old_spte & shadow_dirty_mask)
1165 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1166 			else
1167 				continue;
1168 		}
1169 
1170 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1171 		spte_set = true;
1172 	}
1173 
1174 	rcu_read_unlock();
1175 	return spte_set;
1176 }
1177 
1178 /*
1179  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1180  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1181  * If AD bits are not enabled, this will require clearing the writable bit on
1182  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1183  * be flushed.
1184  */
1185 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1186 {
1187 	struct kvm_mmu_page *root;
1188 	int root_as_id;
1189 	bool spte_set = false;
1190 
1191 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1192 		root_as_id = kvm_mmu_page_as_id(root);
1193 		if (root_as_id != slot->as_id)
1194 			continue;
1195 
1196 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1197 				slot->base_gfn + slot->npages);
1198 	}
1199 
1200 	return spte_set;
1201 }
1202 
1203 /*
1204  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1205  * set in mask, starting at gfn. The given memslot is expected to contain all
1206  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1207  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1208  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1209  */
1210 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1211 				  gfn_t gfn, unsigned long mask, bool wrprot)
1212 {
1213 	struct tdp_iter iter;
1214 	u64 new_spte;
1215 
1216 	rcu_read_lock();
1217 
1218 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1219 				    gfn + BITS_PER_LONG) {
1220 		if (!mask)
1221 			break;
1222 
1223 		if (iter.level > PG_LEVEL_4K ||
1224 		    !(mask & (1UL << (iter.gfn - gfn))))
1225 			continue;
1226 
1227 		mask &= ~(1UL << (iter.gfn - gfn));
1228 
1229 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1230 			if (is_writable_pte(iter.old_spte))
1231 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1232 			else
1233 				continue;
1234 		} else {
1235 			if (iter.old_spte & shadow_dirty_mask)
1236 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1237 			else
1238 				continue;
1239 		}
1240 
1241 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1242 	}
1243 
1244 	rcu_read_unlock();
1245 }
1246 
1247 /*
1248  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1249  * set in mask, starting at gfn. The given memslot is expected to contain all
1250  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1251  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1252  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1253  */
1254 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1255 				       struct kvm_memory_slot *slot,
1256 				       gfn_t gfn, unsigned long mask,
1257 				       bool wrprot)
1258 {
1259 	struct kvm_mmu_page *root;
1260 	int root_as_id;
1261 
1262 	lockdep_assert_held_write(&kvm->mmu_lock);
1263 	for_each_tdp_mmu_root(kvm, root) {
1264 		root_as_id = kvm_mmu_page_as_id(root);
1265 		if (root_as_id != slot->as_id)
1266 			continue;
1267 
1268 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1269 	}
1270 }
1271 
1272 /*
1273  * Clear leaf entries which could be replaced by large mappings, for
1274  * GFNs within the slot.
1275  */
1276 static void zap_collapsible_spte_range(struct kvm *kvm,
1277 				       struct kvm_mmu_page *root,
1278 				       struct kvm_memory_slot *slot)
1279 {
1280 	gfn_t start = slot->base_gfn;
1281 	gfn_t end = start + slot->npages;
1282 	struct tdp_iter iter;
1283 	kvm_pfn_t pfn;
1284 	bool spte_set = false;
1285 
1286 	rcu_read_lock();
1287 
1288 	tdp_root_for_each_pte(iter, root, start, end) {
1289 		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1290 			spte_set = false;
1291 			continue;
1292 		}
1293 
1294 		if (!is_shadow_present_pte(iter.old_spte) ||
1295 		    !is_last_spte(iter.old_spte, iter.level))
1296 			continue;
1297 
1298 		pfn = spte_to_pfn(iter.old_spte);
1299 		if (kvm_is_reserved_pfn(pfn) ||
1300 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1301 							    pfn, PG_LEVEL_NUM))
1302 			continue;
1303 
1304 		tdp_mmu_set_spte(kvm, &iter, 0);
1305 
1306 		spte_set = true;
1307 	}
1308 
1309 	rcu_read_unlock();
1310 	if (spte_set)
1311 		kvm_flush_remote_tlbs(kvm);
1312 }
1313 
1314 /*
1315  * Clear leaf entries which could be replaced by large mappings, for
1316  * GFNs within the slot.
1317  */
1318 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1319 				       struct kvm_memory_slot *slot)
1320 {
1321 	struct kvm_mmu_page *root;
1322 	int root_as_id;
1323 
1324 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1325 		root_as_id = kvm_mmu_page_as_id(root);
1326 		if (root_as_id != slot->as_id)
1327 			continue;
1328 
1329 		zap_collapsible_spte_range(kvm, root, slot);
1330 	}
1331 }
1332 
1333 /*
1334  * Removes write access on the last level SPTE mapping this GFN and unsets the
1335  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1336  * Returns true if an SPTE was set and a TLB flush is needed.
1337  */
1338 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1339 			      gfn_t gfn)
1340 {
1341 	struct tdp_iter iter;
1342 	u64 new_spte;
1343 	bool spte_set = false;
1344 
1345 	rcu_read_lock();
1346 
1347 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1348 		if (!is_writable_pte(iter.old_spte))
1349 			break;
1350 
1351 		new_spte = iter.old_spte &
1352 			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1353 
1354 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1355 		spte_set = true;
1356 	}
1357 
1358 	rcu_read_unlock();
1359 
1360 	return spte_set;
1361 }
1362 
1363 /*
1364  * Removes write access on the last level SPTE mapping this GFN and unsets the
1365  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1366  * Returns true if an SPTE was set and a TLB flush is needed.
1367  */
1368 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1369 				   struct kvm_memory_slot *slot, gfn_t gfn)
1370 {
1371 	struct kvm_mmu_page *root;
1372 	int root_as_id;
1373 	bool spte_set = false;
1374 
1375 	lockdep_assert_held_write(&kvm->mmu_lock);
1376 	for_each_tdp_mmu_root(kvm, root) {
1377 		root_as_id = kvm_mmu_page_as_id(root);
1378 		if (root_as_id != slot->as_id)
1379 			continue;
1380 
1381 		spte_set |= write_protect_gfn(kvm, root, gfn);
1382 	}
1383 	return spte_set;
1384 }
1385 
1386 /*
1387  * Return the level of the lowest level SPTE added to sptes.
1388  * That SPTE may be non-present.
1389  */
1390 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1391 			 int *root_level)
1392 {
1393 	struct tdp_iter iter;
1394 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1395 	gfn_t gfn = addr >> PAGE_SHIFT;
1396 	int leaf = -1;
1397 
1398 	*root_level = vcpu->arch.mmu->shadow_root_level;
1399 
1400 	rcu_read_lock();
1401 
1402 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1403 		leaf = iter.level;
1404 		sptes[leaf] = iter.old_spte;
1405 	}
1406 
1407 	rcu_read_unlock();
1408 
1409 	return leaf;
1410 }
1411