xref: /openbmc/linux/arch/x86/kvm/mmu/tdp_mmu.c (revision d9f6e12f)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29 
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32 	if (!kvm->arch.tdp_mmu_enabled)
33 		return;
34 
35 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36 
37 	/*
38 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 	 * can run before the VM is torn down.
40 	 */
41 	rcu_barrier();
42 }
43 
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46 	if (kvm_mmu_put_root(kvm, root))
47 		kvm_tdp_mmu_free_root(kvm, root);
48 }
49 
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51 					   struct kvm_mmu_page *root)
52 {
53 	lockdep_assert_held_write(&kvm->mmu_lock);
54 
55 	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56 		return false;
57 
58 	kvm_mmu_get_root(kvm, root);
59 	return true;
60 
61 }
62 
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64 						     struct kvm_mmu_page *root)
65 {
66 	struct kvm_mmu_page *next_root;
67 
68 	next_root = list_next_entry(root, link);
69 	tdp_mmu_put_root(kvm, root);
70 	return next_root;
71 }
72 
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)				\
80 	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
81 				      typeof(*_root), link);		\
82 	     tdp_mmu_next_root_valid(_kvm, _root);			\
83 	     _root = tdp_mmu_next_root(_kvm, _root))
84 
85 #define for_each_tdp_mmu_root(_kvm, _root)				\
86 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
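
The reference discipline described in the comment above can be modelled outside the kernel. The sketch below is a minimal user-space analogue of the yield-safe walk: the loop condition takes a reference on the current element and the step drops the reference on the previous one, so an early break leaves the caller holding a reference that it must drop itself. All names here (struct node, node_get, node_put, for_each_node_ref_safe) are invented for illustration and are not kernel APIs.

#include <stdio.h>

struct node {
	struct node *next;
	int refcount;
	int id;
};

static void node_get(struct node *n) { n->refcount++; }

static void node_put(struct node *n)
{
	if (--n->refcount == 0)
		printf("node %d has no users left\n", n->id);
}

/* Advance to the next element, dropping the reference on the current one. */
static struct node *node_next_and_put(struct node *n)
{
	struct node *next = n->next;

	node_put(n);
	return next;
}

/*
 * Each step takes a reference on the current node and drops the reference
 * on the previous one, mirroring for_each_tdp_mmu_root_yield_safe() above.
 */
#define for_each_node_ref_safe(_n, _head)				\
	for (_n = (_head); (_n) && (node_get(_n), 1);			\
	     _n = node_next_and_put(_n))

int main(void)
{
	struct node c = { NULL, 1, 2 };
	struct node b = { &c,   1, 1 };
	struct node a = { &b,   1, 0 };
	struct node *n;

	for_each_node_ref_safe(n, &a) {
		printf("visiting node %d (refcount %d)\n", n->id, n->refcount);
		if (n->id == 1) {
			/* Early exit: the walker's reference must be dropped. */
			node_put(n);
			break;
		}
	}
	return 0;
}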
87 
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89 			  gfn_t start, gfn_t end, bool can_yield);
90 
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94 
95 	lockdep_assert_held_write(&kvm->mmu_lock);
96 
97 	WARN_ON(root->root_count);
98 	WARN_ON(!root->tdp_mmu_page);
99 
100 	list_del(&root->link);
101 
102 	zap_gfn_range(kvm, root, 0, max_gfn, false);
103 
104 	free_page((unsigned long)root->spt);
105 	kmem_cache_free(mmu_page_header_cache, root);
106 }
107 
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109 						   int level)
110 {
111 	union kvm_mmu_page_role role;
112 
113 	role = vcpu->arch.mmu->mmu_role.base;
114 	role.level = level;
115 	role.direct = true;
116 	role.gpte_is_8_bytes = true;
117 	role.access = ACC_ALL;
118 
119 	return role;
120 }
121 
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123 					       int level)
124 {
125 	struct kvm_mmu_page *sp;
126 
127 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130 
131 	sp->role.word = page_role_for_level(vcpu, level).word;
132 	sp->gfn = gfn;
133 	sp->tdp_mmu_page = true;
134 
135 	trace_kvm_mmu_get_page(sp, true);
136 
137 	return sp;
138 }
139 
140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
141 {
142 	union kvm_mmu_page_role role;
143 	struct kvm *kvm = vcpu->kvm;
144 	struct kvm_mmu_page *root;
145 
146 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
147 
148 	write_lock(&kvm->mmu_lock);
149 
150 	/* Check for an existing root before allocating a new one. */
151 	for_each_tdp_mmu_root(kvm, root) {
152 		if (root->role.word == role.word) {
153 			kvm_mmu_get_root(kvm, root);
154 			write_unlock(&kvm->mmu_lock);
155 			return root;
156 		}
157 	}
158 
159 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
160 	root->root_count = 1;
161 
162 	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
163 
164 	write_unlock(&kvm->mmu_lock);
165 
166 	return root;
167 }
168 
169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
170 {
171 	struct kvm_mmu_page *root;
172 
173 	root = get_tdp_mmu_vcpu_root(vcpu);
174 	if (!root)
175 		return INVALID_PAGE;
176 
177 	return __pa(root->spt);
178 }
179 
180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
181 {
182 	free_page((unsigned long)sp->spt);
183 	kmem_cache_free(mmu_page_header_cache, sp);
184 }
185 
186 /*
187  * This is called through call_rcu in order to free TDP page table memory
188  * safely with respect to other kernel threads that may be operating on
189  * the memory.
190  * Because TDP MMU page table memory is only accessed within an RCU read
191  * critical section, and is only freed after an RCU grace period, lockless
192  * accessors cannot use the memory after it has been freed.
193  */
194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
195 {
196 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
197 					       rcu_head);
198 
199 	tdp_mmu_free_sp(sp);
200 }
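
The callback above relies on container_of() to step from the embedded rcu_head back to the enclosing kvm_mmu_page. The stand-alone sketch below shows only that pointer arithmetic, with invented *_model types in place of the kernel structures; it does not model RCU grace periods themselves.

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel types; only the embedded-member layout matters. */
struct rcu_head_model {
	void *next;
	void (*func)(struct rcu_head_model *head);
};

struct page_model {
	unsigned long gfn;
	struct rcu_head_model rcu_head;
};

/* Same pointer arithmetic as the kernel's container_of(). */
#define container_of_model(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void free_cb(struct rcu_head_model *head)
{
	struct page_model *sp =
		container_of_model(head, struct page_model, rcu_head);

	printf("freeing page for gfn %lu\n", sp->gfn);
}

int main(void)
{
	struct page_model sp = { .gfn = 42 };

	/* call_rcu() would invoke this only after a grace period. */
	free_cb(&sp.rcu_head);
	return 0;
}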
201 
202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
203 				u64 old_spte, u64 new_spte, int level,
204 				bool shared);
205 
206 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
207 {
208 	return sp->role.smm ? 1 : 0;
209 }
210 
211 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
212 {
213 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
214 
215 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
216 		return;
217 
218 	if (is_accessed_spte(old_spte) &&
219 	    (!is_accessed_spte(new_spte) || pfn_changed))
220 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
221 }
222 
223 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
224 					  u64 old_spte, u64 new_spte, int level)
225 {
226 	bool pfn_changed;
227 	struct kvm_memory_slot *slot;
228 
229 	if (level > PG_LEVEL_4K)
230 		return;
231 
232 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
233 
234 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
235 	    is_writable_pte(new_spte)) {
236 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
237 		mark_page_dirty_in_slot(kvm, slot, gfn);
238 	}
239 }
240 
241 /**
242  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
243  *
244  * @kvm: kvm instance
245  * @sp: the new page
246  * @shared: This operation may not be running under the exclusive use of
247  *	    the MMU lock and the operation must synchronize with other
248  *	    threads that might be adding or removing pages.
249  * @account_nx: This page replaces a NX large page and should be marked for
250  *		eventual reclaim.
251  */
252 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
253 			      bool shared, bool account_nx)
254 {
255 	if (shared)
256 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
257 	else
258 		lockdep_assert_held_write(&kvm->mmu_lock);
259 
260 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
261 	if (account_nx)
262 		account_huge_nx_page(kvm, sp);
263 
264 	if (shared)
265 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
266 }
267 
268 /**
269  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
270  *
271  * @kvm: kvm instance
272  * @sp: the page to be removed
273  * @shared: This operation may not be running under the exclusive use of
274  *	    the MMU lock and the operation must synchronize with other
275  *	    threads that might be adding or removing pages.
276  */
277 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
278 				bool shared)
279 {
280 	if (shared)
281 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282 	else
283 		lockdep_assert_held_write(&kvm->mmu_lock);
284 
285 	list_del(&sp->link);
286 	if (sp->lpage_disallowed)
287 		unaccount_huge_nx_page(kvm, sp);
288 
289 	if (shared)
290 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
291 }
292 
293 /**
294  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
295  *
296  * @kvm: kvm instance
297  * @pt: the page removed from the paging structure
298  * @shared: This operation may not be running under the exclusive use
299  *	    of the MMU lock and the operation must synchronize with other
300  *	    threads that might be modifying SPTEs.
301  *
302  * Given a page table that has been removed from the TDP paging structure,
303  * iterates through the page table to clear SPTEs and free child page tables.
304  */
305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
306 					bool shared)
307 {
308 	struct kvm_mmu_page *sp = sptep_to_sp(pt);
309 	int level = sp->role.level;
310 	gfn_t base_gfn = sp->gfn;
311 	u64 old_child_spte;
312 	u64 *sptep;
313 	gfn_t gfn;
314 	int i;
315 
316 	trace_kvm_mmu_prepare_zap_page(sp);
317 
318 	tdp_mmu_unlink_page(kvm, sp, shared);
319 
320 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
321 		sptep = pt + i;
322 		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
323 
324 		if (shared) {
325 			/*
326 			 * Set the SPTE to a nonpresent value that other
327 			 * threads will not overwrite. If the SPTE was
328 			 * already marked as removed then another thread
329 			 * handling a page fault could overwrite it, so
330 			 * keep setting the SPTE until it transitions from some
331 			 * other value to the removed SPTE value.
332 			 */
333 			for (;;) {
334 				old_child_spte = xchg(sptep, REMOVED_SPTE);
335 				if (!is_removed_spte(old_child_spte))
336 					break;
337 				cpu_relax();
338 			}
339 		} else {
340 			/*
341 			 * If the SPTE is not MMU-present, there is no backing
342 			 * page associated with the SPTE and so no side effects
343 			 * that need to be recorded, and exclusive ownership of
344 			 * mmu_lock ensures the SPTE can't be made present.
345 			 * Note, zapping MMIO SPTEs is also unnecessary as they
346 			 * are guarded by the memslots generation, not by being
347 			 * unreachable.
348 			 */
349 			old_child_spte = READ_ONCE(*sptep);
350 			if (!is_shadow_present_pte(old_child_spte))
351 				continue;
352 
353 			/*
354 			 * Marking the SPTE as a removed SPTE is not
355 			 * strictly necessary here as the MMU lock will
356 			 * stop other threads from concurrently modifying
357 			 * this SPTE. Using the removed SPTE value keeps
358 			 * the two branches consistent and simplifies
359 			 * the function.
360 			 */
361 			WRITE_ONCE(*sptep, REMOVED_SPTE);
362 		}
363 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
364 				    old_child_spte, REMOVED_SPTE, level - 1,
365 				    shared);
366 	}
367 
368 	kvm_flush_remote_tlbs_with_address(kvm, gfn,
369 					   KVM_PAGES_PER_HPAGE(level));
370 
371 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
372 }
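
The xchg() retry loop in the shared case above can be modelled with C11 atomics. The sketch below is a user-space approximation only: REMOVED_SENTINEL is an arbitrary stand-in for REMOVED_SPTE, atomic_exchange() stands in for xchg(), and the busy loop for the cpu_relax() retry; none of these names are taken from the kernel sources.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define REMOVED_SENTINEL ((uint64_t)0xdeadULL)	/* arbitrary stand-in */

/*
 * Atomically take ownership of an entry: swap in the sentinel and, if the
 * old value was already the sentinel, another thread owns the entry, so
 * retry until a "real" old value is observed.
 */
static uint64_t freeze_entry(_Atomic uint64_t *entry)
{
	uint64_t old;

	for (;;) {
		old = atomic_exchange(entry, REMOVED_SENTINEL);
		if (old != REMOVED_SENTINEL)
			return old;
		/* Another thread holds the entry frozen; spin and retry. */
	}
}

int main(void)
{
	_Atomic uint64_t entry = 0x1234;

	printf("old value:  %#lx\n", (unsigned long)freeze_entry(&entry));
	printf("now frozen: %#lx\n", (unsigned long)atomic_load(&entry));
	return 0;
}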
373 
374 /**
375  * handle_changed_spte - handle bookkeeping associated with an SPTE change
376  * @kvm: kvm instance
377  * @as_id: the address space of the paging structure the SPTE was a part of
378  * @gfn: the base GFN that was mapped by the SPTE
379  * @old_spte: The value of the SPTE before the change
380  * @new_spte: The value of the SPTE after the change
381  * @level: the level of the PT the SPTE is part of in the paging structure
382  * @shared: This operation may not be running under the exclusive use of
383  *	    the MMU lock and the operation must synchronize with other
384  *	    threads that might be modifying SPTEs.
385  *
386  * Handle bookkeeping that might result from the modification of a SPTE.
387  * This function must be called for all TDP SPTE modifications.
388  */
389 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
390 				  u64 old_spte, u64 new_spte, int level,
391 				  bool shared)
392 {
393 	bool was_present = is_shadow_present_pte(old_spte);
394 	bool is_present = is_shadow_present_pte(new_spte);
395 	bool was_leaf = was_present && is_last_spte(old_spte, level);
396 	bool is_leaf = is_present && is_last_spte(new_spte, level);
397 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
398 
399 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
400 	WARN_ON(level < PG_LEVEL_4K);
401 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
402 
403 	/*
404 	 * If this warning were to trigger it would indicate that there was a
405 	 * missing MMU notifier or a race with some notifier handler.
406 	 * A present, leaf SPTE should never be directly replaced with another
407 	 * present leaf SPTE pointing to a different PFN. A notifier handler
408 	 * should be zapping the SPTE before the main MM's page table is
409 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
410 	 * thread before replacement.
411 	 */
412 	if (was_leaf && is_leaf && pfn_changed) {
413 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
414 		       "SPTE with another present leaf SPTE mapping a\n"
415 		       "different PFN!\n"
416 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
417 		       as_id, gfn, old_spte, new_spte, level);
418 
419 		/*
420 		 * Crash the host to prevent error propagation and guest data
421 		 * corruption.
422 		 */
423 		BUG();
424 	}
425 
426 	if (old_spte == new_spte)
427 		return;
428 
429 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
430 
431 	/*
432 	 * The only time a SPTE should be changed from a non-present to
433 	 * non-present state is when an MMIO entry is installed/modified/
434 	 * removed. In that case, there is nothing to do here.
435 	 */
436 	if (!was_present && !is_present) {
437 		/*
438 		 * If this change does not involve a MMIO SPTE or removed SPTE,
439 		 * it is unexpected. Log the change, though it should not
440 		 * impact the guest since both the former and current SPTEs
441 		 * are nonpresent.
442 		 */
443 		if (WARN_ON(!is_mmio_spte(old_spte) &&
444 			    !is_mmio_spte(new_spte) &&
445 			    !is_removed_spte(new_spte)))
446 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
447 			       "should not be replaced with another,\n"
448 			       "different nonpresent SPTE, unless one or both\n"
449 			       "are MMIO SPTEs, or the new SPTE is\n"
450 			       "a temporary removed SPTE.\n"
451 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
452 			       as_id, gfn, old_spte, new_spte, level);
453 		return;
454 	}
455 
456 
457 	if (was_leaf && is_dirty_spte(old_spte) &&
458 	    (!is_dirty_spte(new_spte) || pfn_changed))
459 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
460 
461 	/*
462 	 * Recursively handle child PTs if the change removed a subtree from
463 	 * the paging structure.
464 	 */
465 	if (was_present && !was_leaf && (pfn_changed || !is_present))
466 		handle_removed_tdp_mmu_page(kvm,
467 				spte_to_child_pt(old_spte, level), shared);
468 }
469 
470 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
471 				u64 old_spte, u64 new_spte, int level,
472 				bool shared)
473 {
474 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
475 			      shared);
476 	handle_changed_spte_acc_track(old_spte, new_spte, level);
477 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
478 				      new_spte, level);
479 }
480 
481 /*
482  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
483  * associated bookkeeping
484  *
485  * @kvm: kvm instance
486  * @iter: a tdp_iter instance currently on the SPTE that should be set
487  * @new_spte: The value the SPTE should be set to
488  * Returns: true if the SPTE was set, false if it was not. If false is returned,
489  *	    this function will have no side-effects.
490  */
491 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
492 					   struct tdp_iter *iter,
493 					   u64 new_spte)
494 {
495 	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
496 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
497 	int as_id = kvm_mmu_page_as_id(root);
498 
499 	lockdep_assert_held_read(&kvm->mmu_lock);
500 
501 	/*
502 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
503 	 * may modify it.
504 	 */
505 	if (iter->old_spte == REMOVED_SPTE)
506 		return false;
507 
508 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
509 		      new_spte) != iter->old_spte)
510 		return false;
511 
512 	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
513 			    iter->level, true);
514 
515 	return true;
516 }
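
The lost-race behaviour of tdp_mmu_set_spte_atomic() (succeed only if the SPTE still holds the value that was last read, otherwise report failure so the caller can re-read and retry) can be illustrated with a compare-and-swap in plain C11. This is a simplified user-space sketch, not the kernel code path; cmpxchg64() is modelled here with atomic_compare_exchange_strong().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Succeeds only if *entry still holds old_val, i.e. no other thread changed
 * the entry since it was last read; otherwise the caller must re-read and
 * retry.
 */
static bool set_entry_atomic(_Atomic uint64_t *entry, uint64_t old_val,
			     uint64_t new_val)
{
	return atomic_compare_exchange_strong(entry, &old_val, new_val);
}

int main(void)
{
	_Atomic uint64_t entry = 100;

	printf("first update: %d\n", set_entry_atomic(&entry, 100, 200)); /* 1 */
	printf("stale update: %d\n", set_entry_atomic(&entry, 100, 300)); /* 0 */
	printf("final value:  %lu\n", (unsigned long)atomic_load(&entry)); /* 200 */
	return 0;
}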
517 
518 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
519 					   struct tdp_iter *iter)
520 {
521 	/*
522 	 * Freeze the SPTE by setting it to a special,
523 	 * non-present value. This will stop other threads from
524 	 * immediately installing a present entry in its place
525 	 * before the TLBs are flushed.
526 	 */
527 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
528 		return false;
529 
530 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
531 					   KVM_PAGES_PER_HPAGE(iter->level));
532 
533 	/*
534 	 * No other thread can overwrite the removed SPTE as they
535 	 * must either wait on the MMU lock or use
536 	 * tdp_mmu_set_spte_atomic which will not overwrite the
537 	 * special removed SPTE value. No bookkeeping is needed
538 	 * here since the SPTE is going from non-present
539 	 * to non-present.
540 	 */
541 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
542 
543 	return true;
544 }
545 
546 
547 /*
548  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
549  * @kvm: kvm instance
550  * @iter: a tdp_iter instance currently on the SPTE that should be set
551  * @new_spte: The value the SPTE should be set to
552  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
553  *		      of the page. Should be set unless handling an MMU
554  *		      notifier for access tracking. Leaving record_acc_track
555  *		      unset in that case prevents page accesses from being
556  *		      double counted.
557  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
558  *		      appropriate for the change being made. Should be set
559  *		      unless performing certain dirty logging operations.
560  *		      Leaving record_dirty_log unset in that case prevents page
561  *		      writes from being double counted.
562  */
563 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
564 				      u64 new_spte, bool record_acc_track,
565 				      bool record_dirty_log)
566 {
567 	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
568 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
569 	int as_id = kvm_mmu_page_as_id(root);
570 
571 	lockdep_assert_held_write(&kvm->mmu_lock);
572 
573 	/*
574 	 * No thread should be using this function to set SPTEs to the
575 	 * temporary removed SPTE value.
576 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
577 	 * should be used. If operating under the MMU lock in write mode, the
578 	 * use of the removed SPTE should not be necessary.
579 	 */
580 	WARN_ON(iter->old_spte == REMOVED_SPTE);
581 
582 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
583 
584 	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
585 			      iter->level, false);
586 	if (record_acc_track)
587 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
588 					      iter->level);
589 	if (record_dirty_log)
590 		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
591 					      iter->old_spte, new_spte,
592 					      iter->level);
593 }
594 
595 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
596 				    u64 new_spte)
597 {
598 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
599 }
600 
601 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
602 						 struct tdp_iter *iter,
603 						 u64 new_spte)
604 {
605 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
606 }
607 
608 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
609 						 struct tdp_iter *iter,
610 						 u64 new_spte)
611 {
612 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
613 }
614 
615 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
616 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
617 
618 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
619 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
620 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
621 		    !is_last_spte(_iter.old_spte, _iter.level))		\
622 			continue;					\
623 		else
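
tdp_root_for_each_leaf_pte() works by appending an "if (...) continue; else" filter to the underlying for-based iterator, which lets the caller's loop body (or bare statement) still attach to the macro. A minimal stand-alone illustration of that idiom, with invented names, is below.

#include <stdio.h>

#define for_each_value(_v, _arr, _n)					\
	for (int __i = 0; __i < (_n) && ((_v) = (_arr)[__i], 1); __i++)

/*
 * Appending "if (...) continue; else" turns the iterator above into a
 * filtered iterator while still allowing the caller to supply the loop
 * body after the macro, as tdp_root_for_each_leaf_pte() does.
 */
#define for_each_even_value(_v, _arr, _n)				\
	for_each_value(_v, _arr, _n)					\
		if ((_v) & 1)						\
			continue;					\
		else

int main(void)
{
	int vals[] = { 1, 2, 3, 4, 5, 6 };
	int v;

	for_each_even_value(v, vals, 6)
		printf("%d\n", v);	/* prints 2, 4, 6 */
	return 0;
}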
624 
625 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
626 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
627 			 _mmu->shadow_root_level, _start, _end)
628 
629 /*
630  * Yield if the MMU lock is contended or this thread needs to return control
631  * to the scheduler.
632  *
633  * If this function should yield and flush is set, it will perform a remote
634  * TLB flush before yielding.
635  *
636  * If this function yields, it will also reset the tdp_iter's walk over the
637  * paging structure and the calling function should skip to the next
638  * iteration to allow the iterator to continue its traversal from the
639  * paging structure root.
640  *
641  * Return true if this function yielded and the iterator's traversal was reset.
642  * Return false if a yield was not needed.
643  */
644 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
645 					     struct tdp_iter *iter, bool flush)
646 {
647 	/* Ensure forward progress has been made before yielding. */
648 	if (iter->next_last_level_gfn == iter->yielded_gfn)
649 		return false;
650 
651 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
652 		rcu_read_unlock();
653 
654 		if (flush)
655 			kvm_flush_remote_tlbs(kvm);
656 
657 		cond_resched_rwlock_write(&kvm->mmu_lock);
658 		rcu_read_lock();
659 
660 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
661 
662 		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
663 			       iter->root_level, iter->min_level,
664 			       iter->next_last_level_gfn);
665 
666 		return true;
667 	}
668 
669 	return false;
670 }
671 
672 /*
673  * Tears down the mappings for the range of gfns, [start, end), and frees the
674  * non-root pages mapping GFNs strictly within that range. Returns true if
675  * SPTEs have been cleared and a TLB flush is needed before releasing the
676  * MMU lock.
677  * If can_yield is true, will release the MMU lock and reschedule if the
678  * scheduler needs the CPU or there is contention on the MMU lock. If this
679  * function cannot yield, it will not release the MMU lock or reschedule and
680  * the caller must ensure it does not supply too large a GFN range, or the
681  * operation can cause a soft lockup.
682  */
683 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
684 			  gfn_t start, gfn_t end, bool can_yield)
685 {
686 	struct tdp_iter iter;
687 	bool flush_needed = false;
688 
689 	rcu_read_lock();
690 
691 	tdp_root_for_each_pte(iter, root, start, end) {
692 		if (can_yield &&
693 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
694 			flush_needed = false;
695 			continue;
696 		}
697 
698 		if (!is_shadow_present_pte(iter.old_spte))
699 			continue;
700 
701 		/*
702 		 * If this is a non-last-level SPTE that covers a larger range
703 		 * than should be zapped, continue, and zap the mappings at a
704 		 * lower level.
705 		 */
706 		if ((iter.gfn < start ||
707 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
708 		    !is_last_spte(iter.old_spte, iter.level))
709 			continue;
710 
711 		tdp_mmu_set_spte(kvm, &iter, 0);
712 		flush_needed = true;
713 	}
714 
715 	rcu_read_unlock();
716 	return flush_needed;
717 }
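
The range check above skips a non-leaf SPTE when the region it covers extends past [start, end), so that zapping recurses to a lower level instead. A small stand-alone example of the containment arithmetic follows; it assumes x86's 512-entry page-table levels (so KVM_PAGES_PER_HPAGE(level) corresponds to 512^(level - 1) base pages) and uses invented helper names.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 4K base pages; each level up covers 512x more GFNs (x86 long mode). */
static uint64_t pages_per_level(int level)
{
	return 1ULL << ((level - 1) * 9);
}

/* True if an entry at @level starting at @gfn lies entirely in [start, end). */
static bool entry_within_range(uint64_t gfn, int level, uint64_t start,
			       uint64_t end)
{
	return gfn >= start && gfn + pages_per_level(level) <= end;
}

int main(void)
{
	/* A 2M mapping (level 2 = 512 pages) starting at GFN 0x400. */
	printf("%d\n", entry_within_range(0x400, 2, 0x400, 0x600)); /* 1 */
	printf("%d\n", entry_within_range(0x400, 2, 0x400, 0x500)); /* 0: spills past end */
	return 0;
}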
718 
719 /*
720  * Tears down the mappings for the range of gfns, [start, end), and frees the
721  * non-root pages mapping GFNs strictly within that range. Returns true if
722  * SPTEs have been cleared and a TLB flush is needed before releasing the
723  * MMU lock.
724  */
725 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
726 {
727 	struct kvm_mmu_page *root;
728 	bool flush = false;
729 
730 	for_each_tdp_mmu_root_yield_safe(kvm, root)
731 		flush |= zap_gfn_range(kvm, root, start, end, true);
732 
733 	return flush;
734 }
735 
736 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
737 {
738 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
739 	bool flush;
740 
741 	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
742 	if (flush)
743 		kvm_flush_remote_tlbs(kvm);
744 }
745 
746 /*
747  * Installs a last-level SPTE to handle a TDP page fault.
748  * (NPT/EPT violation/misconfiguration)
749  */
750 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
751 					  int map_writable,
752 					  struct tdp_iter *iter,
753 					  kvm_pfn_t pfn, bool prefault)
754 {
755 	u64 new_spte;
756 	int ret = 0;
757 	int make_spte_ret = 0;
758 
759 	if (unlikely(is_noslot_pfn(pfn)))
760 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
761 	else
762 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
763 					 pfn, iter->old_spte, prefault, true,
764 					 map_writable, !shadow_accessed_mask,
765 					 &new_spte);
766 
767 	if (new_spte == iter->old_spte)
768 		ret = RET_PF_SPURIOUS;
769 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
770 		return RET_PF_RETRY;
771 
772 	/*
773 	 * If the page fault was caused by a write but the page is write
774 	 * protected, emulation is needed. If the emulation was skipped,
775 	 * the vCPU would have the same fault again.
776 	 */
777 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
778 		if (write)
779 			ret = RET_PF_EMULATE;
780 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
781 	}
782 
783 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
784 	if (unlikely(is_mmio_spte(new_spte))) {
785 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
786 				     new_spte);
787 		ret = RET_PF_EMULATE;
788 	} else
789 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
790 				       rcu_dereference(iter->sptep));
791 
792 	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
793 			       rcu_dereference(iter->sptep));
794 	if (!prefault)
795 		vcpu->stat.pf_fixed++;
796 
797 	return ret;
798 }
799 
800 /*
801  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
802  * page tables and SPTEs to translate the faulting guest physical address.
803  */
804 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
805 		    int map_writable, int max_level, kvm_pfn_t pfn,
806 		    bool prefault)
807 {
808 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
809 	bool write = error_code & PFERR_WRITE_MASK;
810 	bool exec = error_code & PFERR_FETCH_MASK;
811 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
812 	struct kvm_mmu *mmu = vcpu->arch.mmu;
813 	struct tdp_iter iter;
814 	struct kvm_mmu_page *sp;
815 	u64 *child_pt;
816 	u64 new_spte;
817 	int ret;
818 	gfn_t gfn = gpa >> PAGE_SHIFT;
819 	int level;
820 	int req_level;
821 
822 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
823 		return RET_PF_RETRY;
824 	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
825 		return RET_PF_RETRY;
826 
827 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
828 					huge_page_disallowed, &req_level);
829 
830 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
831 
832 	rcu_read_lock();
833 
834 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
835 		if (nx_huge_page_workaround_enabled)
836 			disallowed_hugepage_adjust(iter.old_spte, gfn,
837 						   iter.level, &pfn, &level);
838 
839 		if (iter.level == level)
840 			break;
841 
842 		/*
843 		 * If there is an SPTE mapping a large page at a higher level
844 		 * than the target, that SPTE must be cleared and replaced
845 		 * with a non-leaf SPTE.
846 		 */
847 		if (is_shadow_present_pte(iter.old_spte) &&
848 		    is_large_pte(iter.old_spte)) {
849 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
850 				break;
851 
852 			/*
853 			 * The iter must explicitly re-read the spte here
854 			 * because the new value informs the !present
855 			 * path below.
856 			 */
857 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
858 		}
859 
860 		if (!is_shadow_present_pte(iter.old_spte)) {
861 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
862 			child_pt = sp->spt;
863 
864 			new_spte = make_nonleaf_spte(child_pt,
865 						     !shadow_accessed_mask);
866 
867 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
868 						    new_spte)) {
869 				tdp_mmu_link_page(vcpu->kvm, sp, true,
870 						  huge_page_disallowed &&
871 						  req_level >= iter.level);
872 
873 				trace_kvm_mmu_get_page(sp, true);
874 			} else {
875 				tdp_mmu_free_sp(sp);
876 				break;
877 			}
878 		}
879 	}
880 
881 	if (iter.level != level) {
882 		rcu_read_unlock();
883 		return RET_PF_RETRY;
884 	}
885 
886 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
887 					      pfn, prefault);
888 	rcu_read_unlock();
889 
890 	return ret;
891 }
892 
893 static __always_inline int
894 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
895 			     unsigned long start,
896 			     unsigned long end,
897 			     unsigned long data,
898 			     int (*handler)(struct kvm *kvm,
899 					    struct kvm_memory_slot *slot,
900 					    struct kvm_mmu_page *root,
901 					    gfn_t start,
902 					    gfn_t end,
903 					    unsigned long data))
904 {
905 	struct kvm_memslots *slots;
906 	struct kvm_memory_slot *memslot;
907 	struct kvm_mmu_page *root;
908 	int ret = 0;
909 	int as_id;
910 
911 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
912 		as_id = kvm_mmu_page_as_id(root);
913 		slots = __kvm_memslots(kvm, as_id);
914 		kvm_for_each_memslot(memslot, slots) {
915 			unsigned long hva_start, hva_end;
916 			gfn_t gfn_start, gfn_end;
917 
918 			hva_start = max(start, memslot->userspace_addr);
919 			hva_end = min(end, memslot->userspace_addr +
920 				      (memslot->npages << PAGE_SHIFT));
921 			if (hva_start >= hva_end)
922 				continue;
923 			/*
924 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
925 			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
926 			 */
927 			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
928 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
929 
930 			ret |= handler(kvm, memslot, root, gfn_start,
931 				       gfn_end, data);
932 		}
933 	}
934 
935 	return ret;
936 }
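
The clamping and rounding above turn an HVA range into the GFN range that intersects a memslot, per the set-equality comment in the loop. The sketch below reproduces that arithmetic in user space with a simplified memslot structure; hva_to_gfn() mirrors what hva_to_gfn_memslot() is assumed to compute (slot-relative page index plus base_gfn), and all values are made up.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct memslot_model {
	uint64_t userspace_addr;	/* HVA where the slot is mapped */
	uint64_t base_gfn;		/* first GFN backed by the slot */
	uint64_t npages;
};

/* Slot-relative page index plus the slot's base GFN. */
static uint64_t hva_to_gfn(uint64_t hva, const struct memslot_model *slot)
{
	return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct memslot_model slot = {
		.userspace_addr	= 0x7f0000000000UL,
		.base_gfn	= 0x100000,
		.npages		= 0x200,
	};
	/* An unaligned HVA range that partially overlaps the slot. */
	uint64_t start = 0x7f0000001800UL, end = 0x7f0000042000UL;
	uint64_t hva_start, hva_end, gfn_start, gfn_end;

	hva_start = start > slot.userspace_addr ? start : slot.userspace_addr;
	hva_end = end < slot.userspace_addr + (slot.npages << PAGE_SHIFT) ?
		  end : slot.userspace_addr + (slot.npages << PAGE_SHIFT);

	/* Round the end up so a partially covered page is still included. */
	gfn_start = hva_to_gfn(hva_start, &slot);
	gfn_end = hva_to_gfn(hva_end + PAGE_SIZE - 1, &slot);

	printf("gfn range: [%#lx, %#lx)\n", (unsigned long)gfn_start,
	       (unsigned long)gfn_end);
	return 0;
}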
937 
938 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
939 				     struct kvm_memory_slot *slot,
940 				     struct kvm_mmu_page *root, gfn_t start,
941 				     gfn_t end, unsigned long unused)
942 {
943 	return zap_gfn_range(kvm, root, start, end, false);
944 }
945 
946 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
947 			      unsigned long end)
948 {
949 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
950 					    zap_gfn_range_hva_wrapper);
951 }
952 
953 /*
954  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
955  * return non-zero if any of the GFNs in the range have been accessed.
956  */
957 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
958 			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
959 			 unsigned long unused)
960 {
961 	struct tdp_iter iter;
962 	int young = 0;
963 	u64 new_spte = 0;
964 
965 	rcu_read_lock();
966 
967 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
968 		/*
969 		 * If we have a non-accessed entry we don't need to change the
970 		 * pte.
971 		 */
972 		if (!is_accessed_spte(iter.old_spte))
973 			continue;
974 
975 		new_spte = iter.old_spte;
976 
977 		if (spte_ad_enabled(new_spte)) {
978 			clear_bit((ffs(shadow_accessed_mask) - 1),
979 				  (unsigned long *)&new_spte);
980 		} else {
981 			/*
982 			 * Capture the dirty status of the page, so that it doesn't get
983 			 * lost when the SPTE is marked for access tracking.
984 			 */
985 			if (is_writable_pte(new_spte))
986 				kvm_set_pfn_dirty(spte_to_pfn(new_spte));
987 
988 			new_spte = mark_spte_for_access_track(new_spte);
989 		}
990 		new_spte &= ~shadow_dirty_mask;
991 
992 		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
993 		young = 1;
994 
995 		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
996 	}
997 
998 	rcu_read_unlock();
999 
1000 	return young;
1001 }
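
When A/D bits are in use, the loop above clears just the accessed bit, locating it with ffs(shadow_accessed_mask) - 1. The fragment below shows only that index arithmetic with an invented mask value; the kernel's clear_bit() additionally operates atomically on a bitmap, which this sketch does not attempt to model.

#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	unsigned int accessed_mask = 1u << 5;	/* invented stand-in mask */
	uint64_t spte = 0xf2f;			/* bit 5 currently set */

	/* ffs() is 1-based, so subtract 1 to get the bit number to clear. */
	spte &= ~(1ULL << (ffs(accessed_mask) - 1));

	printf("spte after clearing accessed bit: %#lx\n", (unsigned long)spte);
	return 0;
}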
1002 
1003 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
1004 			      unsigned long end)
1005 {
1006 	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
1007 					    age_gfn_range);
1008 }
1009 
1010 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1011 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1012 			unsigned long unused2)
1013 {
1014 	struct tdp_iter iter;
1015 
1016 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1017 		if (is_accessed_spte(iter.old_spte))
1018 			return 1;
1019 
1020 	return 0;
1021 }
1022 
1023 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1024 {
1025 	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1026 					    test_age_gfn);
1027 }
1028 
1029 /*
1030  * Handle the changed_pte MMU notifier for the TDP MMU.
1031  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1032  * notifier.
1033  * Returns non-zero if a flush is needed before releasing the MMU lock.
1034  */
1035 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1036 			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1037 			unsigned long data)
1038 {
1039 	struct tdp_iter iter;
1040 	pte_t *ptep = (pte_t *)data;
1041 	kvm_pfn_t new_pfn;
1042 	u64 new_spte;
1043 	int need_flush = 0;
1044 
1045 	rcu_read_lock();
1046 
1047 	WARN_ON(pte_huge(*ptep));
1048 
1049 	new_pfn = pte_pfn(*ptep);
1050 
1051 	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1052 		if (iter.level != PG_LEVEL_4K)
1053 			continue;
1054 
1055 		if (!is_shadow_present_pte(iter.old_spte))
1056 			break;
1057 
1058 		tdp_mmu_set_spte(kvm, &iter, 0);
1059 
1060 		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1061 
1062 		if (!pte_write(*ptep)) {
1063 			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1064 					iter.old_spte, new_pfn);
1065 
1066 			tdp_mmu_set_spte(kvm, &iter, new_spte);
1067 		}
1068 
1069 		need_flush = 1;
1070 	}
1071 
1072 	if (need_flush)
1073 		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1074 
1075 	rcu_read_unlock();
1076 
1077 	return 0;
1078 }
1079 
1080 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1081 			     pte_t *host_ptep)
1082 {
1083 	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1084 					    (unsigned long)host_ptep,
1085 					    set_tdp_spte);
1086 }
1087 
1088 /*
1089  * Remove write access from all SPTEs at or above min_level that map GFNs
1090  * [start, end). Returns true if an SPTE has been changed and the TLBs need
1091  * to be flushed.
1092  */
1093 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1094 			     gfn_t start, gfn_t end, int min_level)
1095 {
1096 	struct tdp_iter iter;
1097 	u64 new_spte;
1098 	bool spte_set = false;
1099 
1100 	rcu_read_lock();
1101 
1102 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1103 
1104 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1105 				   min_level, start, end) {
1106 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1107 			continue;
1108 
1109 		if (!is_shadow_present_pte(iter.old_spte) ||
1110 		    !is_last_spte(iter.old_spte, iter.level) ||
1111 		    !(iter.old_spte & PT_WRITABLE_MASK))
1112 			continue;
1113 
1114 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1115 
1116 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1117 		spte_set = true;
1118 	}
1119 
1120 	rcu_read_unlock();
1121 	return spte_set;
1122 }
1123 
1124 /*
1125  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1126  * only affect leaf SPTEs down to min_level.
1127  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1128  */
1129 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1130 			     int min_level)
1131 {
1132 	struct kvm_mmu_page *root;
1133 	int root_as_id;
1134 	bool spte_set = false;
1135 
1136 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1137 		root_as_id = kvm_mmu_page_as_id(root);
1138 		if (root_as_id != slot->as_id)
1139 			continue;
1140 
1141 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1142 			     slot->base_gfn + slot->npages, min_level);
1143 	}
1144 
1145 	return spte_set;
1146 }
1147 
1148 /*
1149  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1150  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1151  * If AD bits are not enabled, this will require clearing the writable bit on
1152  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1153  * be flushed.
1154  */
1155 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1156 			   gfn_t start, gfn_t end)
1157 {
1158 	struct tdp_iter iter;
1159 	u64 new_spte;
1160 	bool spte_set = false;
1161 
1162 	rcu_read_lock();
1163 
1164 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1165 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1166 			continue;
1167 
1168 		if (spte_ad_need_write_protect(iter.old_spte)) {
1169 			if (is_writable_pte(iter.old_spte))
1170 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1171 			else
1172 				continue;
1173 		} else {
1174 			if (iter.old_spte & shadow_dirty_mask)
1175 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1176 			else
1177 				continue;
1178 		}
1179 
1180 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1181 		spte_set = true;
1182 	}
1183 
1184 	rcu_read_unlock();
1185 	return spte_set;
1186 }
1187 
1188 /*
1189  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1190  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1191  * If AD bits are not enabled, this will require clearing the writable bit on
1192  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1193  * be flushed.
1194  */
1195 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1196 {
1197 	struct kvm_mmu_page *root;
1198 	int root_as_id;
1199 	bool spte_set = false;
1200 
1201 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1202 		root_as_id = kvm_mmu_page_as_id(root);
1203 		if (root_as_id != slot->as_id)
1204 			continue;
1205 
1206 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1207 				slot->base_gfn + slot->npages);
1208 	}
1209 
1210 	return spte_set;
1211 }
1212 
1213 /*
1214  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1215  * set in mask, starting at gfn. The given memslot is expected to contain all
1216  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1217  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1218  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1219  */
1220 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1221 				  gfn_t gfn, unsigned long mask, bool wrprot)
1222 {
1223 	struct tdp_iter iter;
1224 	u64 new_spte;
1225 
1226 	rcu_read_lock();
1227 
1228 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1229 				    gfn + BITS_PER_LONG) {
1230 		if (!mask)
1231 			break;
1232 
1233 		if (iter.level > PG_LEVEL_4K ||
1234 		    !(mask & (1UL << (iter.gfn - gfn))))
1235 			continue;
1236 
1237 		mask &= ~(1UL << (iter.gfn - gfn));
1238 
1239 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1240 			if (is_writable_pte(iter.old_spte))
1241 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1242 			else
1243 				continue;
1244 		} else {
1245 			if (iter.old_spte & shadow_dirty_mask)
1246 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1247 			else
1248 				continue;
1249 		}
1250 
1251 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1252 	}
1253 
1254 	rcu_read_unlock();
1255 }
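
The mask handling above maps bit (gfn - base_gfn) of the caller's mask onto each 4k GFN, strips each bit as it is consumed, and stops as soon as the mask is empty. A stand-alone walk of the same shape is sketched below; __builtin_ctzl() stands in for the kernel's __ffs() and the values are invented.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t base_gfn = 0x1000;
	unsigned long mask = 0x15;	/* bits 0, 2, 4: dirty GFNs to handle */
	unsigned long gfn;

	/*
	 * Walk GFNs starting at base_gfn + first set bit; bit (gfn - base_gfn)
	 * says whether this GFN needs its dirty state cleared.  Strip each bit
	 * once handled and stop as soon as the mask is empty, as
	 * clear_dirty_pt_masked() does.
	 */
	for (gfn = base_gfn + __builtin_ctzl(mask); mask; gfn++) {
		if (!(mask & (1UL << (gfn - base_gfn))))
			continue;

		mask &= ~(1UL << (gfn - base_gfn));
		printf("clear dirty state for gfn %#lx\n", gfn);
	}
	return 0;
}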
1256 
1257 /*
1258  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1259  * set in mask, starting at gfn. The given memslot is expected to contain all
1260  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1261  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1262  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1263  */
1264 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1265 				       struct kvm_memory_slot *slot,
1266 				       gfn_t gfn, unsigned long mask,
1267 				       bool wrprot)
1268 {
1269 	struct kvm_mmu_page *root;
1270 	int root_as_id;
1271 
1272 	lockdep_assert_held_write(&kvm->mmu_lock);
1273 	for_each_tdp_mmu_root(kvm, root) {
1274 		root_as_id = kvm_mmu_page_as_id(root);
1275 		if (root_as_id != slot->as_id)
1276 			continue;
1277 
1278 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1279 	}
1280 }
1281 
1282 /*
1283  * Clear leaf entries which could be replaced by large mappings, for
1284  * GFNs within the slot.
1285  */
1286 static void zap_collapsible_spte_range(struct kvm *kvm,
1287 				       struct kvm_mmu_page *root,
1288 				       struct kvm_memory_slot *slot)
1289 {
1290 	gfn_t start = slot->base_gfn;
1291 	gfn_t end = start + slot->npages;
1292 	struct tdp_iter iter;
1293 	kvm_pfn_t pfn;
1294 	bool spte_set = false;
1295 
1296 	rcu_read_lock();
1297 
1298 	tdp_root_for_each_pte(iter, root, start, end) {
1299 		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1300 			spte_set = false;
1301 			continue;
1302 		}
1303 
1304 		if (!is_shadow_present_pte(iter.old_spte) ||
1305 		    !is_last_spte(iter.old_spte, iter.level))
1306 			continue;
1307 
1308 		pfn = spte_to_pfn(iter.old_spte);
1309 		if (kvm_is_reserved_pfn(pfn) ||
1310 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1311 							    pfn, PG_LEVEL_NUM))
1312 			continue;
1313 
1314 		tdp_mmu_set_spte(kvm, &iter, 0);
1315 
1316 		spte_set = true;
1317 	}
1318 
1319 	rcu_read_unlock();
1320 	if (spte_set)
1321 		kvm_flush_remote_tlbs(kvm);
1322 }
1323 
1324 /*
1325  * Clear leaf entries which could be replaced by large mappings, for GFNs
1326  * within the slot.
1327  */
1328 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1329 				       struct kvm_memory_slot *slot)
1330 {
1331 	struct kvm_mmu_page *root;
1332 	int root_as_id;
1333 
1334 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1335 		root_as_id = kvm_mmu_page_as_id(root);
1336 		if (root_as_id != slot->as_id)
1337 			continue;
1338 
1339 		zap_collapsible_spte_range(kvm, root, slot);
1340 	}
1341 }
1342 
1343 /*
1344  * Removes write access on the last level SPTE mapping this GFN and unsets the
1345  * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
1346  * Returns true if an SPTE was set and a TLB flush is needed.
1347  */
1348 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1349 			      gfn_t gfn)
1350 {
1351 	struct tdp_iter iter;
1352 	u64 new_spte;
1353 	bool spte_set = false;
1354 
1355 	rcu_read_lock();
1356 
1357 	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1358 		if (!is_writable_pte(iter.old_spte))
1359 			break;
1360 
1361 		new_spte = iter.old_spte &
1362 			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1363 
1364 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1365 		spte_set = true;
1366 	}
1367 
1368 	rcu_read_unlock();
1369 
1370 	return spte_set;
1371 }
1372 
1373 /*
1374  * Removes write access on the last level SPTE mapping this GFN and unsets the
1375  * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
1376  * Returns true if an SPTE was set and a TLB flush is needed.
1377  */
1378 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1379 				   struct kvm_memory_slot *slot, gfn_t gfn)
1380 {
1381 	struct kvm_mmu_page *root;
1382 	int root_as_id;
1383 	bool spte_set = false;
1384 
1385 	lockdep_assert_held_write(&kvm->mmu_lock);
1386 	for_each_tdp_mmu_root(kvm, root) {
1387 		root_as_id = kvm_mmu_page_as_id(root);
1388 		if (root_as_id != slot->as_id)
1389 			continue;
1390 
1391 		spte_set |= write_protect_gfn(kvm, root, gfn);
1392 	}
1393 	return spte_set;
1394 }
1395 
1396 /*
1397  * Return the level of the lowest level SPTE added to sptes.
1398  * That SPTE may be non-present.
1399  */
1400 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1401 			 int *root_level)
1402 {
1403 	struct tdp_iter iter;
1404 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1405 	gfn_t gfn = addr >> PAGE_SHIFT;
1406 	int leaf = -1;
1407 
1408 	*root_level = vcpu->arch.mmu->shadow_root_level;
1409 
1410 	rcu_read_lock();
1411 
1412 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1413 		leaf = iter.level;
1414 		sptes[leaf] = iter.old_spte;
1415 	}
1416 
1417 	rcu_read_unlock();
1418 
1419 	return leaf;
1420 }
1421