// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm asynchronous fault support
 *
 * Copyright 2010 Red Hat, Inc.
 *
 * Author:
 *      Gleb Natapov <gleb@redhat.com>
 */

#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>

#include "async_pf.h"
#include <trace/events/kvm.h>

static struct kmem_cache *async_pf_cache;

int kvm_async_pf_init(void)
{
        async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);

        if (!async_pf_cache)
                return -ENOMEM;

        return 0;
}

void kvm_async_pf_deinit(void)
{
        kmem_cache_destroy(async_pf_cache);
        async_pf_cache = NULL;
}

void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
{
        INIT_LIST_HEAD(&vcpu->async_pf.done);
        INIT_LIST_HEAD(&vcpu->async_pf.queue);
        spin_lock_init(&vcpu->async_pf.lock);
}

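/*
 * Worker callback for an async page fault: fault the page in on behalf of the
 * guest via get_user_pages_remote(), then move the item to the vCPU's done
 * list and wake the vCPU so the completion can be delivered.
 */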
static void async_pf_execute(struct work_struct *work)
{
        struct kvm_async_pf *apf =
                container_of(work, struct kvm_async_pf, work);
        struct mm_struct *mm = apf->mm;
        struct kvm_vcpu *vcpu = apf->vcpu;
        unsigned long addr = apf->addr;
        gpa_t cr2_or_gpa = apf->cr2_or_gpa;
        int locked = 1;
        bool first;

        might_sleep();

        /*
         * This work runs asynchronously to the task which owns
         * mm and might be done in another context, so we must
         * access it remotely.
         */
        mmap_read_lock(mm);
        get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
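        /*
         * get_user_pages_remote() may drop mmap_lock internally and clear
         * 'locked'; only unlock here if the lock is still held.
         */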
        if (locked)
                mmap_read_unlock(mm);

        if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
                kvm_arch_async_page_present(vcpu, apf);

        spin_lock(&vcpu->async_pf.lock);
        first = list_empty(&vcpu->async_pf.done);
        list_add_tail(&apf->link, &vcpu->async_pf.done);
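        /*
         * Clearing apf->vcpu signals kvm_clear_async_pf_completion_queue()
         * that this item has already been moved to the done list.
         */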
        apf->vcpu = NULL;
        spin_unlock(&vcpu->async_pf.lock);

        if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
                kvm_arch_async_page_present_queued(vcpu);

        /*
         * apf may be freed by kvm_check_async_pf_completion() after
         * this point
         */

        trace_kvm_async_pf_completed(addr, cr2_or_gpa);

        __kvm_vcpu_wake_up(vcpu);

        mmput(mm);
}

static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
{
        /*
         * The async #PF is "done", but KVM must wait for the work item itself,
         * i.e. async_pf_execute(), to run to completion.  If KVM is a module,
         * KVM must ensure *no* code owned by KVM (the module) can be run
         * after the last call to module_put().  Note, flushing the work item
         * is always required when the item is taken off the completion queue.
         * E.g. even if the vCPU handles the item in the "normal" path, the VM
         * could be terminated before async_pf_execute() completes.
         *
         * Wake-all events skip the queue and go straight to the done list,
         * i.e. they don't need to be flushed (but sanity check that the work
         * wasn't queued).
         */
        if (work->wakeup_all)
                WARN_ON_ONCE(work->work.func);
        else
                flush_work(&work->work);
        kmem_cache_free(async_pf_cache, work);
}

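/*
 * Cancel all outstanding async page faults for a vCPU (e.g. on vCPU free or
 * reset): in-flight work items are cancelled or flushed, and any completed
 * items still sitting on the done list are freed.
 */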
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
        spin_lock(&vcpu->async_pf.lock);

        /* cancel outstanding work queue item */
        while (!list_empty(&vcpu->async_pf.queue)) {
                struct kvm_async_pf *work =
                        list_first_entry(&vcpu->async_pf.queue,
                                         typeof(*work), queue);
                list_del(&work->queue);

                /*
                 * We know it's present in vcpu->async_pf.done, do
                 * nothing here.
                 */
                if (!work->vcpu)
                        continue;

                spin_unlock(&vcpu->async_pf.lock);
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
                flush_work(&work->work);
#else
                if (cancel_work_sync(&work->work)) {
                        mmput(work->mm);
                        kmem_cache_free(async_pf_cache, work);
                }
#endif
                spin_lock(&vcpu->async_pf.lock);
        }

        while (!list_empty(&vcpu->async_pf.done)) {
                struct kvm_async_pf *work =
                        list_first_entry(&vcpu->async_pf.done,
                                         typeof(*work), link);
                list_del(&work->link);

                spin_unlock(&vcpu->async_pf.lock);
                kvm_flush_and_free_async_pf_work(work);
                spin_lock(&vcpu->async_pf.lock);
        }
        spin_unlock(&vcpu->async_pf.lock);

        vcpu->async_pf.queued = 0;
}

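/*
 * Called from the vCPU run path: dequeue completed async page faults, let the
 * arch code fix up the guest mapping (kvm_arch_async_page_ready()) and, if
 * needed, inject a "page ready" notification into the guest.
 */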
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
{
        struct kvm_async_pf *work;

        while (!list_empty_careful(&vcpu->async_pf.done) &&
               kvm_arch_can_dequeue_async_page_present(vcpu)) {
                spin_lock(&vcpu->async_pf.lock);
                work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
                                        link);
                list_del(&work->link);
                spin_unlock(&vcpu->async_pf.lock);

                kvm_arch_async_page_ready(vcpu, work);
                if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
                        kvm_arch_async_page_present(vcpu, work);

                list_del(&work->queue);
                vcpu->async_pf.queued--;
                kvm_flush_and_free_async_pf_work(work);
        }
}

/*
 * Try to schedule a job to handle a page fault asynchronously. Returns 'true'
 * on success, 'false' on failure (the page fault has to be handled
 * synchronously).
 */
bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                        unsigned long hva, struct kvm_arch_async_pf *arch)
{
        struct kvm_async_pf *work;

        if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
                return false;

        /* Arch specific code should not do async PF in this case */
        if (unlikely(kvm_is_error_hva(hva)))
                return false;

        /*
         * Do a nowait allocation: if we were going to sleep anyway, we may
         * as well sleep faulting in the page synchronously.
         */
        work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
        if (!work)
                return false;

        work->wakeup_all = false;
        work->vcpu = vcpu;
        work->cr2_or_gpa = cr2_or_gpa;
        work->addr = hva;
        work->arch = *arch;
        work->mm = current->mm;
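        /* Pin the mm; async_pf_execute() drops this reference via mmput(). */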
        mmget(work->mm);

        INIT_WORK(&work->work, async_pf_execute);

        list_add_tail(&work->queue, &vcpu->async_pf.queue);
        vcpu->async_pf.queued++;
        work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work);

        schedule_work(&work->work);

        return true;
}

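/*
 * Queue a special "wakeup all" completion item whose presentation tells the
 * guest to wake every task waiting on an async page fault. Does nothing if
 * completions are already pending on the done list.
 */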
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
{
        struct kvm_async_pf *work;
        bool first;

        if (!list_empty_careful(&vcpu->async_pf.done))
                return 0;

        work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
        if (!work)
                return -ENOMEM;

        work->wakeup_all = true;
        INIT_LIST_HEAD(&work->queue); /* for list_del to work */

        spin_lock(&vcpu->async_pf.lock);
        first = list_empty(&vcpu->async_pf.done);
        list_add_tail(&work->link, &vcpu->async_pf.done);
        spin_unlock(&vcpu->async_pf.lock);

        if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
                kvm_arch_async_page_present_queued(vcpu);

        vcpu->async_pf.queued++;
        return 0;
}