xen.c: 1cfc9c4b9d4606a1e90e7dbc50058b9f0c1d43a6 -> 14243b387137a4afbe1df5d9dc15182d6657bb79
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * KVM Xen emulation
 */

#include "x86.h"
#include "xen.h"
#include "hyperv.h"

#include <linux/kvm_host.h>
#include <linux/sched/stat.h>

#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/event_channel.h>

#include "trace.h"

DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);

static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
        struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;

--- 163 unchanged lines hidden ---

        if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
                                          &state_entry_time, offset,
                                          sizeof(state_entry_time)))
                return;
}

int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
        unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
        bool atomic = in_atomic() || !task_is_running(current);
        int err;
        u8 rc = 0;

        /*
         * If the global upcall vector (HVMIRQ_callback_vector) is set and
         * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
         */
        struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
        struct kvm_memslots *slots = kvm_memslots(v->kvm);
        bool ghc_valid = slots->generation == ghc->generation &&
                !kvm_is_error_hva(ghc->hva) && ghc->memslot;

        unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);

        /* No need for compat handling here */
        BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
                     offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
        BUILD_BUG_ON(sizeof(rc) !=
                     sizeof_field(struct vcpu_info, evtchn_upcall_pending));
        BUILD_BUG_ON(sizeof(rc) !=
                     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

        /*
         * For efficiency, this mirrors the checks for using the valid
         * cache in kvm_read_guest_offset_cached(), but just uses
         * __get_user() instead. And falls back to the slow path.
         */
        if (!evtchn_pending_sel && ghc_valid) {
                /* Fast path */
                pagefault_disable();
                err = __get_user(rc, (u8 __user *)ghc->hva + offset);
                pagefault_enable();
                if (!err)
                        return rc;
        }

        /* Slow path */

        /*
         * This function gets called from kvm_vcpu_block() after setting the
         * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
         * from a HLT. So we really mustn't sleep. If the page ended up absent
         * at that point, just return 1 in order to trigger an immediate wake,
         * and we'll end up getting called again from a context where we *can*
         * fault in the page and wait for it.
         */
        if (atomic)
                return 1;

        if (!ghc_valid) {
                err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
                if (err || !ghc->memslot) {
                        /*
                         * If this failed, userspace has screwed up the
                         * vcpu_info mapping. No interrupts for you.
                         */
                        return 0;
                }
        }

        /*
         * Now we have a valid (protected by srcu) userspace HVA in
         * ghc->hva which points to the struct vcpu_info. If there
         * are any bits in the in-kernel evtchn_pending_sel then
         * we need to write those to the guest vcpu_info and set
         * its evtchn_upcall_pending flag. If there aren't any bits
         * to add, we only want to *check* evtchn_upcall_pending.
         */
        if (evtchn_pending_sel) {
                bool long_mode = v->kvm->arch.xen.long_mode;

                if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
                        return 0;

                if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
                        struct vcpu_info __user *vi = (void __user *)ghc->hva;

                        /* Attempt to set the evtchn_pending_sel bits in the
                         * guest, and if that succeeds then clear the same
                         * bits in the in-kernel version. */
                        asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
                                     "\tnotq %0\n"
                                     "\t" LOCK_PREFIX "andq %0, %2\n"
                                     "2:\n"
                                     "\t.section .fixup,\"ax\"\n"
                                     "3:\tjmp\t2b\n"
                                     "\t.previous\n"
                                     _ASM_EXTABLE_UA(1b, 3b)
                                     : "=r" (evtchn_pending_sel),
                                       "+m" (vi->evtchn_pending_sel),
                                       "+m" (v->arch.xen.evtchn_pending_sel)
                                     : "0" (evtchn_pending_sel));
                } else {
                        struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
                        u32 evtchn_pending_sel32 = evtchn_pending_sel;

                        /* Attempt to set the evtchn_pending_sel bits in the
                         * guest, and if that succeeds then clear the same
                         * bits in the in-kernel version. */
                        asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
                                     "\tnotl %0\n"
                                     "\t" LOCK_PREFIX "andl %0, %2\n"
                                     "2:\n"
                                     "\t.section .fixup,\"ax\"\n"
                                     "3:\tjmp\t2b\n"
                                     "\t.previous\n"
                                     _ASM_EXTABLE_UA(1b, 3b)
                                     : "=r" (evtchn_pending_sel32),
                                       "+m" (vi->evtchn_pending_sel),
                                       "+m" (v->arch.xen.evtchn_pending_sel)
                                     : "0" (evtchn_pending_sel32));
                }
                rc = 1;
                unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);

 err:
                user_access_end();

                mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
        } else {
                __get_user(rc, (u8 __user *)ghc->hva + offset);
        }

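        /* rc is non-zero here when a Xen upcall is pending for this vCPU. */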
        return rc;
}

int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
        int r = -ENOENT;

        mutex_lock(&kvm->lock);

--- 479 unchanged lines hidden ---

        vcpu->run->xen.u.hcall.params[4] = params[4];
        vcpu->run->xen.u.hcall.params[5] = params[5];
        vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
        vcpu->arch.complete_userspace_io =
                kvm_xen_hypercall_complete_userspace;

        return 0;
}

static inline int max_evtchn_port(struct kvm *kvm)
{
        if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
                return EVTCHN_2L_NR_CHANNELS;
        else
                return COMPAT_EVTCHN_2L_NR_CHANNELS;
}
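
/*
 * Note: with the Xen 2-level event channel ABI the limits above work out
 * to 64 * 64 = 4096 ports for a 64-bit guest and 32 * 32 = 1024 ports for
 * a 32-bit (compat) guest; the authoritative constants are in the
 * interface headers included above.
 */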

/*
 * This follows the kvm_set_irq() API, so it returns:
 * < 0   Interrupt was ignored (masked or not delivered for other reasons)
 * = 0   Interrupt was coalesced (previous irq is still pending)
 * > 0   Number of CPUs interrupt was delivered to
 */
int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
                            struct kvm *kvm)
{
        struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
        struct kvm_vcpu *vcpu;
        unsigned long *pending_bits, *mask_bits;
        unsigned long flags;
        int port_word_bit;
        bool kick_vcpu = false;
        int idx;
        int rc;

        vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
        if (!vcpu)
                return -1;

        if (!vcpu->arch.xen.vcpu_info_set)
                return -1;

        if (e->xen_evtchn.port >= max_evtchn_port(kvm))
                return -1;

        rc = -EWOULDBLOCK;
        read_lock_irqsave(&gpc->lock, flags);

        idx = srcu_read_lock(&kvm->srcu);
        if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
                goto out_rcu;

        if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
                struct shared_info *shinfo = gpc->khva;
                pending_bits = (unsigned long *)&shinfo->evtchn_pending;
                mask_bits = (unsigned long *)&shinfo->evtchn_mask;
                port_word_bit = e->xen_evtchn.port / 64;
        } else {
                struct compat_shared_info *shinfo = gpc->khva;
                pending_bits = (unsigned long *)&shinfo->evtchn_pending;
                mask_bits = (unsigned long *)&shinfo->evtchn_mask;
                port_word_bit = e->xen_evtchn.port / 32;
        }

        /*
         * If this port wasn't already set, and if it isn't masked, then
         * we try to set the corresponding bit in the in-kernel shadow of
         * evtchn_pending_sel for the target vCPU. And if *that* wasn't
         * already set, then we kick the vCPU in question to write to the
         * *real* evtchn_pending_sel in its own guest vcpu_info struct.
         */
        if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
                rc = 0; /* It was already raised */
        } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
                rc = -1; /* Masked */
        } else {
                rc = 1; /* Delivered. But was the vCPU waking already? */
                if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
                        kick_vcpu = true;
        }

 out_rcu:
        srcu_read_unlock(&kvm->srcu, idx);
        read_unlock_irqrestore(&gpc->lock, flags);

        if (kick_vcpu) {
                kvm_make_request(KVM_REQ_EVENT, vcpu);
                kvm_vcpu_kick(vcpu);
        }

        return rc;
}
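
/*
 * A worked example of the two-level layout handled above: delivering
 * port 67 to a 64-bit guest sets bit 67 of shared_info->evtchn_pending
 * (word 1, bit 3), and records word index 67 / 64 = 1 in the vCPU's
 * in-kernel evtchn_pending_sel shadow so that __kvm_xen_has_interrupt()
 * can later propagate it into the guest's vcpu_info.
 */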

/* This is the version called from kvm_set_irq() as the .set function */
static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
                         int irq_source_id, int level, bool line_status)
{
        bool mm_borrowed = false;
        int rc;

        if (!level)
                return -1;

        rc = kvm_xen_set_evtchn_fast(e, kvm);
        if (rc != -EWOULDBLOCK)
                return rc;

        if (current->mm != kvm->mm) {
                /*
                 * If not on a thread which already belongs to this KVM,
                 * we'd better be in the irqfd workqueue.
                 */
                if (WARN_ON_ONCE(current->mm))
                        return -EINVAL;

                kthread_use_mm(kvm->mm);
                mm_borrowed = true;
        }

        /*
         * For the irqfd workqueue, using the main kvm->lock mutex is
         * fine since this function is invoked from kvm_set_irq() with
         * no other lock held, no srcu. In future if it will be called
         * directly from a vCPU thread (e.g. on hypercall for an IPI)
         * then it may need to switch to using a leaf-node mutex for
         * serializing the shared_info mapping.
         */
        mutex_lock(&kvm->lock);

        /*
         * It is theoretically possible for the page to be unmapped
         * and the MMU notifier to invalidate the shared_info before
         * we even get to use it. In that case, this looks like an
         * infinite loop. It was tempting to do it via the userspace
         * HVA instead... but that just *hides* the fact that it's
         * an infinite loop, because if a fault occurs and it waits
         * for the page to come back, it can *still* immediately
         * fault and have to wait again, repeatedly.
         *
         * Conversely, the page could also have been reinstated by
         * another thread before we even obtain the mutex above, so
         * check again *first* before remapping it.
         */
        do {
                struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
                int idx;

                rc = kvm_xen_set_evtchn_fast(e, kvm);
                if (rc != -EWOULDBLOCK)
                        break;

                idx = srcu_read_lock(&kvm->srcu);
                rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
                                                  PAGE_SIZE, false);
                srcu_read_unlock(&kvm->srcu, idx);
        } while(!rc);

        mutex_unlock(&kvm->lock);

        if (mm_borrowed)
                kthread_unuse_mm(kvm->mm);

        return rc;
}

int kvm_xen_setup_evtchn(struct kvm *kvm,
                         struct kvm_kernel_irq_routing_entry *e,
                         const struct kvm_irq_routing_entry *ue)

{
        if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
                return -EINVAL;

        /* We only support 2 level event channels for now */
        if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
                return -EINVAL;

        e->xen_evtchn.port = ue->u.xen_evtchn.port;
        e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
        e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
        e->set = evtchn_set_fn;

        return 0;
}
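
For reference, a rough userspace sketch of how an event channel could be
wired up to the routing code above via KVM_SET_GSI_ROUTING. It assumes the
KVM_IRQ_ROUTING_XEN_EVTCHN routing type and the u.xen_evtchn.{port,vcpu,priority}
uapi fields that kvm_xen_setup_evtchn() consumes; names such as
route_xen_evtchn() are illustrative only, and the exact structure layout
should be checked against the matching linux/kvm.h.

#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Illustrative only: route guest event channel 'port' to Xen vCPU 'vcpu'. */
static int route_xen_evtchn(int vm_fd, unsigned int gsi, unsigned int port,
                            unsigned int vcpu)
{
        struct kvm_irq_routing *table;
        size_t sz = sizeof(*table) + sizeof(struct kvm_irq_routing_entry);
        int ret;

        table = calloc(1, sz);
        if (!table)
                return -1;

        table->nr = 1;
        table->entries[0].gsi = gsi;
        table->entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
        table->entries[0].u.xen_evtchn.port = port;     /* checked against max_evtchn_port() */
        table->entries[0].u.xen_evtchn.vcpu = vcpu;
        table->entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

        /* kvm_xen_setup_evtchn() validates this entry when the table is installed. */
        ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
        free(table);
        return ret;
}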