1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * derived from drivers/kvm/kvm_main.c 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright (C) 2008 Qumranet, Inc. 9 * Copyright IBM Corporation, 2008 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 11 * 12 * Authors: 13 * Avi Kivity <avi@qumranet.com> 14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Amit Shah <amit.shah@qumranet.com> 16 * Ben-Ami Yassour <benami@il.ibm.com> 17 */ 18 19 #include <linux/kvm_host.h> 20 #include "irq.h" 21 #include "ioapic.h" 22 #include "mmu.h" 23 #include "i8254.h" 24 #include "tss.h" 25 #include "kvm_cache_regs.h" 26 #include "kvm_emulate.h" 27 #include "x86.h" 28 #include "cpuid.h" 29 #include "pmu.h" 30 #include "hyperv.h" 31 #include "lapic.h" 32 #include "xen.h" 33 34 #include <linux/clocksource.h> 35 #include <linux/interrupt.h> 36 #include <linux/kvm.h> 37 #include <linux/fs.h> 38 #include <linux/vmalloc.h> 39 #include <linux/export.h> 40 #include <linux/moduleparam.h> 41 #include <linux/mman.h> 42 #include <linux/highmem.h> 43 #include <linux/iommu.h> 44 #include <linux/intel-iommu.h> 45 #include <linux/cpufreq.h> 46 #include <linux/user-return-notifier.h> 47 #include <linux/srcu.h> 48 #include <linux/slab.h> 49 #include <linux/perf_event.h> 50 #include <linux/uaccess.h> 51 #include <linux/hash.h> 52 #include <linux/pci.h> 53 #include <linux/timekeeper_internal.h> 54 #include <linux/pvclock_gtod.h> 55 #include <linux/kvm_irqfd.h> 56 #include <linux/irqbypass.h> 57 #include <linux/sched/stat.h> 58 #include <linux/sched/isolation.h> 59 #include <linux/mem_encrypt.h> 60 #include <linux/entry-kvm.h> 61 62 #include <trace/events/kvm.h> 63 64 #include <asm/debugreg.h> 65 #include <asm/msr.h> 66 #include <asm/desc.h> 67 #include <asm/mce.h> 68 #include <linux/kernel_stat.h> 69 #include <asm/fpu/internal.h> /* Ugh! 
*/ 70 #include <asm/pvclock.h> 71 #include <asm/div64.h> 72 #include <asm/irq_remapping.h> 73 #include <asm/mshyperv.h> 74 #include <asm/hypervisor.h> 75 #include <asm/tlbflush.h> 76 #include <asm/intel_pt.h> 77 #include <asm/emulate_prefix.h> 78 #include <asm/sgx.h> 79 #include <clocksource/hyperv_timer.h> 80 81 #define CREATE_TRACE_POINTS 82 #include "trace.h" 83 84 #define MAX_IO_MSRS 256 85 #define KVM_MAX_MCE_BANKS 32 86 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; 87 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); 88 89 #define emul_to_vcpu(ctxt) \ 90 ((struct kvm_vcpu *)(ctxt)->vcpu) 91 92 /* EFER defaults: 93 * - enable syscall per default because its emulated by KVM 94 * - enable LME and LMA per default on 64 bit KVM 95 */ 96 #ifdef CONFIG_X86_64 97 static 98 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); 99 #else 100 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); 101 #endif 102 103 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; 104 105 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ 106 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) 107 108 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 109 static void process_nmi(struct kvm_vcpu *vcpu); 110 static void process_smi(struct kvm_vcpu *vcpu); 111 static void enter_smm(struct kvm_vcpu *vcpu); 112 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 113 static void store_regs(struct kvm_vcpu *vcpu); 114 static int sync_regs(struct kvm_vcpu *vcpu); 115 116 struct kvm_x86_ops kvm_x86_ops __read_mostly; 117 EXPORT_SYMBOL_GPL(kvm_x86_ops); 118 119 #define KVM_X86_OP(func) \ 120 DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ 121 *(((struct kvm_x86_ops *)0)->func)); 122 #define KVM_X86_OP_NULL KVM_X86_OP 123 #include <asm/kvm-x86-ops.h> 124 EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); 125 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); 126 EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); 127 128 static bool __read_mostly ignore_msrs = 0; 129 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); 130 131 bool __read_mostly report_ignored_msrs = true; 132 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); 133 EXPORT_SYMBOL_GPL(report_ignored_msrs); 134 135 unsigned int min_timer_period_us = 200; 136 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); 137 138 static bool __read_mostly kvmclock_periodic_sync = true; 139 module_param(kvmclock_periodic_sync, bool, S_IRUGO); 140 141 bool __read_mostly kvm_has_tsc_control; 142 EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 143 u32 __read_mostly kvm_max_guest_tsc_khz; 144 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 145 u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; 146 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); 147 u64 __read_mostly kvm_max_tsc_scaling_ratio; 148 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); 149 u64 __read_mostly kvm_default_tsc_scaling_ratio; 150 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); 151 bool __read_mostly kvm_has_bus_lock_exit; 152 EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); 153 154 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ 155 static u32 __read_mostly tsc_tolerance_ppm = 250; 156 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 157 158 /* 159 * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables 160 * adaptive tuning starting from default advancement of 1000ns. '0' disables 161 * advancement entirely. 
Any other value is used as-is and disables adaptive 162 * tuning, i.e. allows privileged userspace to set an exact advancement time. 163 */ 164 static int __read_mostly lapic_timer_advance_ns = -1; 165 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); 166 167 static bool __read_mostly vector_hashing = true; 168 module_param(vector_hashing, bool, S_IRUGO); 169 170 bool __read_mostly enable_vmware_backdoor = false; 171 module_param(enable_vmware_backdoor, bool, S_IRUGO); 172 EXPORT_SYMBOL_GPL(enable_vmware_backdoor); 173 174 static bool __read_mostly force_emulation_prefix = false; 175 module_param(force_emulation_prefix, bool, S_IRUGO); 176 177 int __read_mostly pi_inject_timer = -1; 178 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); 179 180 /* 181 * Restoring the host value for MSRs that are only consumed when running in 182 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU 183 * returns to userspace, i.e. the kernel can run with the guest's value. 184 */ 185 #define KVM_MAX_NR_USER_RETURN_MSRS 16 186 187 struct kvm_user_return_msrs_global { 188 int nr; 189 u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; 190 }; 191 192 struct kvm_user_return_msrs { 193 struct user_return_notifier urn; 194 bool registered; 195 struct kvm_user_return_msr_values { 196 u64 host; 197 u64 curr; 198 } values[KVM_MAX_NR_USER_RETURN_MSRS]; 199 }; 200 201 static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; 202 static struct kvm_user_return_msrs __percpu *user_return_msrs; 203 204 #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ 205 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ 206 | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ 207 | XFEATURE_MASK_PKRU) 208 209 u64 __read_mostly host_efer; 210 EXPORT_SYMBOL_GPL(host_efer); 211 212 bool __read_mostly allow_smaller_maxphyaddr = 0; 213 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); 214 215 u64 __read_mostly host_xss; 216 EXPORT_SYMBOL_GPL(host_xss); 217 u64 __read_mostly supported_xss; 218 EXPORT_SYMBOL_GPL(supported_xss); 219 220 struct kvm_stats_debugfs_item debugfs_entries[] = { 221 VCPU_STAT("pf_fixed", pf_fixed), 222 VCPU_STAT("pf_guest", pf_guest), 223 VCPU_STAT("tlb_flush", tlb_flush), 224 VCPU_STAT("invlpg", invlpg), 225 VCPU_STAT("exits", exits), 226 VCPU_STAT("io_exits", io_exits), 227 VCPU_STAT("mmio_exits", mmio_exits), 228 VCPU_STAT("signal_exits", signal_exits), 229 VCPU_STAT("irq_window", irq_window_exits), 230 VCPU_STAT("nmi_window", nmi_window_exits), 231 VCPU_STAT("halt_exits", halt_exits), 232 VCPU_STAT("halt_successful_poll", halt_successful_poll), 233 VCPU_STAT("halt_attempted_poll", halt_attempted_poll), 234 VCPU_STAT("halt_poll_invalid", halt_poll_invalid), 235 VCPU_STAT("halt_wakeup", halt_wakeup), 236 VCPU_STAT("hypercalls", hypercalls), 237 VCPU_STAT("request_irq", request_irq_exits), 238 VCPU_STAT("irq_exits", irq_exits), 239 VCPU_STAT("host_state_reload", host_state_reload), 240 VCPU_STAT("fpu_reload", fpu_reload), 241 VCPU_STAT("insn_emulation", insn_emulation), 242 VCPU_STAT("insn_emulation_fail", insn_emulation_fail), 243 VCPU_STAT("irq_injections", irq_injections), 244 VCPU_STAT("nmi_injections", nmi_injections), 245 VCPU_STAT("req_event", req_event), 246 VCPU_STAT("l1d_flush", l1d_flush), 247 VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), 248 VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), 249 VCPU_STAT("nested_run", nested_run), 250 VCPU_STAT("directed_yield_attempted", directed_yield_attempted), 251 VCPU_STAT("directed_yield_successful", 
directed_yield_successful), 252 VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), 253 VM_STAT("mmu_pte_write", mmu_pte_write), 254 VM_STAT("mmu_pde_zapped", mmu_pde_zapped), 255 VM_STAT("mmu_flooded", mmu_flooded), 256 VM_STAT("mmu_recycled", mmu_recycled), 257 VM_STAT("mmu_cache_miss", mmu_cache_miss), 258 VM_STAT("mmu_unsync", mmu_unsync), 259 VM_STAT("remote_tlb_flush", remote_tlb_flush), 260 VM_STAT("largepages", lpages, .mode = 0444), 261 VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), 262 VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), 263 { NULL } 264 }; 265 266 u64 __read_mostly host_xcr0; 267 u64 __read_mostly supported_xcr0; 268 EXPORT_SYMBOL_GPL(supported_xcr0); 269 270 static struct kmem_cache *x86_fpu_cache; 271 272 static struct kmem_cache *x86_emulator_cache; 273 274 /* 275 * When called, it means the previous get/set msr reached an invalid msr. 276 * Return true if we want to ignore/silent this failed msr access. 277 */ 278 static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) 279 { 280 const char *op = write ? "wrmsr" : "rdmsr"; 281 282 if (ignore_msrs) { 283 if (report_ignored_msrs) 284 kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", 285 op, msr, data); 286 /* Mask the error */ 287 return true; 288 } else { 289 kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 290 op, msr, data); 291 return false; 292 } 293 } 294 295 static struct kmem_cache *kvm_alloc_emulator_cache(void) 296 { 297 unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); 298 unsigned int size = sizeof(struct x86_emulate_ctxt); 299 300 return kmem_cache_create_usercopy("x86_emulator", size, 301 __alignof__(struct x86_emulate_ctxt), 302 SLAB_ACCOUNT, useroffset, 303 size - useroffset, NULL); 304 } 305 306 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 307 308 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 309 { 310 int i; 311 for (i = 0; i < ASYNC_PF_PER_VCPU; i++) 312 vcpu->arch.apf.gfns[i] = ~0; 313 } 314 315 static void kvm_on_user_return(struct user_return_notifier *urn) 316 { 317 unsigned slot; 318 struct kvm_user_return_msrs *msrs 319 = container_of(urn, struct kvm_user_return_msrs, urn); 320 struct kvm_user_return_msr_values *values; 321 unsigned long flags; 322 323 /* 324 * Disabling irqs at this point since the following code could be 325 * interrupted and executed through kvm_arch_hardware_disable() 326 */ 327 local_irq_save(flags); 328 if (msrs->registered) { 329 msrs->registered = false; 330 user_return_notifier_unregister(urn); 331 } 332 local_irq_restore(flags); 333 for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { 334 values = &msrs->values[slot]; 335 if (values->host != values->curr) { 336 wrmsrl(user_return_msrs_global.msrs[slot], values->host); 337 values->curr = values->host; 338 } 339 } 340 } 341 342 void kvm_define_user_return_msr(unsigned slot, u32 msr) 343 { 344 BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); 345 user_return_msrs_global.msrs[slot] = msr; 346 if (slot >= user_return_msrs_global.nr) 347 user_return_msrs_global.nr = slot + 1; 348 } 349 EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); 350 351 static void kvm_user_return_msr_cpu_online(void) 352 { 353 unsigned int cpu = smp_processor_id(); 354 struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 355 u64 value; 356 int i; 357 358 for (i = 0; i < user_return_msrs_global.nr; ++i) { 359 rdmsrl_safe(user_return_msrs_global.msrs[i], &value); 360 msrs->values[i].host = value; 361 msrs->values[i].curr 
= value; 362 } 363 } 364 365 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) 366 { 367 unsigned int cpu = smp_processor_id(); 368 struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 369 int err; 370 371 value = (value & mask) | (msrs->values[slot].host & ~mask); 372 if (value == msrs->values[slot].curr) 373 return 0; 374 err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); 375 if (err) 376 return 1; 377 378 msrs->values[slot].curr = value; 379 if (!msrs->registered) { 380 msrs->urn.on_user_return = kvm_on_user_return; 381 user_return_notifier_register(&msrs->urn); 382 msrs->registered = true; 383 } 384 return 0; 385 } 386 EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); 387 388 static void drop_user_return_notifiers(void) 389 { 390 unsigned int cpu = smp_processor_id(); 391 struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 392 393 if (msrs->registered) 394 kvm_on_user_return(&msrs->urn); 395 } 396 397 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 398 { 399 return vcpu->arch.apic_base; 400 } 401 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 402 403 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) 404 { 405 return kvm_apic_mode(kvm_get_apic_base(vcpu)); 406 } 407 EXPORT_SYMBOL_GPL(kvm_get_apic_mode); 408 409 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 410 { 411 enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); 412 enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); 413 u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | 414 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); 415 416 if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) 417 return 1; 418 if (!msr_info->host_initiated) { 419 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) 420 return 1; 421 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) 422 return 1; 423 } 424 425 kvm_lapic_set_base(vcpu, msr_info->data); 426 kvm_recalculate_apic_map(vcpu->kvm); 427 return 0; 428 } 429 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 430 431 asmlinkage __visible noinstr void kvm_spurious_fault(void) 432 { 433 /* Fault while not rebooting. We want the trace. 
*/ 434 BUG_ON(!kvm_rebooting); 435 } 436 EXPORT_SYMBOL_GPL(kvm_spurious_fault); 437 438 #define EXCPT_BENIGN 0 439 #define EXCPT_CONTRIBUTORY 1 440 #define EXCPT_PF 2 441 442 static int exception_class(int vector) 443 { 444 switch (vector) { 445 case PF_VECTOR: 446 return EXCPT_PF; 447 case DE_VECTOR: 448 case TS_VECTOR: 449 case NP_VECTOR: 450 case SS_VECTOR: 451 case GP_VECTOR: 452 return EXCPT_CONTRIBUTORY; 453 default: 454 break; 455 } 456 return EXCPT_BENIGN; 457 } 458 459 #define EXCPT_FAULT 0 460 #define EXCPT_TRAP 1 461 #define EXCPT_ABORT 2 462 #define EXCPT_INTERRUPT 3 463 464 static int exception_type(int vector) 465 { 466 unsigned int mask; 467 468 if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) 469 return EXCPT_INTERRUPT; 470 471 mask = 1 << vector; 472 473 /* #DB is trap, as instruction watchpoints are handled elsewhere */ 474 if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) 475 return EXCPT_TRAP; 476 477 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) 478 return EXCPT_ABORT; 479 480 /* Reserved exceptions will result in fault */ 481 return EXCPT_FAULT; 482 } 483 484 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) 485 { 486 unsigned nr = vcpu->arch.exception.nr; 487 bool has_payload = vcpu->arch.exception.has_payload; 488 unsigned long payload = vcpu->arch.exception.payload; 489 490 if (!has_payload) 491 return; 492 493 switch (nr) { 494 case DB_VECTOR: 495 /* 496 * "Certain debug exceptions may clear bit 0-3. The 497 * remaining contents of the DR6 register are never 498 * cleared by the processor". 499 */ 500 vcpu->arch.dr6 &= ~DR_TRAP_BITS; 501 /* 502 * In order to reflect the #DB exception payload in guest 503 * dr6, three components need to be considered: active low 504 * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, 505 * DR6_BS and DR6_BT) 506 * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. 507 * In the target guest dr6: 508 * FIXED_1 bits should always be set. 509 * Active low bits should be cleared if 1-setting in payload. 510 * Active high bits should be set if 1-setting in payload. 511 * 512 * Note, the payload is compatible with the pending debug 513 * exceptions/exit qualification under VMX, that active_low bits 514 * are active high in payload. 515 * So they need to be flipped for DR6. 516 */ 517 vcpu->arch.dr6 |= DR6_ACTIVE_LOW; 518 vcpu->arch.dr6 |= payload; 519 vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; 520 521 /* 522 * The #DB payload is defined as compatible with the 'pending 523 * debug exceptions' field under VMX, not DR6. While bit 12 is 524 * defined in the 'pending debug exceptions' field (enabled 525 * breakpoint), it is reserved and must be zero in DR6. 526 */ 527 vcpu->arch.dr6 &= ~BIT(12); 528 break; 529 case PF_VECTOR: 530 vcpu->arch.cr2 = payload; 531 break; 532 } 533 534 vcpu->arch.exception.has_payload = false; 535 vcpu->arch.exception.payload = 0; 536 } 537 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); 538 539 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 540 unsigned nr, bool has_error, u32 error_code, 541 bool has_payload, unsigned long payload, bool reinject) 542 { 543 u32 prev_nr; 544 int class1, class2; 545 546 kvm_make_request(KVM_REQ_EVENT, vcpu); 547 548 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { 549 queue: 550 if (reinject) { 551 /* 552 * On vmentry, vcpu->arch.exception.pending is only 553 * true if an event injection was blocked by 554 * nested_run_pending. 
In that case, however, 555 * vcpu_enter_guest requests an immediate exit, 556 * and the guest shouldn't proceed far enough to 557 * need reinjection. 558 */ 559 WARN_ON_ONCE(vcpu->arch.exception.pending); 560 vcpu->arch.exception.injected = true; 561 if (WARN_ON_ONCE(has_payload)) { 562 /* 563 * A reinjected event has already 564 * delivered its payload. 565 */ 566 has_payload = false; 567 payload = 0; 568 } 569 } else { 570 vcpu->arch.exception.pending = true; 571 vcpu->arch.exception.injected = false; 572 } 573 vcpu->arch.exception.has_error_code = has_error; 574 vcpu->arch.exception.nr = nr; 575 vcpu->arch.exception.error_code = error_code; 576 vcpu->arch.exception.has_payload = has_payload; 577 vcpu->arch.exception.payload = payload; 578 if (!is_guest_mode(vcpu)) 579 kvm_deliver_exception_payload(vcpu); 580 return; 581 } 582 583 /* to check exception */ 584 prev_nr = vcpu->arch.exception.nr; 585 if (prev_nr == DF_VECTOR) { 586 /* triple fault -> shutdown */ 587 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 588 return; 589 } 590 class1 = exception_class(prev_nr); 591 class2 = exception_class(nr); 592 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 593 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 594 /* 595 * Generate double fault per SDM Table 5-5. Set 596 * exception.pending = true so that the double fault 597 * can trigger a nested vmexit. 598 */ 599 vcpu->arch.exception.pending = true; 600 vcpu->arch.exception.injected = false; 601 vcpu->arch.exception.has_error_code = true; 602 vcpu->arch.exception.nr = DF_VECTOR; 603 vcpu->arch.exception.error_code = 0; 604 vcpu->arch.exception.has_payload = false; 605 vcpu->arch.exception.payload = 0; 606 } else 607 /* replace previous exception with a new one in a hope 608 that instruction re-execution will regenerate lost 609 exception */ 610 goto queue; 611 } 612 613 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 614 { 615 kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); 616 } 617 EXPORT_SYMBOL_GPL(kvm_queue_exception); 618 619 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) 620 { 621 kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); 622 } 623 EXPORT_SYMBOL_GPL(kvm_requeue_exception); 624 625 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, 626 unsigned long payload) 627 { 628 kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); 629 } 630 EXPORT_SYMBOL_GPL(kvm_queue_exception_p); 631 632 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, 633 u32 error_code, unsigned long payload) 634 { 635 kvm_multiple_exception(vcpu, nr, true, error_code, 636 true, payload, false); 637 } 638 639 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) 640 { 641 if (err) 642 kvm_inject_gp(vcpu, 0); 643 else 644 return kvm_skip_emulated_instruction(vcpu); 645 646 return 1; 647 } 648 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); 649 650 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 651 { 652 ++vcpu->stat.pf_guest; 653 vcpu->arch.exception.nested_apf = 654 is_guest_mode(vcpu) && fault->async_page_fault; 655 if (vcpu->arch.exception.nested_apf) { 656 vcpu->arch.apf.nested_apf_token = fault->address; 657 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 658 } else { 659 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, 660 fault->address); 661 } 662 } 663 EXPORT_SYMBOL_GPL(kvm_inject_page_fault); 664 665 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, 666 struct x86_exception *fault) 
667 { 668 struct kvm_mmu *fault_mmu; 669 WARN_ON_ONCE(fault->vector != PF_VECTOR); 670 671 fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : 672 vcpu->arch.walk_mmu; 673 674 /* 675 * Invalidate the TLB entry for the faulting address, if it exists, 676 * else the access will fault indefinitely (and to emulate hardware). 677 */ 678 if ((fault->error_code & PFERR_PRESENT_MASK) && 679 !(fault->error_code & PFERR_RSVD_MASK)) 680 kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, 681 fault_mmu->root_hpa); 682 683 fault_mmu->inject_page_fault(vcpu, fault); 684 return fault->nested_page_fault; 685 } 686 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); 687 688 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 689 { 690 atomic_inc(&vcpu->arch.nmi_queued); 691 kvm_make_request(KVM_REQ_NMI, vcpu); 692 } 693 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 694 695 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 696 { 697 kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); 698 } 699 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 700 701 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 702 { 703 kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); 704 } 705 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); 706 707 /* 708 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 709 * a #GP and return false. 710 */ 711 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 712 { 713 if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) 714 return true; 715 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 716 return false; 717 } 718 EXPORT_SYMBOL_GPL(kvm_require_cpl); 719 720 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) 721 { 722 if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 723 return true; 724 725 kvm_queue_exception(vcpu, UD_VECTOR); 726 return false; 727 } 728 EXPORT_SYMBOL_GPL(kvm_require_dr); 729 730 /* 731 * This function will be used to read from the physical memory of the currently 732 * running guest. The difference to kvm_vcpu_read_guest_page is that this function 733 * can read from guest physical or from the guest's guest physical memory. 734 */ 735 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 736 gfn_t ngfn, void *data, int offset, int len, 737 u32 access) 738 { 739 struct x86_exception exception; 740 gfn_t real_gfn; 741 gpa_t ngpa; 742 743 ngpa = gfn_to_gpa(ngfn); 744 real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); 745 if (real_gfn == UNMAPPED_GVA) 746 return -EFAULT; 747 748 real_gfn = gpa_to_gfn(real_gfn); 749 750 return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); 751 } 752 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 753 754 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 755 void *data, int offset, int len, u32 access) 756 { 757 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 758 data, offset, len, access); 759 } 760 761 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) 762 { 763 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); 764 } 765 766 /* 767 * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. 
768 */ 769 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) 770 { 771 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 772 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 773 int i; 774 int ret; 775 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; 776 777 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, 778 offset * sizeof(u64), sizeof(pdpte), 779 PFERR_USER_MASK|PFERR_WRITE_MASK); 780 if (ret < 0) { 781 ret = 0; 782 goto out; 783 } 784 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 785 if ((pdpte[i] & PT_PRESENT_MASK) && 786 (pdpte[i] & pdptr_rsvd_bits(vcpu))) { 787 ret = 0; 788 goto out; 789 } 790 } 791 ret = 1; 792 793 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); 794 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); 795 796 out: 797 798 return ret; 799 } 800 EXPORT_SYMBOL_GPL(load_pdptrs); 801 802 bool pdptrs_changed(struct kvm_vcpu *vcpu) 803 { 804 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; 805 int offset; 806 gfn_t gfn; 807 int r; 808 809 if (!is_pae_paging(vcpu)) 810 return false; 811 812 if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) 813 return true; 814 815 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; 816 offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); 817 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 818 PFERR_USER_MASK | PFERR_WRITE_MASK); 819 if (r < 0) 820 return true; 821 822 return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; 823 } 824 EXPORT_SYMBOL_GPL(pdptrs_changed); 825 826 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) 827 { 828 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; 829 830 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 831 kvm_clear_async_pf_completion_queue(vcpu); 832 kvm_async_pf_hash_reset(vcpu); 833 } 834 835 if ((cr0 ^ old_cr0) & update_bits) 836 kvm_mmu_reset_context(vcpu); 837 838 if (((cr0 ^ old_cr0) & X86_CR0_CD) && 839 kvm_arch_has_noncoherent_dma(vcpu->kvm) && 840 !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 841 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); 842 } 843 EXPORT_SYMBOL_GPL(kvm_post_set_cr0); 844 845 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 846 { 847 unsigned long old_cr0 = kvm_read_cr0(vcpu); 848 unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; 849 850 cr0 |= X86_CR0_ET; 851 852 #ifdef CONFIG_X86_64 853 if (cr0 & 0xffffffff00000000UL) 854 return 1; 855 #endif 856 857 cr0 &= ~CR0_RESERVED_BITS; 858 859 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 860 return 1; 861 862 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 863 return 1; 864 865 #ifdef CONFIG_X86_64 866 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && 867 (cr0 & X86_CR0_PG)) { 868 int cs_db, cs_l; 869 870 if (!is_pae(vcpu)) 871 return 1; 872 static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); 873 if (cs_l) 874 return 1; 875 } 876 #endif 877 if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && 878 is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && 879 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) 880 return 1; 881 882 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) 883 return 1; 884 885 static_call(kvm_x86_set_cr0)(vcpu, cr0); 886 887 kvm_post_set_cr0(vcpu, old_cr0, cr0); 888 889 return 0; 890 } 891 EXPORT_SYMBOL_GPL(kvm_set_cr0); 892 893 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 894 { 895 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 896 } 897 EXPORT_SYMBOL_GPL(kvm_lmsw); 898 899 void 
kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) 900 { 901 if (vcpu->arch.guest_state_protected) 902 return; 903 904 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { 905 906 if (vcpu->arch.xcr0 != host_xcr0) 907 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 908 909 if (vcpu->arch.xsaves_enabled && 910 vcpu->arch.ia32_xss != host_xss) 911 wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); 912 } 913 914 if (static_cpu_has(X86_FEATURE_PKU) && 915 (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || 916 (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && 917 vcpu->arch.pkru != vcpu->arch.host_pkru) 918 __write_pkru(vcpu->arch.pkru); 919 } 920 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); 921 922 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) 923 { 924 if (vcpu->arch.guest_state_protected) 925 return; 926 927 if (static_cpu_has(X86_FEATURE_PKU) && 928 (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || 929 (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { 930 vcpu->arch.pkru = rdpkru(); 931 if (vcpu->arch.pkru != vcpu->arch.host_pkru) 932 __write_pkru(vcpu->arch.host_pkru); 933 } 934 935 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { 936 937 if (vcpu->arch.xcr0 != host_xcr0) 938 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 939 940 if (vcpu->arch.xsaves_enabled && 941 vcpu->arch.ia32_xss != host_xss) 942 wrmsrl(MSR_IA32_XSS, host_xss); 943 } 944 945 } 946 EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); 947 948 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 949 { 950 u64 xcr0 = xcr; 951 u64 old_xcr0 = vcpu->arch.xcr0; 952 u64 valid_bits; 953 954 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 955 if (index != XCR_XFEATURE_ENABLED_MASK) 956 return 1; 957 if (!(xcr0 & XFEATURE_MASK_FP)) 958 return 1; 959 if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) 960 return 1; 961 962 /* 963 * Do not allow the guest to set bits that we do not support 964 * saving. However, xcr0 bit 0 is always set, even if the 965 * emulated CPU does not support XSAVE (see fx_init). 
966 */ 967 valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; 968 if (xcr0 & ~valid_bits) 969 return 1; 970 971 if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != 972 (!(xcr0 & XFEATURE_MASK_BNDCSR))) 973 return 1; 974 975 if (xcr0 & XFEATURE_MASK_AVX512) { 976 if (!(xcr0 & XFEATURE_MASK_YMM)) 977 return 1; 978 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) 979 return 1; 980 } 981 vcpu->arch.xcr0 = xcr0; 982 983 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) 984 kvm_update_cpuid_runtime(vcpu); 985 return 0; 986 } 987 988 int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) 989 { 990 if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || 991 __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { 992 kvm_inject_gp(vcpu, 0); 993 return 1; 994 } 995 996 return kvm_skip_emulated_instruction(vcpu); 997 } 998 EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); 999 1000 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1001 { 1002 if (cr4 & cr4_reserved_bits) 1003 return false; 1004 1005 if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) 1006 return false; 1007 1008 return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); 1009 } 1010 EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); 1011 1012 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) 1013 { 1014 unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | 1015 X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; 1016 1017 if (((cr4 ^ old_cr4) & mmu_role_bits) || 1018 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) 1019 kvm_mmu_reset_context(vcpu); 1020 } 1021 EXPORT_SYMBOL_GPL(kvm_post_set_cr4); 1022 1023 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1024 { 1025 unsigned long old_cr4 = kvm_read_cr4(vcpu); 1026 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | 1027 X86_CR4_SMEP; 1028 1029 if (!kvm_is_valid_cr4(vcpu, cr4)) 1030 return 1; 1031 1032 if (is_long_mode(vcpu)) { 1033 if (!(cr4 & X86_CR4_PAE)) 1034 return 1; 1035 if ((cr4 ^ old_cr4) & X86_CR4_LA57) 1036 return 1; 1037 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 1038 && ((cr4 ^ old_cr4) & pdptr_bits) 1039 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 1040 kvm_read_cr3(vcpu))) 1041 return 1; 1042 1043 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { 1044 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) 1045 return 1; 1046 1047 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ 1048 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) 1049 return 1; 1050 } 1051 1052 static_call(kvm_x86_set_cr4)(vcpu, cr4); 1053 1054 kvm_post_set_cr4(vcpu, old_cr4, cr4); 1055 1056 return 0; 1057 } 1058 EXPORT_SYMBOL_GPL(kvm_set_cr4); 1059 1060 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1061 { 1062 bool skip_tlb_flush = false; 1063 #ifdef CONFIG_X86_64 1064 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); 1065 1066 if (pcid_enabled) { 1067 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; 1068 cr3 &= ~X86_CR3_PCID_NOFLUSH; 1069 } 1070 #endif 1071 1072 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { 1073 if (!skip_tlb_flush) { 1074 kvm_mmu_sync_roots(vcpu); 1075 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1076 } 1077 return 0; 1078 } 1079 1080 /* 1081 * Do not condition the GPA check on long mode, this helper is used to 1082 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that 1083 * the current vCPU mode is accurate. 
1084 */ 1085 if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) 1086 return 1; 1087 1088 if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 1089 return 1; 1090 1091 kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); 1092 vcpu->arch.cr3 = cr3; 1093 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 1094 1095 return 0; 1096 } 1097 EXPORT_SYMBOL_GPL(kvm_set_cr3); 1098 1099 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 1100 { 1101 if (cr8 & CR8_RESERVED_BITS) 1102 return 1; 1103 if (lapic_in_kernel(vcpu)) 1104 kvm_lapic_set_tpr(vcpu, cr8); 1105 else 1106 vcpu->arch.cr8 = cr8; 1107 return 0; 1108 } 1109 EXPORT_SYMBOL_GPL(kvm_set_cr8); 1110 1111 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 1112 { 1113 if (lapic_in_kernel(vcpu)) 1114 return kvm_lapic_get_cr8(vcpu); 1115 else 1116 return vcpu->arch.cr8; 1117 } 1118 EXPORT_SYMBOL_GPL(kvm_get_cr8); 1119 1120 static void kvm_update_dr0123(struct kvm_vcpu *vcpu) 1121 { 1122 int i; 1123 1124 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1125 for (i = 0; i < KVM_NR_DB_REGS; i++) 1126 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 1127 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; 1128 } 1129 } 1130 1131 void kvm_update_dr7(struct kvm_vcpu *vcpu) 1132 { 1133 unsigned long dr7; 1134 1135 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1136 dr7 = vcpu->arch.guest_debug_dr7; 1137 else 1138 dr7 = vcpu->arch.dr7; 1139 static_call(kvm_x86_set_dr7)(vcpu, dr7); 1140 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; 1141 if (dr7 & DR7_BP_EN_MASK) 1142 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; 1143 } 1144 EXPORT_SYMBOL_GPL(kvm_update_dr7); 1145 1146 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) 1147 { 1148 u64 fixed = DR6_FIXED_1; 1149 1150 if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) 1151 fixed |= DR6_RTM; 1152 return fixed; 1153 } 1154 1155 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 1156 { 1157 size_t size = ARRAY_SIZE(vcpu->arch.db); 1158 1159 switch (dr) { 1160 case 0 ... 3: 1161 vcpu->arch.db[array_index_nospec(dr, size)] = val; 1162 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1163 vcpu->arch.eff_db[dr] = val; 1164 break; 1165 case 4: 1166 case 6: 1167 if (!kvm_dr6_valid(val)) 1168 return 1; /* #GP */ 1169 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); 1170 break; 1171 case 5: 1172 default: /* 7 */ 1173 if (!kvm_dr7_valid(val)) 1174 return 1; /* #GP */ 1175 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 1176 kvm_update_dr7(vcpu); 1177 break; 1178 } 1179 1180 return 0; 1181 } 1182 EXPORT_SYMBOL_GPL(kvm_set_dr); 1183 1184 void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 1185 { 1186 size_t size = ARRAY_SIZE(vcpu->arch.db); 1187 1188 switch (dr) { 1189 case 0 ... 
3: 1190 *val = vcpu->arch.db[array_index_nospec(dr, size)]; 1191 break; 1192 case 4: 1193 case 6: 1194 *val = vcpu->arch.dr6; 1195 break; 1196 case 5: 1197 default: /* 7 */ 1198 *val = vcpu->arch.dr7; 1199 break; 1200 } 1201 } 1202 EXPORT_SYMBOL_GPL(kvm_get_dr); 1203 1204 int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) 1205 { 1206 u32 ecx = kvm_rcx_read(vcpu); 1207 u64 data; 1208 1209 if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { 1210 kvm_inject_gp(vcpu, 0); 1211 return 1; 1212 } 1213 1214 kvm_rax_write(vcpu, (u32)data); 1215 kvm_rdx_write(vcpu, data >> 32); 1216 return kvm_skip_emulated_instruction(vcpu); 1217 } 1218 EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); 1219 1220 /* 1221 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 1222 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 1223 * 1224 * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) 1225 * extract the supported MSRs from the related const lists. 1226 * msrs_to_save is selected from the msrs_to_save_all to reflect the 1227 * capabilities of the host cpu. This capabilities test skips MSRs that are 1228 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs 1229 * may depend on host virtualization features rather than host cpu features. 1230 */ 1231 1232 static const u32 msrs_to_save_all[] = { 1233 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 1234 MSR_STAR, 1235 #ifdef CONFIG_X86_64 1236 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 1237 #endif 1238 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 1239 MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 1240 MSR_IA32_SPEC_CTRL, 1241 MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 1242 MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 1243 MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 1244 MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 1245 MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 1246 MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 1247 MSR_IA32_UMWAIT_CONTROL, 1248 1249 MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1250 MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, 1251 MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1252 MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1253 MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 1254 MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 1255 MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 1256 MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 1257 MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, 1258 MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, 1259 MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, 1260 MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, 1261 MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, 1262 MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1263 MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1264 MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 1265 MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 1266 MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, 1267 MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, 1268 MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, 1269 MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, 1270 MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, 1271 }; 1272 1273 static u32 
msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; 1274 static unsigned num_msrs_to_save; 1275 1276 static const u32 emulated_msrs_all[] = { 1277 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 1278 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 1279 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 1280 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 1281 HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 1282 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 1283 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 1284 HV_X64_MSR_RESET, 1285 HV_X64_MSR_VP_INDEX, 1286 HV_X64_MSR_VP_RUNTIME, 1287 HV_X64_MSR_SCONTROL, 1288 HV_X64_MSR_STIMER0_CONFIG, 1289 HV_X64_MSR_VP_ASSIST_PAGE, 1290 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 1291 HV_X64_MSR_TSC_EMULATION_STATUS, 1292 HV_X64_MSR_SYNDBG_OPTIONS, 1293 HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 1294 HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 1295 HV_X64_MSR_SYNDBG_PENDING_BUFFER, 1296 1297 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 1298 MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 1299 1300 MSR_IA32_TSC_ADJUST, 1301 MSR_IA32_TSC_DEADLINE, 1302 MSR_IA32_ARCH_CAPABILITIES, 1303 MSR_IA32_PERF_CAPABILITIES, 1304 MSR_IA32_MISC_ENABLE, 1305 MSR_IA32_MCG_STATUS, 1306 MSR_IA32_MCG_CTL, 1307 MSR_IA32_MCG_EXT_CTL, 1308 MSR_IA32_SMBASE, 1309 MSR_SMI_COUNT, 1310 MSR_PLATFORM_INFO, 1311 MSR_MISC_FEATURES_ENABLES, 1312 MSR_AMD64_VIRT_SPEC_CTRL, 1313 MSR_IA32_POWER_CTL, 1314 MSR_IA32_UCODE_REV, 1315 1316 /* 1317 * The following list leaves out MSRs whose values are determined 1318 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. 1319 * We always support the "true" VMX control MSRs, even if the host 1320 * processor does not, so I am putting these registers here rather 1321 * than in msrs_to_save_all. 1322 */ 1323 MSR_IA32_VMX_BASIC, 1324 MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1325 MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1326 MSR_IA32_VMX_TRUE_EXIT_CTLS, 1327 MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1328 MSR_IA32_VMX_MISC, 1329 MSR_IA32_VMX_CR0_FIXED0, 1330 MSR_IA32_VMX_CR4_FIXED0, 1331 MSR_IA32_VMX_VMCS_ENUM, 1332 MSR_IA32_VMX_PROCBASED_CTLS2, 1333 MSR_IA32_VMX_EPT_VPID_CAP, 1334 MSR_IA32_VMX_VMFUNC, 1335 1336 MSR_K7_HWCR, 1337 MSR_KVM_POLL_CONTROL, 1338 }; 1339 1340 static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 1341 static unsigned num_emulated_msrs; 1342 1343 /* 1344 * List of msr numbers which are used to expose MSR-based features that 1345 * can be used by a hypervisor to validate requested CPU features. 
1346 */ 1347 static const u32 msr_based_features_all[] = { 1348 MSR_IA32_VMX_BASIC, 1349 MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1350 MSR_IA32_VMX_PINBASED_CTLS, 1351 MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1352 MSR_IA32_VMX_PROCBASED_CTLS, 1353 MSR_IA32_VMX_TRUE_EXIT_CTLS, 1354 MSR_IA32_VMX_EXIT_CTLS, 1355 MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1356 MSR_IA32_VMX_ENTRY_CTLS, 1357 MSR_IA32_VMX_MISC, 1358 MSR_IA32_VMX_CR0_FIXED0, 1359 MSR_IA32_VMX_CR0_FIXED1, 1360 MSR_IA32_VMX_CR4_FIXED0, 1361 MSR_IA32_VMX_CR4_FIXED1, 1362 MSR_IA32_VMX_VMCS_ENUM, 1363 MSR_IA32_VMX_PROCBASED_CTLS2, 1364 MSR_IA32_VMX_EPT_VPID_CAP, 1365 MSR_IA32_VMX_VMFUNC, 1366 1367 MSR_F10H_DECFG, 1368 MSR_IA32_UCODE_REV, 1369 MSR_IA32_ARCH_CAPABILITIES, 1370 MSR_IA32_PERF_CAPABILITIES, 1371 }; 1372 1373 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; 1374 static unsigned int num_msr_based_features; 1375 1376 static u64 kvm_get_arch_capabilities(void) 1377 { 1378 u64 data = 0; 1379 1380 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) 1381 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); 1382 1383 /* 1384 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that 1385 * the nested hypervisor runs with NX huge pages. If it is not, 1386 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other 1387 * L1 guests, so it need not worry about its own (L2) guests. 1388 */ 1389 data |= ARCH_CAP_PSCHANGE_MC_NO; 1390 1391 /* 1392 * If we're doing cache flushes (either "always" or "cond") 1393 * we will do one whenever the guest does a vmlaunch/vmresume. 1394 * If an outer hypervisor is doing the cache flush for us 1395 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that 1396 * capability to the guest too, and if EPT is disabled we're not 1397 * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will 1398 * require a nested hypervisor to do a flush of its own. 1399 */ 1400 if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) 1401 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; 1402 1403 if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) 1404 data |= ARCH_CAP_RDCL_NO; 1405 if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) 1406 data |= ARCH_CAP_SSB_NO; 1407 if (!boot_cpu_has_bug(X86_BUG_MDS)) 1408 data |= ARCH_CAP_MDS_NO; 1409 1410 if (!boot_cpu_has(X86_FEATURE_RTM)) { 1411 /* 1412 * If RTM=0 because the kernel has disabled TSX, the host might 1413 * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 1414 * and therefore knows that there cannot be TAA) but keep 1415 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, 1416 * and we want to allow migrating those guests to tsx=off hosts. 1417 */ 1418 data &= ~ARCH_CAP_TAA_NO; 1419 } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { 1420 data |= ARCH_CAP_TAA_NO; 1421 } else { 1422 /* 1423 * Nothing to do here; we emulate TSX_CTRL if present on the 1424 * host so the guest can choose between disabling TSX or 1425 * using VERW to clear CPU buffers. 
1426 */ 1427 } 1428 1429 return data; 1430 } 1431 1432 static int kvm_get_msr_feature(struct kvm_msr_entry *msr) 1433 { 1434 switch (msr->index) { 1435 case MSR_IA32_ARCH_CAPABILITIES: 1436 msr->data = kvm_get_arch_capabilities(); 1437 break; 1438 case MSR_IA32_UCODE_REV: 1439 rdmsrl_safe(msr->index, &msr->data); 1440 break; 1441 default: 1442 return static_call(kvm_x86_get_msr_feature)(msr); 1443 } 1444 return 0; 1445 } 1446 1447 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1448 { 1449 struct kvm_msr_entry msr; 1450 int r; 1451 1452 msr.index = index; 1453 r = kvm_get_msr_feature(&msr); 1454 1455 if (r == KVM_MSR_RET_INVALID) { 1456 /* Unconditionally clear the output for simplicity */ 1457 *data = 0; 1458 if (kvm_msr_ignored_check(index, 0, false)) 1459 r = 0; 1460 } 1461 1462 if (r) 1463 return r; 1464 1465 *data = msr.data; 1466 1467 return 0; 1468 } 1469 1470 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 1471 { 1472 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) 1473 return false; 1474 1475 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) 1476 return false; 1477 1478 if (efer & (EFER_LME | EFER_LMA) && 1479 !guest_cpuid_has(vcpu, X86_FEATURE_LM)) 1480 return false; 1481 1482 if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) 1483 return false; 1484 1485 return true; 1486 1487 } 1488 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 1489 { 1490 if (efer & efer_reserved_bits) 1491 return false; 1492 1493 return __kvm_valid_efer(vcpu, efer); 1494 } 1495 EXPORT_SYMBOL_GPL(kvm_valid_efer); 1496 1497 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1498 { 1499 u64 old_efer = vcpu->arch.efer; 1500 u64 efer = msr_info->data; 1501 int r; 1502 1503 if (efer & efer_reserved_bits) 1504 return 1; 1505 1506 if (!msr_info->host_initiated) { 1507 if (!__kvm_valid_efer(vcpu, efer)) 1508 return 1; 1509 1510 if (is_paging(vcpu) && 1511 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 1512 return 1; 1513 } 1514 1515 efer &= ~EFER_LMA; 1516 efer |= vcpu->arch.efer & EFER_LMA; 1517 1518 r = static_call(kvm_x86_set_efer)(vcpu, efer); 1519 if (r) { 1520 WARN_ON(r > 0); 1521 return r; 1522 } 1523 1524 /* Update reserved bits */ 1525 if ((efer ^ old_efer) & EFER_NX) 1526 kvm_mmu_reset_context(vcpu); 1527 1528 return 0; 1529 } 1530 1531 void kvm_enable_efer_bits(u64 mask) 1532 { 1533 efer_reserved_bits &= ~mask; 1534 } 1535 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 1536 1537 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) 1538 { 1539 struct kvm_x86_msr_filter *msr_filter; 1540 struct msr_bitmap_range *ranges; 1541 struct kvm *kvm = vcpu->kvm; 1542 bool allowed; 1543 int idx; 1544 u32 i; 1545 1546 /* x2APIC MSRs do not support filtering. 
*/ 1547 if (index >= 0x800 && index <= 0x8ff) 1548 return true; 1549 1550 idx = srcu_read_lock(&kvm->srcu); 1551 1552 msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); 1553 if (!msr_filter) { 1554 allowed = true; 1555 goto out; 1556 } 1557 1558 allowed = msr_filter->default_allow; 1559 ranges = msr_filter->ranges; 1560 1561 for (i = 0; i < msr_filter->count; i++) { 1562 u32 start = ranges[i].base; 1563 u32 end = start + ranges[i].nmsrs; 1564 u32 flags = ranges[i].flags; 1565 unsigned long *bitmap = ranges[i].bitmap; 1566 1567 if ((index >= start) && (index < end) && (flags & type)) { 1568 allowed = !!test_bit(index - start, bitmap); 1569 break; 1570 } 1571 } 1572 1573 out: 1574 srcu_read_unlock(&kvm->srcu, idx); 1575 1576 return allowed; 1577 } 1578 EXPORT_SYMBOL_GPL(kvm_msr_allowed); 1579 1580 /* 1581 * Write @data into the MSR specified by @index. Select MSR specific fault 1582 * checks are bypassed if @host_initiated is %true. 1583 * Returns 0 on success, non-0 otherwise. 1584 * Assumes vcpu_load() was already called. 1585 */ 1586 static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, 1587 bool host_initiated) 1588 { 1589 struct msr_data msr; 1590 1591 if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) 1592 return KVM_MSR_RET_FILTERED; 1593 1594 switch (index) { 1595 case MSR_FS_BASE: 1596 case MSR_GS_BASE: 1597 case MSR_KERNEL_GS_BASE: 1598 case MSR_CSTAR: 1599 case MSR_LSTAR: 1600 if (is_noncanonical_address(data, vcpu)) 1601 return 1; 1602 break; 1603 case MSR_IA32_SYSENTER_EIP: 1604 case MSR_IA32_SYSENTER_ESP: 1605 /* 1606 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if 1607 * non-canonical address is written on Intel but not on 1608 * AMD (which ignores the top 32-bits, because it does 1609 * not implement 64-bit SYSENTER). 1610 * 1611 * 64-bit code should hence be able to write a non-canonical 1612 * value on AMD. Making the address canonical ensures that 1613 * vmentry does not fail on Intel after writing a non-canonical 1614 * value, and that something deterministic happens if the guest 1615 * invokes 64-bit SYSENTER. 1616 */ 1617 data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); 1618 } 1619 1620 msr.data = data; 1621 msr.index = index; 1622 msr.host_initiated = host_initiated; 1623 1624 return static_call(kvm_x86_set_msr)(vcpu, &msr); 1625 } 1626 1627 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, 1628 u32 index, u64 data, bool host_initiated) 1629 { 1630 int ret = __kvm_set_msr(vcpu, index, data, host_initiated); 1631 1632 if (ret == KVM_MSR_RET_INVALID) 1633 if (kvm_msr_ignored_check(index, data, true)) 1634 ret = 0; 1635 1636 return ret; 1637 } 1638 1639 /* 1640 * Read the MSR specified by @index into @data. Select MSR specific fault 1641 * checks are bypassed if @host_initiated is %true. 1642 * Returns 0 on success, non-0 otherwise. 1643 * Assumes vcpu_load() was already called. 
1644 */ 1645 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1646 bool host_initiated) 1647 { 1648 struct msr_data msr; 1649 int ret; 1650 1651 if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1652 return KVM_MSR_RET_FILTERED; 1653 1654 msr.index = index; 1655 msr.host_initiated = host_initiated; 1656 1657 ret = static_call(kvm_x86_get_msr)(vcpu, &msr); 1658 if (!ret) 1659 *data = msr.data; 1660 return ret; 1661 } 1662 1663 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1664 u32 index, u64 *data, bool host_initiated) 1665 { 1666 int ret = __kvm_get_msr(vcpu, index, data, host_initiated); 1667 1668 if (ret == KVM_MSR_RET_INVALID) { 1669 /* Unconditionally clear *data for simplicity */ 1670 *data = 0; 1671 if (kvm_msr_ignored_check(index, 0, false)) 1672 ret = 0; 1673 } 1674 1675 return ret; 1676 } 1677 1678 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1679 { 1680 return kvm_get_msr_ignored_check(vcpu, index, data, false); 1681 } 1682 EXPORT_SYMBOL_GPL(kvm_get_msr); 1683 1684 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 1685 { 1686 return kvm_set_msr_ignored_check(vcpu, index, data, false); 1687 } 1688 EXPORT_SYMBOL_GPL(kvm_set_msr); 1689 1690 static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) 1691 { 1692 int err = vcpu->run->msr.error; 1693 if (!err) { 1694 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); 1695 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); 1696 } 1697 1698 return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); 1699 } 1700 1701 static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) 1702 { 1703 return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); 1704 } 1705 1706 static u64 kvm_msr_reason(int r) 1707 { 1708 switch (r) { 1709 case KVM_MSR_RET_INVALID: 1710 return KVM_MSR_EXIT_REASON_UNKNOWN; 1711 case KVM_MSR_RET_FILTERED: 1712 return KVM_MSR_EXIT_REASON_FILTER; 1713 default: 1714 return KVM_MSR_EXIT_REASON_INVAL; 1715 } 1716 } 1717 1718 static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, 1719 u32 exit_reason, u64 data, 1720 int (*completion)(struct kvm_vcpu *vcpu), 1721 int r) 1722 { 1723 u64 msr_reason = kvm_msr_reason(r); 1724 1725 /* Check if the user wanted to know about this MSR fault */ 1726 if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) 1727 return 0; 1728 1729 vcpu->run->exit_reason = exit_reason; 1730 vcpu->run->msr.error = 0; 1731 memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); 1732 vcpu->run->msr.reason = msr_reason; 1733 vcpu->run->msr.index = index; 1734 vcpu->run->msr.data = data; 1735 vcpu->arch.complete_userspace_io = completion; 1736 1737 return 1; 1738 } 1739 1740 static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) 1741 { 1742 return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, 1743 complete_emulated_rdmsr, r); 1744 } 1745 1746 static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) 1747 { 1748 return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, 1749 complete_emulated_wrmsr, r); 1750 } 1751 1752 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) 1753 { 1754 u32 ecx = kvm_rcx_read(vcpu); 1755 u64 data; 1756 int r; 1757 1758 r = kvm_get_msr(vcpu, ecx, &data); 1759 1760 /* MSR read failed? 
See if we should ask user space */ 1761 if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { 1762 /* Bounce to user space */ 1763 return 0; 1764 } 1765 1766 if (!r) { 1767 trace_kvm_msr_read(ecx, data); 1768 1769 kvm_rax_write(vcpu, data & -1u); 1770 kvm_rdx_write(vcpu, (data >> 32) & -1u); 1771 } else { 1772 trace_kvm_msr_read_ex(ecx); 1773 } 1774 1775 return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); 1776 } 1777 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); 1778 1779 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) 1780 { 1781 u32 ecx = kvm_rcx_read(vcpu); 1782 u64 data = kvm_read_edx_eax(vcpu); 1783 int r; 1784 1785 r = kvm_set_msr(vcpu, ecx, data); 1786 1787 /* MSR write failed? See if we should ask user space */ 1788 if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) 1789 /* Bounce to user space */ 1790 return 0; 1791 1792 /* Signal all other negative errors to userspace */ 1793 if (r < 0) 1794 return r; 1795 1796 if (!r) 1797 trace_kvm_msr_write(ecx, data); 1798 else 1799 trace_kvm_msr_write_ex(ecx, data); 1800 1801 return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); 1802 } 1803 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); 1804 1805 int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) 1806 { 1807 return kvm_skip_emulated_instruction(vcpu); 1808 } 1809 EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); 1810 1811 int kvm_emulate_invd(struct kvm_vcpu *vcpu) 1812 { 1813 /* Treat an INVD instruction as a NOP and just skip it. */ 1814 return kvm_emulate_as_nop(vcpu); 1815 } 1816 EXPORT_SYMBOL_GPL(kvm_emulate_invd); 1817 1818 int kvm_emulate_mwait(struct kvm_vcpu *vcpu) 1819 { 1820 pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); 1821 return kvm_emulate_as_nop(vcpu); 1822 } 1823 EXPORT_SYMBOL_GPL(kvm_emulate_mwait); 1824 1825 int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) 1826 { 1827 kvm_queue_exception(vcpu, UD_VECTOR); 1828 return 1; 1829 } 1830 EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); 1831 1832 int kvm_emulate_monitor(struct kvm_vcpu *vcpu) 1833 { 1834 pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); 1835 return kvm_emulate_as_nop(vcpu); 1836 } 1837 EXPORT_SYMBOL_GPL(kvm_emulate_monitor); 1838 1839 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) 1840 { 1841 xfer_to_guest_mode_prepare(); 1842 return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || 1843 xfer_to_guest_mode_work_pending(); 1844 } 1845 1846 /* 1847 * The fast path for frequent and performance sensitive wrmsr emulation, 1848 * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces 1849 * the latency of virtual IPI by avoiding the expensive bits of transitioning 1850 * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the 1851 * other cases which must be called after interrupts are enabled on the host. 
1852 */ 1853 static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) 1854 { 1855 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) 1856 return 1; 1857 1858 if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && 1859 ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && 1860 ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && 1861 ((u32)(data >> 32) != X2APIC_BROADCAST)) { 1862 1863 data &= ~(1 << 12); 1864 kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); 1865 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); 1866 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); 1867 trace_kvm_apic_write(APIC_ICR, (u32)data); 1868 return 0; 1869 } 1870 1871 return 1; 1872 } 1873 1874 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) 1875 { 1876 if (!kvm_can_use_hv_timer(vcpu)) 1877 return 1; 1878 1879 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1880 return 0; 1881 } 1882 1883 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) 1884 { 1885 u32 msr = kvm_rcx_read(vcpu); 1886 u64 data; 1887 fastpath_t ret = EXIT_FASTPATH_NONE; 1888 1889 switch (msr) { 1890 case APIC_BASE_MSR + (APIC_ICR >> 4): 1891 data = kvm_read_edx_eax(vcpu); 1892 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { 1893 kvm_skip_emulated_instruction(vcpu); 1894 ret = EXIT_FASTPATH_EXIT_HANDLED; 1895 } 1896 break; 1897 case MSR_IA32_TSC_DEADLINE: 1898 data = kvm_read_edx_eax(vcpu); 1899 if (!handle_fastpath_set_tscdeadline(vcpu, data)) { 1900 kvm_skip_emulated_instruction(vcpu); 1901 ret = EXIT_FASTPATH_REENTER_GUEST; 1902 } 1903 break; 1904 default: 1905 break; 1906 } 1907 1908 if (ret != EXIT_FASTPATH_NONE) 1909 trace_kvm_msr_write(msr, data); 1910 1911 return ret; 1912 } 1913 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); 1914 1915 /* 1916 * Adapt set_msr() to msr_io()'s calling convention 1917 */ 1918 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1919 { 1920 return kvm_get_msr_ignored_check(vcpu, index, data, true); 1921 } 1922 1923 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1924 { 1925 return kvm_set_msr_ignored_check(vcpu, index, *data, true); 1926 } 1927 1928 #ifdef CONFIG_X86_64 1929 struct pvclock_clock { 1930 int vclock_mode; 1931 u64 cycle_last; 1932 u64 mask; 1933 u32 mult; 1934 u32 shift; 1935 u64 base_cycles; 1936 u64 offset; 1937 }; 1938 1939 struct pvclock_gtod_data { 1940 seqcount_t seq; 1941 1942 struct pvclock_clock clock; /* extract of a clocksource struct */ 1943 struct pvclock_clock raw_clock; /* extract of a clocksource struct */ 1944 1945 ktime_t offs_boot; 1946 u64 wall_time_sec; 1947 }; 1948 1949 static struct pvclock_gtod_data pvclock_gtod_data; 1950 1951 static void update_pvclock_gtod(struct timekeeper *tk) 1952 { 1953 struct pvclock_gtod_data *vdata = &pvclock_gtod_data; 1954 1955 write_seqcount_begin(&vdata->seq); 1956 1957 /* copy pvclock gtod data */ 1958 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; 1959 vdata->clock.cycle_last = tk->tkr_mono.cycle_last; 1960 vdata->clock.mask = tk->tkr_mono.mask; 1961 vdata->clock.mult = tk->tkr_mono.mult; 1962 vdata->clock.shift = tk->tkr_mono.shift; 1963 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; 1964 vdata->clock.offset = tk->tkr_mono.base; 1965 1966 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; 1967 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; 1968 vdata->raw_clock.mask = tk->tkr_raw.mask; 1969 vdata->raw_clock.mult = tk->tkr_raw.mult; 1970 
vdata->raw_clock.shift = tk->tkr_raw.shift; 1971 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; 1972 vdata->raw_clock.offset = tk->tkr_raw.base; 1973 1974 vdata->wall_time_sec = tk->xtime_sec; 1975 1976 vdata->offs_boot = tk->offs_boot; 1977 1978 write_seqcount_end(&vdata->seq); 1979 } 1980 1981 static s64 get_kvmclock_base_ns(void) 1982 { 1983 /* Count up from boot time, but with the frequency of the raw clock. */ 1984 return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); 1985 } 1986 #else 1987 static s64 get_kvmclock_base_ns(void) 1988 { 1989 /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ 1990 return ktime_get_boottime_ns(); 1991 } 1992 #endif 1993 1994 void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) 1995 { 1996 int version; 1997 int r; 1998 struct pvclock_wall_clock wc; 1999 u32 wc_sec_hi; 2000 u64 wall_nsec; 2001 2002 if (!wall_clock) 2003 return; 2004 2005 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); 2006 if (r) 2007 return; 2008 2009 if (version & 1) 2010 ++version; /* first time write, random junk */ 2011 2012 ++version; 2013 2014 if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) 2015 return; 2016 2017 /* 2018 * The guest calculates current wall clock time by adding 2019 * system time (updated by kvm_guest_time_update below) to the 2020 * wall clock specified here. We do the reverse here. 2021 */ 2022 wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); 2023 2024 wc.nsec = do_div(wall_nsec, 1000000000); 2025 wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ 2026 wc.version = version; 2027 2028 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 2029 2030 if (sec_hi_ofs) { 2031 wc_sec_hi = wall_nsec >> 32; 2032 kvm_write_guest(kvm, wall_clock + sec_hi_ofs, 2033 &wc_sec_hi, sizeof(wc_sec_hi)); 2034 } 2035 2036 version++; 2037 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 2038 } 2039 2040 static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, 2041 bool old_msr, bool host_initiated) 2042 { 2043 struct kvm_arch *ka = &vcpu->kvm->arch; 2044 2045 if (vcpu->vcpu_id == 0 && !host_initiated) { 2046 if (ka->boot_vcpu_runs_old_kvmclock != old_msr) 2047 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 2048 2049 ka->boot_vcpu_runs_old_kvmclock = old_msr; 2050 } 2051 2052 vcpu->arch.time = system_time; 2053 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 2054 2055 /* we verify if the enable bit is set... 
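 * (bit 0 of the MSR value). The remaining bits, masked with ~1ULL below,
 * are the guest physical address of the pvclock_vcpu_time_info structure
 * that kvm_guest_time_update() later fills in.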
*/ 2056 vcpu->arch.pv_time_enabled = false; 2057 if (!(system_time & 1)) 2058 return; 2059 2060 if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, 2061 &vcpu->arch.pv_time, system_time & ~1ULL, 2062 sizeof(struct pvclock_vcpu_time_info))) 2063 vcpu->arch.pv_time_enabled = true; 2064 2065 return; 2066 } 2067 2068 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 2069 { 2070 do_shl32_div32(dividend, divisor); 2071 return dividend; 2072 } 2073 2074 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, 2075 s8 *pshift, u32 *pmultiplier) 2076 { 2077 uint64_t scaled64; 2078 int32_t shift = 0; 2079 uint64_t tps64; 2080 uint32_t tps32; 2081 2082 tps64 = base_hz; 2083 scaled64 = scaled_hz; 2084 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { 2085 tps64 >>= 1; 2086 shift--; 2087 } 2088 2089 tps32 = (uint32_t)tps64; 2090 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { 2091 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) 2092 scaled64 >>= 1; 2093 else 2094 tps32 <<= 1; 2095 shift++; 2096 } 2097 2098 *pshift = shift; 2099 *pmultiplier = div_frac(scaled64, tps32); 2100 } 2101 2102 #ifdef CONFIG_X86_64 2103 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); 2104 #endif 2105 2106 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 2107 static unsigned long max_tsc_khz; 2108 2109 static u32 adjust_tsc_khz(u32 khz, s32 ppm) 2110 { 2111 u64 v = (u64)khz * (1000000 + ppm); 2112 do_div(v, 1000000); 2113 return v; 2114 } 2115 2116 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) 2117 { 2118 u64 ratio; 2119 2120 /* Guest TSC same frequency as host TSC? */ 2121 if (!scale) { 2122 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; 2123 return 0; 2124 } 2125 2126 /* TSC scaling supported? */ 2127 if (!kvm_has_tsc_control) { 2128 if (user_tsc_khz > tsc_khz) { 2129 vcpu->arch.tsc_catchup = 1; 2130 vcpu->arch.tsc_always_catchup = 1; 2131 return 0; 2132 } else { 2133 pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); 2134 return -1; 2135 } 2136 } 2137 2138 /* TSC scaling required - calculate ratio */ 2139 ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, 2140 user_tsc_khz, tsc_khz); 2141 2142 if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { 2143 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", 2144 user_tsc_khz); 2145 return -1; 2146 } 2147 2148 vcpu->arch.tsc_scaling_ratio = ratio; 2149 return 0; 2150 } 2151 2152 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) 2153 { 2154 u32 thresh_lo, thresh_hi; 2155 int use_scaling = 0; 2156 2157 /* tsc_khz can be zero if TSC calibration fails */ 2158 if (user_tsc_khz == 0) { 2159 /* set tsc_scaling_ratio to a safe value */ 2160 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; 2161 return -1; 2162 } 2163 2164 /* Compute a scale to convert nanoseconds in TSC cycles */ 2165 kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, 2166 &vcpu->arch.virtual_tsc_shift, 2167 &vcpu->arch.virtual_tsc_mult); 2168 vcpu->arch.virtual_tsc_khz = user_tsc_khz; 2169 2170 /* 2171 * Compute the variation in TSC rate which is acceptable 2172 * within the range of tolerance and decide if the 2173 * rate being applied is within that bounds of the hardware 2174 * rate. If so, no scaling or compensation need be done. 
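 *
 * For example (illustrative numbers only): with a host tsc_khz of
 * 2,000,000 and tsc_tolerance_ppm of 250, the window computed below is
 *   thresh_lo = 2000000 * (1000000 - 250) / 1000000 = 1999500 kHz
 *   thresh_hi = 2000000 * (1000000 + 250) / 1000000 = 2000500 kHz
 * so a guest configured for, say, 2,100,000 kHz falls outside it and
 * needs scaling or catchup.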
2175 */ 2176 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); 2177 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); 2178 if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { 2179 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); 2180 use_scaling = 1; 2181 } 2182 return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); 2183 } 2184 2185 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 2186 { 2187 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, 2188 vcpu->arch.virtual_tsc_mult, 2189 vcpu->arch.virtual_tsc_shift); 2190 tsc += vcpu->arch.this_tsc_write; 2191 return tsc; 2192 } 2193 2194 static inline int gtod_is_based_on_tsc(int mode) 2195 { 2196 return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; 2197 } 2198 2199 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 2200 { 2201 #ifdef CONFIG_X86_64 2202 bool vcpus_matched; 2203 struct kvm_arch *ka = &vcpu->kvm->arch; 2204 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 2205 2206 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 2207 atomic_read(&vcpu->kvm->online_vcpus)); 2208 2209 /* 2210 * Once the masterclock is enabled, always perform request in 2211 * order to update it. 2212 * 2213 * In order to enable masterclock, the host clocksource must be TSC 2214 * and the vcpus need to have matched TSCs. When that happens, 2215 * perform request to enable masterclock. 2216 */ 2217 if (ka->use_master_clock || 2218 (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) 2219 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 2220 2221 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, 2222 atomic_read(&vcpu->kvm->online_vcpus), 2223 ka->use_master_clock, gtod->clock.vclock_mode); 2224 #endif 2225 } 2226 2227 /* 2228 * Multiply tsc by a fixed point number represented by ratio. 2229 * 2230 * The most significant 64-N bits (mult) of ratio represent the 2231 * integral part of the fixed point number; the remaining N bits 2232 * (frac) represent the fractional part, ie. ratio represents a fixed 2233 * point number (mult + frac * 2^(-N)). 2234 * 2235 * N equals to kvm_tsc_scaling_ratio_frac_bits. 2236 */ 2237 static inline u64 __scale_tsc(u64 ratio, u64 tsc) 2238 { 2239 return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); 2240 } 2241 2242 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) 2243 { 2244 u64 _tsc = tsc; 2245 u64 ratio = vcpu->arch.tsc_scaling_ratio; 2246 2247 if (ratio != kvm_default_tsc_scaling_ratio) 2248 _tsc = __scale_tsc(ratio, tsc); 2249 2250 return _tsc; 2251 } 2252 EXPORT_SYMBOL_GPL(kvm_scale_tsc); 2253 2254 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2255 { 2256 u64 tsc; 2257 2258 tsc = kvm_scale_tsc(vcpu, rdtsc()); 2259 2260 return target_tsc - tsc; 2261 } 2262 2263 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2264 { 2265 return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); 2266 } 2267 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); 2268 2269 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 2270 { 2271 vcpu->arch.l1_tsc_offset = offset; 2272 vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset); 2273 } 2274 2275 static inline bool kvm_check_tsc_unstable(void) 2276 { 2277 #ifdef CONFIG_X86_64 2278 /* 2279 * TSC is marked unstable when we're running on Hyper-V, 2280 * 'TSC page' clocksource is good. 
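 * That is, when the host clocksource is the Hyper-V reference TSC page
 * (VDSO_CLOCKMODE_HVCLOCK), treat the TSC as usable here even though the
 * kernel flags it unstable.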
2281 */ 2282 if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) 2283 return false; 2284 #endif 2285 return check_tsc_unstable(); 2286 } 2287 2288 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) 2289 { 2290 struct kvm *kvm = vcpu->kvm; 2291 u64 offset, ns, elapsed; 2292 unsigned long flags; 2293 bool matched; 2294 bool already_matched; 2295 bool synchronizing = false; 2296 2297 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 2298 offset = kvm_compute_tsc_offset(vcpu, data); 2299 ns = get_kvmclock_base_ns(); 2300 elapsed = ns - kvm->arch.last_tsc_nsec; 2301 2302 if (vcpu->arch.virtual_tsc_khz) { 2303 if (data == 0) { 2304 /* 2305 * detection of vcpu initialization -- need to sync 2306 * with other vCPUs. This particularly helps to keep 2307 * kvm_clock stable after CPU hotplug 2308 */ 2309 synchronizing = true; 2310 } else { 2311 u64 tsc_exp = kvm->arch.last_tsc_write + 2312 nsec_to_cycles(vcpu, elapsed); 2313 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; 2314 /* 2315 * Special case: TSC write with a small delta (1 second) 2316 * of virtual cycle time against real time is 2317 * interpreted as an attempt to synchronize the CPU. 2318 */ 2319 synchronizing = data < tsc_exp + tsc_hz && 2320 data + tsc_hz > tsc_exp; 2321 } 2322 } 2323 2324 /* 2325 * For a reliable TSC, we can match TSC offsets, and for an unstable 2326 * TSC, we add elapsed time in this computation. We could let the 2327 * compensation code attempt to catch up if we fall behind, but 2328 * it's better to try to match offsets from the beginning. 2329 */ 2330 if (synchronizing && 2331 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { 2332 if (!kvm_check_tsc_unstable()) { 2333 offset = kvm->arch.cur_tsc_offset; 2334 } else { 2335 u64 delta = nsec_to_cycles(vcpu, elapsed); 2336 data += delta; 2337 offset = kvm_compute_tsc_offset(vcpu, data); 2338 } 2339 matched = true; 2340 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); 2341 } else { 2342 /* 2343 * We split periods of matched TSC writes into generations. 2344 * For each generation, we track the original measured 2345 * nanosecond time, offset, and write, so if TSCs are in 2346 * sync, we can match exact offset, and if not, we can match 2347 * exact software computation in compute_guest_tsc() 2348 * 2349 * These values are tracked in kvm->arch.cur_xxx variables. 2350 */ 2351 kvm->arch.cur_tsc_generation++; 2352 kvm->arch.cur_tsc_nsec = ns; 2353 kvm->arch.cur_tsc_write = data; 2354 kvm->arch.cur_tsc_offset = offset; 2355 matched = false; 2356 } 2357 2358 /* 2359 * We also track the most recent recorded KHZ, write and time to 2360 * allow the matching interval to be extended at each write.
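 * The matched/already_matched results computed above feed
 * nr_vcpus_matched_tsc below; kvm_track_tsc_matching() then decides
 * whether the per-VM masterclock can be (re)enabled.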
2361 */ 2362 kvm->arch.last_tsc_nsec = ns; 2363 kvm->arch.last_tsc_write = data; 2364 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 2365 2366 vcpu->arch.last_guest_tsc = data; 2367 2368 /* Keep track of which generation this VCPU has synchronized to */ 2369 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; 2370 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 2371 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 2372 2373 kvm_vcpu_write_tsc_offset(vcpu, offset); 2374 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 2375 2376 spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); 2377 if (!matched) { 2378 kvm->arch.nr_vcpus_matched_tsc = 0; 2379 } else if (!already_matched) { 2380 kvm->arch.nr_vcpus_matched_tsc++; 2381 } 2382 2383 kvm_track_tsc_matching(vcpu); 2384 spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); 2385 } 2386 2387 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, 2388 s64 adjustment) 2389 { 2390 u64 tsc_offset = vcpu->arch.l1_tsc_offset; 2391 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); 2392 } 2393 2394 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) 2395 { 2396 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) 2397 WARN_ON(adjustment < 0); 2398 adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); 2399 adjust_tsc_offset_guest(vcpu, adjustment); 2400 } 2401 2402 #ifdef CONFIG_X86_64 2403 2404 static u64 read_tsc(void) 2405 { 2406 u64 ret = (u64)rdtsc_ordered(); 2407 u64 last = pvclock_gtod_data.clock.cycle_last; 2408 2409 if (likely(ret >= last)) 2410 return ret; 2411 2412 /* 2413 * GCC likes to generate cmov here, but this branch is extremely 2414 * predictable (it's just a function of time and the likely is 2415 * very likely) and there's a data dependence, so force GCC 2416 * to generate a branch instead. I don't barrier() because 2417 * we don't actually need a barrier, and if this function 2418 * ever gets inlined it will generate worse code. 
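 * (The empty asm statement below emits no instructions; it merely keeps
 * the compiler from collapsing the branch back into a conditional move.)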
2419 */ 2420 asm volatile (""); 2421 return last; 2422 } 2423 2424 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, 2425 int *mode) 2426 { 2427 long v; 2428 u64 tsc_pg_val; 2429 2430 switch (clock->vclock_mode) { 2431 case VDSO_CLOCKMODE_HVCLOCK: 2432 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), 2433 tsc_timestamp); 2434 if (tsc_pg_val != U64_MAX) { 2435 /* TSC page valid */ 2436 *mode = VDSO_CLOCKMODE_HVCLOCK; 2437 v = (tsc_pg_val - clock->cycle_last) & 2438 clock->mask; 2439 } else { 2440 /* TSC page invalid */ 2441 *mode = VDSO_CLOCKMODE_NONE; 2442 } 2443 break; 2444 case VDSO_CLOCKMODE_TSC: 2445 *mode = VDSO_CLOCKMODE_TSC; 2446 *tsc_timestamp = read_tsc(); 2447 v = (*tsc_timestamp - clock->cycle_last) & 2448 clock->mask; 2449 break; 2450 default: 2451 *mode = VDSO_CLOCKMODE_NONE; 2452 } 2453 2454 if (*mode == VDSO_CLOCKMODE_NONE) 2455 *tsc_timestamp = v = 0; 2456 2457 return v * clock->mult; 2458 } 2459 2460 static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) 2461 { 2462 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 2463 unsigned long seq; 2464 int mode; 2465 u64 ns; 2466 2467 do { 2468 seq = read_seqcount_begin(>od->seq); 2469 ns = gtod->raw_clock.base_cycles; 2470 ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); 2471 ns >>= gtod->raw_clock.shift; 2472 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); 2473 } while (unlikely(read_seqcount_retry(>od->seq, seq))); 2474 *t = ns; 2475 2476 return mode; 2477 } 2478 2479 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) 2480 { 2481 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 2482 unsigned long seq; 2483 int mode; 2484 u64 ns; 2485 2486 do { 2487 seq = read_seqcount_begin(>od->seq); 2488 ts->tv_sec = gtod->wall_time_sec; 2489 ns = gtod->clock.base_cycles; 2490 ns += vgettsc(>od->clock, tsc_timestamp, &mode); 2491 ns >>= gtod->clock.shift; 2492 } while (unlikely(read_seqcount_retry(>od->seq, seq))); 2493 2494 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); 2495 ts->tv_nsec = ns; 2496 2497 return mode; 2498 } 2499 2500 /* returns true if host is using TSC based clocksource */ 2501 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) 2502 { 2503 /* checked again under seqlock below */ 2504 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) 2505 return false; 2506 2507 return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, 2508 tsc_timestamp)); 2509 } 2510 2511 /* returns true if host is using TSC based clocksource */ 2512 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, 2513 u64 *tsc_timestamp) 2514 { 2515 /* checked again under seqlock below */ 2516 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) 2517 return false; 2518 2519 return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); 2520 } 2521 #endif 2522 2523 /* 2524 * 2525 * Assuming a stable TSC across physical CPUS, and a stable TSC 2526 * across virtual CPUs, the following condition is possible. 2527 * Each numbered line represents an event visible to both 2528 * CPUs at the next numbered event. 2529 * 2530 * "timespecX" represents host monotonic time. "tscX" represents 2531 * RDTSC value. 2532 * 2533 * VCPU0 on CPU0 | VCPU1 on CPU1 2534 * 2535 * 1. read timespec0,tsc0 2536 * 2. | timespec1 = timespec0 + N 2537 * | tsc1 = tsc0 + M 2538 * 3. transition to guest | transition to guest 2539 * 4. ret0 = timespec0 + (rdtsc - tsc0) | 2540 * 5. 
| ret1 = timespec1 + (rdtsc - tsc1) 2541 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) 2542 * 2543 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: 2544 * 2545 * - ret0 < ret1 2546 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) 2547 * ... 2548 * - 0 < N - M => M < N 2549 * 2550 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not 2551 * always the case (the difference between two distinct xtime instances 2552 * might be smaller than the difference between corresponding TSC reads, 2553 * when updating guest vcpus pvclock areas). 2554 * 2555 * To avoid that problem, do not allow visibility of distinct 2556 * system_timestamp/tsc_timestamp values simultaneously: use a master 2557 * copy of host monotonic time values. Update that master copy 2558 * in lockstep. 2559 * 2560 * Rely on synchronization of host TSCs and guest TSCs for monotonicity. 2561 * 2562 */ 2563 2564 static void pvclock_update_vm_gtod_copy(struct kvm *kvm) 2565 { 2566 #ifdef CONFIG_X86_64 2567 struct kvm_arch *ka = &kvm->arch; 2568 int vclock_mode; 2569 bool host_tsc_clocksource, vcpus_matched; 2570 2571 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 2572 atomic_read(&kvm->online_vcpus)); 2573 2574 /* 2575 * If the host uses TSC clock, then passthrough TSC as stable 2576 * to the guest. 2577 */ 2578 host_tsc_clocksource = kvm_get_time_and_clockread( 2579 &ka->master_kernel_ns, 2580 &ka->master_cycle_now); 2581 2582 ka->use_master_clock = host_tsc_clocksource && vcpus_matched 2583 && !ka->backwards_tsc_observed 2584 && !ka->boot_vcpu_runs_old_kvmclock; 2585 2586 if (ka->use_master_clock) 2587 atomic_set(&kvm_guest_has_master_clock, 1); 2588 2589 vclock_mode = pvclock_gtod_data.clock.vclock_mode; 2590 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, 2591 vcpus_matched); 2592 #endif 2593 } 2594 2595 void kvm_make_mclock_inprogress_request(struct kvm *kvm) 2596 { 2597 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 2598 } 2599 2600 static void kvm_gen_update_masterclock(struct kvm *kvm) 2601 { 2602 #ifdef CONFIG_X86_64 2603 int i; 2604 struct kvm_vcpu *vcpu; 2605 struct kvm_arch *ka = &kvm->arch; 2606 unsigned long flags; 2607 2608 kvm_hv_invalidate_tsc_page(kvm); 2609 2610 kvm_make_mclock_inprogress_request(kvm); 2611 2612 /* no guest entries from this point */ 2613 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 2614 pvclock_update_vm_gtod_copy(kvm); 2615 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2616 2617 kvm_for_each_vcpu(i, vcpu, kvm) 2618 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2619 2620 /* guest entries allowed */ 2621 kvm_for_each_vcpu(i, vcpu, kvm) 2622 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); 2623 #endif 2624 } 2625 2626 u64 get_kvmclock_ns(struct kvm *kvm) 2627 { 2628 struct kvm_arch *ka = &kvm->arch; 2629 struct pvclock_vcpu_time_info hv_clock; 2630 unsigned long flags; 2631 u64 ret; 2632 2633 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 2634 if (!ka->use_master_clock) { 2635 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2636 return get_kvmclock_base_ns() + ka->kvmclock_offset; 2637 } 2638 2639 hv_clock.tsc_timestamp = ka->master_cycle_now; 2640 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; 2641 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2642 2643 /* both __this_cpu_read() and rdtsc() should be on the same cpu */ 2644 get_cpu(); 2645 2646 if (__this_cpu_read(cpu_tsc_khz)) { 2647 kvm_get_time_scale(NSEC_PER_SEC,
__this_cpu_read(cpu_tsc_khz) * 1000LL, 2648 &hv_clock.tsc_shift, 2649 &hv_clock.tsc_to_system_mul); 2650 ret = __pvclock_read_cycles(&hv_clock, rdtsc()); 2651 } else 2652 ret = get_kvmclock_base_ns() + ka->kvmclock_offset; 2653 2654 put_cpu(); 2655 2656 return ret; 2657 } 2658 2659 static void kvm_setup_pvclock_page(struct kvm_vcpu *v, 2660 struct gfn_to_hva_cache *cache, 2661 unsigned int offset) 2662 { 2663 struct kvm_vcpu_arch *vcpu = &v->arch; 2664 struct pvclock_vcpu_time_info guest_hv_clock; 2665 2666 if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, 2667 &guest_hv_clock, offset, sizeof(guest_hv_clock)))) 2668 return; 2669 2670 /* This VCPU is paused, but it's legal for a guest to read another 2671 * VCPU's kvmclock, so we really have to follow the specification where 2672 * it says that version is odd if data is being modified, and even after 2673 * it is consistent. 2674 * 2675 * Version field updates must be kept separate. This is because 2676 * kvm_write_guest_cached might use a "rep movs" instruction, and 2677 * writes within a string instruction are weakly ordered. So there 2678 * are three writes overall. 2679 * 2680 * As a small optimization, only write the version field in the first 2681 * and third write. The vcpu->pv_time cache is still valid, because the 2682 * version field is the first in the struct. 2683 */ 2684 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); 2685 2686 if (guest_hv_clock.version & 1) 2687 ++guest_hv_clock.version; /* first time write, random junk */ 2688 2689 vcpu->hv_clock.version = guest_hv_clock.version + 1; 2690 kvm_write_guest_offset_cached(v->kvm, cache, 2691 &vcpu->hv_clock, offset, 2692 sizeof(vcpu->hv_clock.version)); 2693 2694 smp_wmb(); 2695 2696 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ 2697 vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); 2698 2699 if (vcpu->pvclock_set_guest_stopped_request) { 2700 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; 2701 vcpu->pvclock_set_guest_stopped_request = false; 2702 } 2703 2704 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); 2705 2706 kvm_write_guest_offset_cached(v->kvm, cache, 2707 &vcpu->hv_clock, offset, 2708 sizeof(vcpu->hv_clock)); 2709 2710 smp_wmb(); 2711 2712 vcpu->hv_clock.version++; 2713 kvm_write_guest_offset_cached(v->kvm, cache, 2714 &vcpu->hv_clock, offset, 2715 sizeof(vcpu->hv_clock.version)); 2716 } 2717 2718 static int kvm_guest_time_update(struct kvm_vcpu *v) 2719 { 2720 unsigned long flags, tgt_tsc_khz; 2721 struct kvm_vcpu_arch *vcpu = &v->arch; 2722 struct kvm_arch *ka = &v->kvm->arch; 2723 s64 kernel_ns; 2724 u64 tsc_timestamp, host_tsc; 2725 u8 pvclock_flags; 2726 bool use_master_clock; 2727 2728 kernel_ns = 0; 2729 host_tsc = 0; 2730 2731 /* 2732 * If the host uses TSC clock, then passthrough TSC as stable 2733 * to the guest. 
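 * When the masterclock is in use, the (master_cycle_now, master_kernel_ns)
 * pair captured by pvclock_update_vm_gtod_copy() is used instead of a fresh
 * rdtsc()/get_kvmclock_base_ns() reading, so every vCPU publishes a kvmclock
 * derived from the same host snapshot.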
2734 */ 2735 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 2736 use_master_clock = ka->use_master_clock; 2737 if (use_master_clock) { 2738 host_tsc = ka->master_cycle_now; 2739 kernel_ns = ka->master_kernel_ns; 2740 } 2741 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 2742 2743 /* Keep irq disabled to prevent changes to the clock */ 2744 local_irq_save(flags); 2745 tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); 2746 if (unlikely(tgt_tsc_khz == 0)) { 2747 local_irq_restore(flags); 2748 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 2749 return 1; 2750 } 2751 if (!use_master_clock) { 2752 host_tsc = rdtsc(); 2753 kernel_ns = get_kvmclock_base_ns(); 2754 } 2755 2756 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); 2757 2758 /* 2759 * We may have to catch up the TSC to match elapsed wall clock 2760 * time for two reasons, even if kvmclock is used. 2761 * 1) CPU could have been running below the maximum TSC rate 2762 * 2) Broken TSC compensation resets the base at each VCPU 2763 * entry to avoid unknown leaps of TSC even when running 2764 * again on the same CPU. This may cause apparent elapsed 2765 * time to disappear, and the guest to stand still or run 2766 * very slowly. 2767 */ 2768 if (vcpu->tsc_catchup) { 2769 u64 tsc = compute_guest_tsc(v, kernel_ns); 2770 if (tsc > tsc_timestamp) { 2771 adjust_tsc_offset_guest(v, tsc - tsc_timestamp); 2772 tsc_timestamp = tsc; 2773 } 2774 } 2775 2776 local_irq_restore(flags); 2777 2778 /* With all the info we got, fill in the values */ 2779 2780 if (kvm_has_tsc_control) 2781 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); 2782 2783 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { 2784 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, 2785 &vcpu->hv_clock.tsc_shift, 2786 &vcpu->hv_clock.tsc_to_system_mul); 2787 vcpu->hw_tsc_khz = tgt_tsc_khz; 2788 } 2789 2790 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 2791 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 2792 vcpu->last_guest_tsc = tsc_timestamp; 2793 2794 /* If the host uses TSC clocksource, then it is stable */ 2795 pvclock_flags = 0; 2796 if (use_master_clock) 2797 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; 2798 2799 vcpu->hv_clock.flags = pvclock_flags; 2800 2801 if (vcpu->pv_time_enabled) 2802 kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); 2803 if (vcpu->xen.vcpu_info_set) 2804 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, 2805 offsetof(struct compat_vcpu_info, time)); 2806 if (vcpu->xen.vcpu_time_info_set) 2807 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); 2808 if (v == kvm_get_vcpu(v->kvm, 0)) 2809 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); 2810 return 0; 2811 } 2812 2813 /* 2814 * kvmclock updates which are isolated to a given vcpu, such as 2815 * vcpu->cpu migration, should not allow system_timestamp from 2816 * the rest of the vcpus to remain static. Otherwise ntp frequency 2817 * correction applies to one vcpu's system_timestamp but not 2818 * the others. 2819 * 2820 * So in those cases, request a kvmclock update for all vcpus. 2821 * We need to rate-limit these requests though, as they can 2822 * considerably slow guests that have a large number of vcpus. 2823 * The time for a remote vcpu to update its kvmclock is bound 2824 * by the delay we use to rate-limit the updates. 
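 * (KVMCLOCK_UPDATE_DELAY below sets that delay to 100 ms.)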
2825 */ 2826 2827 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) 2828 2829 static void kvmclock_update_fn(struct work_struct *work) 2830 { 2831 int i; 2832 struct delayed_work *dwork = to_delayed_work(work); 2833 struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 2834 kvmclock_update_work); 2835 struct kvm *kvm = container_of(ka, struct kvm, arch); 2836 struct kvm_vcpu *vcpu; 2837 2838 kvm_for_each_vcpu(i, vcpu, kvm) { 2839 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2840 kvm_vcpu_kick(vcpu); 2841 } 2842 } 2843 2844 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 2845 { 2846 struct kvm *kvm = v->kvm; 2847 2848 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 2849 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 2850 KVMCLOCK_UPDATE_DELAY); 2851 } 2852 2853 #define KVMCLOCK_SYNC_PERIOD (300 * HZ) 2854 2855 static void kvmclock_sync_fn(struct work_struct *work) 2856 { 2857 struct delayed_work *dwork = to_delayed_work(work); 2858 struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 2859 kvmclock_sync_work); 2860 struct kvm *kvm = container_of(ka, struct kvm, arch); 2861 2862 if (!kvmclock_periodic_sync) 2863 return; 2864 2865 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 2866 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 2867 KVMCLOCK_SYNC_PERIOD); 2868 } 2869 2870 /* 2871 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. 2872 */ 2873 static bool can_set_mci_status(struct kvm_vcpu *vcpu) 2874 { 2875 /* McStatusWrEn enabled? */ 2876 if (guest_cpuid_is_amd_or_hygon(vcpu)) 2877 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); 2878 2879 return false; 2880 } 2881 2882 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2883 { 2884 u64 mcg_cap = vcpu->arch.mcg_cap; 2885 unsigned bank_num = mcg_cap & 0xff; 2886 u32 msr = msr_info->index; 2887 u64 data = msr_info->data; 2888 2889 switch (msr) { 2890 case MSR_IA32_MCG_STATUS: 2891 vcpu->arch.mcg_status = data; 2892 break; 2893 case MSR_IA32_MCG_CTL: 2894 if (!(mcg_cap & MCG_CTL_P) && 2895 (data || !msr_info->host_initiated)) 2896 return 1; 2897 if (data != 0 && data != ~(u64)0) 2898 return 1; 2899 vcpu->arch.mcg_ctl = data; 2900 break; 2901 default: 2902 if (msr >= MSR_IA32_MC0_CTL && 2903 msr < MSR_IA32_MCx_CTL(bank_num)) { 2904 u32 offset = array_index_nospec( 2905 msr - MSR_IA32_MC0_CTL, 2906 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); 2907 2908 /* Only 0 or all 1s can be written to IA32_MCi_CTL. 2909 * Some Linux kernels, though, clear bit 10 in bank 4 to 2910 * work around a BIOS/GART TBL issue on AMD K8s; ignore 2911 * this to avoid an uncaught #GP in the guest. 2912 */ 2913 if ((offset & 0x3) == 0 && 2914 data != 0 && (data | (1 << 10)) != ~(u64)0) 2915 return -1; 2916 2917 /* MCi_STATUS */ 2918 if (!msr_info->host_initiated && 2919 (offset & 0x3) == 1 && data != 0) { 2920 if (!can_set_mci_status(vcpu)) 2921 return -1; 2922 } 2923 2924 vcpu->arch.mce_banks[offset] = data; 2925 break; 2926 } 2927 return 1; 2928 } 2929 return 0; 2930 } 2931 2932 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) 2933 { 2934 u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; 2935 2936 return (vcpu->arch.apf.msr_en_val & mask) == mask; 2937 } 2938 2939 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) 2940 { 2941 gpa_t gpa = data & ~0x3f; 2942 2943 /* Bits 4:5 are reserved, should be zero */ 2944 if (data & 0x30) 2945 return 1; 2946 2947 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && 2948 (data &
KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) 2949 return 1; 2950 2951 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && 2952 (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) 2953 return 1; 2954 2955 if (!lapic_in_kernel(vcpu)) 2956 return data ? 1 : 0; 2957 2958 vcpu->arch.apf.msr_en_val = data; 2959 2960 if (!kvm_pv_async_pf_enabled(vcpu)) { 2961 kvm_clear_async_pf_completion_queue(vcpu); 2962 kvm_async_pf_hash_reset(vcpu); 2963 return 0; 2964 } 2965 2966 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, 2967 sizeof(u64))) 2968 return 1; 2969 2970 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); 2971 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; 2972 2973 kvm_async_pf_wakeup_all(vcpu); 2974 2975 return 0; 2976 } 2977 2978 static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) 2979 { 2980 /* Bits 8-63 are reserved */ 2981 if (data >> 8) 2982 return 1; 2983 2984 if (!lapic_in_kernel(vcpu)) 2985 return 1; 2986 2987 vcpu->arch.apf.msr_int_val = data; 2988 2989 vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; 2990 2991 return 0; 2992 } 2993 2994 static void kvmclock_reset(struct kvm_vcpu *vcpu) 2995 { 2996 vcpu->arch.pv_time_enabled = false; 2997 vcpu->arch.time = 0; 2998 } 2999 3000 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) 3001 { 3002 ++vcpu->stat.tlb_flush; 3003 static_call(kvm_x86_tlb_flush_all)(vcpu); 3004 } 3005 3006 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) 3007 { 3008 ++vcpu->stat.tlb_flush; 3009 static_call(kvm_x86_tlb_flush_guest)(vcpu); 3010 } 3011 3012 static void record_steal_time(struct kvm_vcpu *vcpu) 3013 { 3014 struct kvm_host_map map; 3015 struct kvm_steal_time *st; 3016 3017 if (kvm_xen_msr_enabled(vcpu->kvm)) { 3018 kvm_xen_runstate_set_running(vcpu); 3019 return; 3020 } 3021 3022 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 3023 return; 3024 3025 /* -EAGAIN is returned in atomic context so we can just return. */ 3026 if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, 3027 &map, &vcpu->arch.st.cache, false)) 3028 return; 3029 3030 st = map.hva + 3031 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); 3032 3033 /* 3034 * Doing a TLB flush here, on the guest's behalf, can avoid 3035 * expensive IPIs. 
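 * The guest requests this by setting KVM_VCPU_FLUSH_TLB in st->preempted;
 * the xchg() below atomically clears the flag and reports whether a flush
 * was pending, in which case only the guest mappings are flushed.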
3036 */ 3037 if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { 3038 trace_kvm_pv_tlb_flush(vcpu->vcpu_id, 3039 st->preempted & KVM_VCPU_FLUSH_TLB); 3040 if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) 3041 kvm_vcpu_flush_tlb_guest(vcpu); 3042 } 3043 3044 vcpu->arch.st.preempted = 0; 3045 3046 if (st->version & 1) 3047 st->version += 1; /* first time write, random junk */ 3048 3049 st->version += 1; 3050 3051 smp_wmb(); 3052 3053 st->steal += current->sched_info.run_delay - 3054 vcpu->arch.st.last_steal; 3055 vcpu->arch.st.last_steal = current->sched_info.run_delay; 3056 3057 smp_wmb(); 3058 3059 st->version += 1; 3060 3061 kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); 3062 } 3063 3064 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3065 { 3066 bool pr = false; 3067 u32 msr = msr_info->index; 3068 u64 data = msr_info->data; 3069 3070 if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) 3071 return kvm_xen_write_hypercall_page(vcpu, data); 3072 3073 switch (msr) { 3074 case MSR_AMD64_NB_CFG: 3075 case MSR_IA32_UCODE_WRITE: 3076 case MSR_VM_HSAVE_PA: 3077 case MSR_AMD64_PATCH_LOADER: 3078 case MSR_AMD64_BU_CFG2: 3079 case MSR_AMD64_DC_CFG: 3080 case MSR_F15H_EX_CFG: 3081 break; 3082 3083 case MSR_IA32_UCODE_REV: 3084 if (msr_info->host_initiated) 3085 vcpu->arch.microcode_version = data; 3086 break; 3087 case MSR_IA32_ARCH_CAPABILITIES: 3088 if (!msr_info->host_initiated) 3089 return 1; 3090 vcpu->arch.arch_capabilities = data; 3091 break; 3092 case MSR_IA32_PERF_CAPABILITIES: { 3093 struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; 3094 3095 if (!msr_info->host_initiated) 3096 return 1; 3097 if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) 3098 return 1; 3099 if (data & ~msr_ent.data) 3100 return 1; 3101 3102 vcpu->arch.perf_capabilities = data; 3103 3104 return 0; 3105 } 3106 case MSR_EFER: 3107 return set_efer(vcpu, msr_info); 3108 case MSR_K7_HWCR: 3109 data &= ~(u64)0x40; /* ignore flush filter disable */ 3110 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 3111 data &= ~(u64)0x8; /* ignore TLB cache disable */ 3112 3113 /* Handle McStatusWrEn */ 3114 if (data == BIT_ULL(18)) { 3115 vcpu->arch.msr_hwcr = data; 3116 } else if (data != 0) { 3117 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 3118 data); 3119 return 1; 3120 } 3121 break; 3122 case MSR_FAM10H_MMIO_CONF_BASE: 3123 if (data != 0) { 3124 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 3125 "0x%llx\n", data); 3126 return 1; 3127 } 3128 break; 3129 case 0x200 ... 0x2ff: 3130 return kvm_mtrr_set_msr(vcpu, msr, data); 3131 case MSR_IA32_APICBASE: 3132 return kvm_set_apic_base(vcpu, msr_info); 3133 case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: 3134 return kvm_x2apic_msr_write(vcpu, msr, data); 3135 case MSR_IA32_TSC_DEADLINE: 3136 kvm_set_lapic_tscdeadline_msr(vcpu, data); 3137 break; 3138 case MSR_IA32_TSC_ADJUST: 3139 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { 3140 if (!msr_info->host_initiated) { 3141 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 3142 adjust_tsc_offset_guest(vcpu, adj); 3143 } 3144 vcpu->arch.ia32_tsc_adjust_msr = data; 3145 } 3146 break; 3147 case MSR_IA32_MISC_ENABLE: 3148 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && 3149 ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { 3150 if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) 3151 return 1; 3152 vcpu->arch.ia32_misc_enable_msr = data; 3153 kvm_update_cpuid_runtime(vcpu); 3154 } else { 3155 vcpu->arch.ia32_misc_enable_msr = data; 3156 } 3157 break; 3158 case MSR_IA32_SMBASE: 3159 if (!msr_info->host_initiated) 3160 return 1; 3161 vcpu->arch.smbase = data; 3162 break; 3163 case MSR_IA32_POWER_CTL: 3164 vcpu->arch.msr_ia32_power_ctl = data; 3165 break; 3166 case MSR_IA32_TSC: 3167 if (msr_info->host_initiated) { 3168 kvm_synchronize_tsc(vcpu, data); 3169 } else { 3170 u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; 3171 adjust_tsc_offset_guest(vcpu, adj); 3172 vcpu->arch.ia32_tsc_adjust_msr += adj; 3173 } 3174 break; 3175 case MSR_IA32_XSS: 3176 if (!msr_info->host_initiated && 3177 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 3178 return 1; 3179 /* 3180 * KVM supports exposing PT to the guest, but does not support 3181 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than 3182 * XSAVES/XRSTORS to save/restore PT MSRs. 3183 */ 3184 if (data & ~supported_xss) 3185 return 1; 3186 vcpu->arch.ia32_xss = data; 3187 break; 3188 case MSR_SMI_COUNT: 3189 if (!msr_info->host_initiated) 3190 return 1; 3191 vcpu->arch.smi_count = data; 3192 break; 3193 case MSR_KVM_WALL_CLOCK_NEW: 3194 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3195 return 1; 3196 3197 vcpu->kvm->arch.wall_clock = data; 3198 kvm_write_wall_clock(vcpu->kvm, data, 0); 3199 break; 3200 case MSR_KVM_WALL_CLOCK: 3201 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3202 return 1; 3203 3204 vcpu->kvm->arch.wall_clock = data; 3205 kvm_write_wall_clock(vcpu->kvm, data, 0); 3206 break; 3207 case MSR_KVM_SYSTEM_TIME_NEW: 3208 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3209 return 1; 3210 3211 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); 3212 break; 3213 case MSR_KVM_SYSTEM_TIME: 3214 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3215 return 1; 3216 3217 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); 3218 break; 3219 case MSR_KVM_ASYNC_PF_EN: 3220 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3221 return 1; 3222 3223 if (kvm_pv_enable_async_pf(vcpu, data)) 3224 return 1; 3225 break; 3226 case MSR_KVM_ASYNC_PF_INT: 3227 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 3228 return 1; 3229 3230 if (kvm_pv_enable_async_pf_int(vcpu, data)) 3231 return 1; 3232 break; 3233 case MSR_KVM_ASYNC_PF_ACK: 3234 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3235 return 1; 3236 if (data & 0x1) { 3237 vcpu->arch.apf.pageready_pending = false; 3238 kvm_check_async_pf_completion(vcpu); 3239 } 3240 break; 3241 case MSR_KVM_STEAL_TIME: 3242 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) 3243 return 1; 3244 3245 if (unlikely(!sched_info_on())) 3246 return 1; 3247 3248 if (data & KVM_STEAL_RESERVED_MASK) 3249 return 1; 3250 3251 vcpu->arch.st.msr_val = 
data; 3252 3253 if (!(data & KVM_MSR_ENABLED)) 3254 break; 3255 3256 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 3257 3258 break; 3259 case MSR_KVM_PV_EOI_EN: 3260 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) 3261 return 1; 3262 3263 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) 3264 return 1; 3265 break; 3266 3267 case MSR_KVM_POLL_CONTROL: 3268 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) 3269 return 1; 3270 3271 /* only enable bit supported */ 3272 if (data & (-1ULL << 1)) 3273 return 1; 3274 3275 vcpu->arch.msr_kvm_poll_control = data; 3276 break; 3277 3278 case MSR_IA32_MCG_CTL: 3279 case MSR_IA32_MCG_STATUS: 3280 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 3281 return set_msr_mce(vcpu, msr_info); 3282 3283 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 3284 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 3285 pr = true; 3286 fallthrough; 3287 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: 3288 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: 3289 if (kvm_pmu_is_valid_msr(vcpu, msr)) 3290 return kvm_pmu_set_msr(vcpu, msr_info); 3291 3292 if (pr || data != 0) 3293 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " 3294 "0x%x data 0x%llx\n", msr, data); 3295 break; 3296 case MSR_K7_CLK_CTL: 3297 /* 3298 * Ignore all writes to this no longer documented MSR. 3299 * Writes are only relevant for old K7 processors, 3300 * all pre-dating SVM, but a recommended workaround from 3301 * AMD for these chips. It is possible to specify the 3302 * affected processor models on the command line, hence 3303 * the need to ignore the workaround. 3304 */ 3305 break; 3306 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 3307 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: 3308 case HV_X64_MSR_SYNDBG_OPTIONS: 3309 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 3310 case HV_X64_MSR_CRASH_CTL: 3311 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: 3312 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 3313 case HV_X64_MSR_TSC_EMULATION_CONTROL: 3314 case HV_X64_MSR_TSC_EMULATION_STATUS: 3315 return kvm_hv_set_msr_common(vcpu, msr, data, 3316 msr_info->host_initiated); 3317 case MSR_IA32_BBL_CR_CTL3: 3318 /* Drop writes to this legacy MSR -- see rdmsr 3319 * counterpart for further detail. 
3320 */ 3321 if (report_ignored_msrs) 3322 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", 3323 msr, data); 3324 break; 3325 case MSR_AMD64_OSVW_ID_LENGTH: 3326 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3327 return 1; 3328 vcpu->arch.osvw.length = data; 3329 break; 3330 case MSR_AMD64_OSVW_STATUS: 3331 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3332 return 1; 3333 vcpu->arch.osvw.status = data; 3334 break; 3335 case MSR_PLATFORM_INFO: 3336 if (!msr_info->host_initiated || 3337 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && 3338 cpuid_fault_enabled(vcpu))) 3339 return 1; 3340 vcpu->arch.msr_platform_info = data; 3341 break; 3342 case MSR_MISC_FEATURES_ENABLES: 3343 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || 3344 (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && 3345 !supports_cpuid_fault(vcpu))) 3346 return 1; 3347 vcpu->arch.msr_misc_features_enables = data; 3348 break; 3349 default: 3350 if (kvm_pmu_is_valid_msr(vcpu, msr)) 3351 return kvm_pmu_set_msr(vcpu, msr_info); 3352 return KVM_MSR_RET_INVALID; 3353 } 3354 return 0; 3355 } 3356 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 3357 3358 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) 3359 { 3360 u64 data; 3361 u64 mcg_cap = vcpu->arch.mcg_cap; 3362 unsigned bank_num = mcg_cap & 0xff; 3363 3364 switch (msr) { 3365 case MSR_IA32_P5_MC_ADDR: 3366 case MSR_IA32_P5_MC_TYPE: 3367 data = 0; 3368 break; 3369 case MSR_IA32_MCG_CAP: 3370 data = vcpu->arch.mcg_cap; 3371 break; 3372 case MSR_IA32_MCG_CTL: 3373 if (!(mcg_cap & MCG_CTL_P) && !host) 3374 return 1; 3375 data = vcpu->arch.mcg_ctl; 3376 break; 3377 case MSR_IA32_MCG_STATUS: 3378 data = vcpu->arch.mcg_status; 3379 break; 3380 default: 3381 if (msr >= MSR_IA32_MC0_CTL && 3382 msr < MSR_IA32_MCx_CTL(bank_num)) { 3383 u32 offset = array_index_nospec( 3384 msr - MSR_IA32_MC0_CTL, 3385 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); 3386 3387 data = vcpu->arch.mce_banks[offset]; 3388 break; 3389 } 3390 return 1; 3391 } 3392 *pdata = data; 3393 return 0; 3394 } 3395 3396 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3397 { 3398 switch (msr_info->index) { 3399 case MSR_IA32_PLATFORM_ID: 3400 case MSR_IA32_EBL_CR_POWERON: 3401 case MSR_IA32_LASTBRANCHFROMIP: 3402 case MSR_IA32_LASTBRANCHTOIP: 3403 case MSR_IA32_LASTINTFROMIP: 3404 case MSR_IA32_LASTINTTOIP: 3405 case MSR_K8_SYSCFG: 3406 case MSR_K8_TSEG_ADDR: 3407 case MSR_K8_TSEG_MASK: 3408 case MSR_VM_HSAVE_PA: 3409 case MSR_K8_INT_PENDING_MSG: 3410 case MSR_AMD64_NB_CFG: 3411 case MSR_FAM10H_MMIO_CONF_BASE: 3412 case MSR_AMD64_BU_CFG2: 3413 case MSR_IA32_PERF_CTL: 3414 case MSR_AMD64_DC_CFG: 3415 case MSR_F15H_EX_CFG: 3416 /* 3417 * Intel Sandy Bridge CPUs must support the RAPL (running average power 3418 * limit) MSRs. Just return 0, as we do not want to expose the host 3419 * data here. Do not conditionalize this on CPUID, as KVM does not do 3420 * so for existing CPU-specific MSRs. 3421 */ 3422 case MSR_RAPL_POWER_UNIT: 3423 case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ 3424 case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ 3425 case MSR_PKG_ENERGY_STATUS: /* Total package */ 3426 case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ 3427 msr_info->data = 0; 3428 break; 3429 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: 3430 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 3431 return kvm_pmu_get_msr(vcpu, msr_info); 3432 if (!msr_info->host_initiated) 3433 return 1; 3434 msr_info->data = 0; 3435 break; 3436 case MSR_K7_EVNTSEL0 ... 
MSR_K7_EVNTSEL3: 3437 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 3438 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 3439 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: 3440 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 3441 return kvm_pmu_get_msr(vcpu, msr_info); 3442 msr_info->data = 0; 3443 break; 3444 case MSR_IA32_UCODE_REV: 3445 msr_info->data = vcpu->arch.microcode_version; 3446 break; 3447 case MSR_IA32_ARCH_CAPABILITIES: 3448 if (!msr_info->host_initiated && 3449 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) 3450 return 1; 3451 msr_info->data = vcpu->arch.arch_capabilities; 3452 break; 3453 case MSR_IA32_PERF_CAPABILITIES: 3454 if (!msr_info->host_initiated && 3455 !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) 3456 return 1; 3457 msr_info->data = vcpu->arch.perf_capabilities; 3458 break; 3459 case MSR_IA32_POWER_CTL: 3460 msr_info->data = vcpu->arch.msr_ia32_power_ctl; 3461 break; 3462 case MSR_IA32_TSC: { 3463 /* 3464 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset 3465 * even when not intercepted. AMD manual doesn't explicitly 3466 * state this but appears to behave the same. 3467 * 3468 * On userspace reads and writes, however, we unconditionally 3469 * return L1's TSC value to ensure backwards-compatible 3470 * behavior for migration. 3471 */ 3472 u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : 3473 vcpu->arch.tsc_offset; 3474 3475 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; 3476 break; 3477 } 3478 case MSR_MTRRcap: 3479 case 0x200 ... 0x2ff: 3480 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); 3481 case 0xcd: /* fsb frequency */ 3482 msr_info->data = 3; 3483 break; 3484 /* 3485 * MSR_EBC_FREQUENCY_ID 3486 * Conservative value valid for even the basic CPU models. 3487 * Models 0,1: 000 in bits 23:21 indicating a bus speed of 3488 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, 3489 * and 266MHz for model 3, or 4. Set Core Clock 3490 * Frequency to System Bus Frequency Ratio to 1 (bits 3491 * 31:24) even though these are only valid for CPU 3492 * models > 2, however guests may end up dividing or 3493 * multiplying by zero otherwise. 3494 */ 3495 case MSR_EBC_FREQUENCY_ID: 3496 msr_info->data = 1 << 24; 3497 break; 3498 case MSR_IA32_APICBASE: 3499 msr_info->data = kvm_get_apic_base(vcpu); 3500 break; 3501 case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: 3502 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); 3503 case MSR_IA32_TSC_DEADLINE: 3504 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); 3505 break; 3506 case MSR_IA32_TSC_ADJUST: 3507 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 3508 break; 3509 case MSR_IA32_MISC_ENABLE: 3510 msr_info->data = vcpu->arch.ia32_misc_enable_msr; 3511 break; 3512 case MSR_IA32_SMBASE: 3513 if (!msr_info->host_initiated) 3514 return 1; 3515 msr_info->data = vcpu->arch.smbase; 3516 break; 3517 case MSR_SMI_COUNT: 3518 msr_info->data = vcpu->arch.smi_count; 3519 break; 3520 case MSR_IA32_PERF_STATUS: 3521 /* TSC increment by tick */ 3522 msr_info->data = 1000ULL; 3523 /* CPU multiplier */ 3524 msr_info->data |= (((uint64_t)4ULL) << 40); 3525 break; 3526 case MSR_EFER: 3527 msr_info->data = vcpu->arch.efer; 3528 break; 3529 case MSR_KVM_WALL_CLOCK: 3530 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3531 return 1; 3532 3533 msr_info->data = vcpu->kvm->arch.wall_clock; 3534 break; 3535 case MSR_KVM_WALL_CLOCK_NEW: 3536 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3537 return 1; 3538 3539 msr_info->data = vcpu->kvm->arch.wall_clock; 3540 break; 3541 case MSR_KVM_SYSTEM_TIME: 3542 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 3543 return 1; 3544 3545 msr_info->data = vcpu->arch.time; 3546 break; 3547 case MSR_KVM_SYSTEM_TIME_NEW: 3548 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 3549 return 1; 3550 3551 msr_info->data = vcpu->arch.time; 3552 break; 3553 case MSR_KVM_ASYNC_PF_EN: 3554 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3555 return 1; 3556 3557 msr_info->data = vcpu->arch.apf.msr_en_val; 3558 break; 3559 case MSR_KVM_ASYNC_PF_INT: 3560 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 3561 return 1; 3562 3563 msr_info->data = vcpu->arch.apf.msr_int_val; 3564 break; 3565 case MSR_KVM_ASYNC_PF_ACK: 3566 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 3567 return 1; 3568 3569 msr_info->data = 0; 3570 break; 3571 case MSR_KVM_STEAL_TIME: 3572 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) 3573 return 1; 3574 3575 msr_info->data = vcpu->arch.st.msr_val; 3576 break; 3577 case MSR_KVM_PV_EOI_EN: 3578 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) 3579 return 1; 3580 3581 msr_info->data = vcpu->arch.pv_eoi.msr_val; 3582 break; 3583 case MSR_KVM_POLL_CONTROL: 3584 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) 3585 return 1; 3586 3587 msr_info->data = vcpu->arch.msr_kvm_poll_control; 3588 break; 3589 case MSR_IA32_P5_MC_ADDR: 3590 case MSR_IA32_P5_MC_TYPE: 3591 case MSR_IA32_MCG_CAP: 3592 case MSR_IA32_MCG_CTL: 3593 case MSR_IA32_MCG_STATUS: 3594 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 3595 return get_msr_mce(vcpu, msr_info->index, &msr_info->data, 3596 msr_info->host_initiated); 3597 case MSR_IA32_XSS: 3598 if (!msr_info->host_initiated && 3599 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 3600 return 1; 3601 msr_info->data = vcpu->arch.ia32_xss; 3602 break; 3603 case MSR_K7_CLK_CTL: 3604 /* 3605 * Provide expected ramp-up count for K7. All other 3606 * are set to zero, indicating minimum divisors for 3607 * every field. 3608 * 3609 * This prevents guest kernels on AMD host with CPU 3610 * type 6, model 8 and higher from exploding due to 3611 * the rdmsr failing. 3612 */ 3613 msr_info->data = 0x20000000; 3614 break; 3615 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 3616 case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: 3617 case HV_X64_MSR_SYNDBG_OPTIONS: 3618 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 3619 case HV_X64_MSR_CRASH_CTL: 3620 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: 3621 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 3622 case HV_X64_MSR_TSC_EMULATION_CONTROL: 3623 case HV_X64_MSR_TSC_EMULATION_STATUS: 3624 return kvm_hv_get_msr_common(vcpu, 3625 msr_info->index, &msr_info->data, 3626 msr_info->host_initiated); 3627 case MSR_IA32_BBL_CR_CTL3: 3628 /* This legacy MSR exists but isn't fully documented in current 3629 * silicon. It is however accessed by winxp in very narrow 3630 * scenarios where it sets bit #19, itself documented as 3631 * a "reserved" bit. Best effort attempt to source coherent 3632 * read data here should the balance of the register be 3633 * interpreted by the guest: 3634 * 3635 * L2 cache control register 3: 64GB range, 256KB size, 3636 * enabled, latency 0x1, configured 3637 */ 3638 msr_info->data = 0xbe702111; 3639 break; 3640 case MSR_AMD64_OSVW_ID_LENGTH: 3641 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3642 return 1; 3643 msr_info->data = vcpu->arch.osvw.length; 3644 break; 3645 case MSR_AMD64_OSVW_STATUS: 3646 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) 3647 return 1; 3648 msr_info->data = vcpu->arch.osvw.status; 3649 break; 3650 case MSR_PLATFORM_INFO: 3651 if (!msr_info->host_initiated && 3652 !vcpu->kvm->arch.guest_can_read_msr_platform_info) 3653 return 1; 3654 msr_info->data = vcpu->arch.msr_platform_info; 3655 break; 3656 case MSR_MISC_FEATURES_ENABLES: 3657 msr_info->data = vcpu->arch.msr_misc_features_enables; 3658 break; 3659 case MSR_K7_HWCR: 3660 msr_info->data = vcpu->arch.msr_hwcr; 3661 break; 3662 default: 3663 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 3664 return kvm_pmu_get_msr(vcpu, msr_info); 3665 return KVM_MSR_RET_INVALID; 3666 } 3667 return 0; 3668 } 3669 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 3670 3671 /* 3672 * Read or write a bunch of msrs. All parameters are kernel addresses. 3673 * 3674 * @return number of msrs set successfully. 3675 */ 3676 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 3677 struct kvm_msr_entry *entries, 3678 int (*do_msr)(struct kvm_vcpu *vcpu, 3679 unsigned index, u64 *data)) 3680 { 3681 int i; 3682 3683 for (i = 0; i < msrs->nmsrs; ++i) 3684 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 3685 break; 3686 3687 return i; 3688 } 3689 3690 /* 3691 * Read or write a bunch of msrs. Parameters are user addresses. 3692 * 3693 * @return number of msrs set successfully. 
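 *
 * The do_msr callback is one of do_get_msr()/do_set_msr() above, and
 * @writeback selects whether the (possibly updated) entries are copied
 * back to user space after the loop.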
3694 */ 3695 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 3696 int (*do_msr)(struct kvm_vcpu *vcpu, 3697 unsigned index, u64 *data), 3698 int writeback) 3699 { 3700 struct kvm_msrs msrs; 3701 struct kvm_msr_entry *entries; 3702 int r, n; 3703 unsigned size; 3704 3705 r = -EFAULT; 3706 if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) 3707 goto out; 3708 3709 r = -E2BIG; 3710 if (msrs.nmsrs >= MAX_IO_MSRS) 3711 goto out; 3712 3713 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 3714 entries = memdup_user(user_msrs->entries, size); 3715 if (IS_ERR(entries)) { 3716 r = PTR_ERR(entries); 3717 goto out; 3718 } 3719 3720 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 3721 if (r < 0) 3722 goto out_free; 3723 3724 r = -EFAULT; 3725 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 3726 goto out_free; 3727 3728 r = n; 3729 3730 out_free: 3731 kfree(entries); 3732 out: 3733 return r; 3734 } 3735 3736 static inline bool kvm_can_mwait_in_guest(void) 3737 { 3738 return boot_cpu_has(X86_FEATURE_MWAIT) && 3739 !boot_cpu_has_bug(X86_BUG_MONITOR) && 3740 boot_cpu_has(X86_FEATURE_ARAT); 3741 } 3742 3743 static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, 3744 struct kvm_cpuid2 __user *cpuid_arg) 3745 { 3746 struct kvm_cpuid2 cpuid; 3747 int r; 3748 3749 r = -EFAULT; 3750 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 3751 return r; 3752 3753 r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3754 if (r) 3755 return r; 3756 3757 r = -EFAULT; 3758 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) 3759 return r; 3760 3761 return 0; 3762 } 3763 3764 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) 3765 { 3766 int r = 0; 3767 3768 switch (ext) { 3769 case KVM_CAP_IRQCHIP: 3770 case KVM_CAP_HLT: 3771 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 3772 case KVM_CAP_SET_TSS_ADDR: 3773 case KVM_CAP_EXT_CPUID: 3774 case KVM_CAP_EXT_EMUL_CPUID: 3775 case KVM_CAP_CLOCKSOURCE: 3776 case KVM_CAP_PIT: 3777 case KVM_CAP_NOP_IO_DELAY: 3778 case KVM_CAP_MP_STATE: 3779 case KVM_CAP_SYNC_MMU: 3780 case KVM_CAP_USER_NMI: 3781 case KVM_CAP_REINJECT_CONTROL: 3782 case KVM_CAP_IRQ_INJECT_STATUS: 3783 case KVM_CAP_IOEVENTFD: 3784 case KVM_CAP_IOEVENTFD_NO_LENGTH: 3785 case KVM_CAP_PIT2: 3786 case KVM_CAP_PIT_STATE2: 3787 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 3788 case KVM_CAP_VCPU_EVENTS: 3789 case KVM_CAP_HYPERV: 3790 case KVM_CAP_HYPERV_VAPIC: 3791 case KVM_CAP_HYPERV_SPIN: 3792 case KVM_CAP_HYPERV_SYNIC: 3793 case KVM_CAP_HYPERV_SYNIC2: 3794 case KVM_CAP_HYPERV_VP_INDEX: 3795 case KVM_CAP_HYPERV_EVENTFD: 3796 case KVM_CAP_HYPERV_TLBFLUSH: 3797 case KVM_CAP_HYPERV_SEND_IPI: 3798 case KVM_CAP_HYPERV_CPUID: 3799 case KVM_CAP_SYS_HYPERV_CPUID: 3800 case KVM_CAP_PCI_SEGMENT: 3801 case KVM_CAP_DEBUGREGS: 3802 case KVM_CAP_X86_ROBUST_SINGLESTEP: 3803 case KVM_CAP_XSAVE: 3804 case KVM_CAP_ASYNC_PF: 3805 case KVM_CAP_ASYNC_PF_INT: 3806 case KVM_CAP_GET_TSC_KHZ: 3807 case KVM_CAP_KVMCLOCK_CTRL: 3808 case KVM_CAP_READONLY_MEM: 3809 case KVM_CAP_HYPERV_TIME: 3810 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 3811 case KVM_CAP_TSC_DEADLINE_TIMER: 3812 case KVM_CAP_DISABLE_QUIRKS: 3813 case KVM_CAP_SET_BOOT_CPU_ID: 3814 case KVM_CAP_SPLIT_IRQCHIP: 3815 case KVM_CAP_IMMEDIATE_EXIT: 3816 case KVM_CAP_PMU_EVENT_FILTER: 3817 case KVM_CAP_GET_MSR_FEATURES: 3818 case KVM_CAP_MSR_PLATFORM_INFO: 3819 case KVM_CAP_EXCEPTION_PAYLOAD: 3820 case KVM_CAP_SET_GUEST_DEBUG: 3821 case KVM_CAP_LAST_CPU: 3822 case KVM_CAP_X86_USER_SPACE_MSR: 3823 case KVM_CAP_X86_MSR_FILTER: 3824 case 
KVM_CAP_ENFORCE_PV_FEATURE_CPUID: 3825 #ifdef CONFIG_X86_SGX_KVM 3826 case KVM_CAP_SGX_ATTRIBUTE: 3827 #endif 3828 case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: 3829 r = 1; 3830 break; 3831 case KVM_CAP_SET_GUEST_DEBUG2: 3832 return KVM_GUESTDBG_VALID_MASK; 3833 #ifdef CONFIG_KVM_XEN 3834 case KVM_CAP_XEN_HVM: 3835 r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | 3836 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | 3837 KVM_XEN_HVM_CONFIG_SHARED_INFO; 3838 if (sched_info_on()) 3839 r |= KVM_XEN_HVM_CONFIG_RUNSTATE; 3840 break; 3841 #endif 3842 case KVM_CAP_SYNC_REGS: 3843 r = KVM_SYNC_X86_VALID_FIELDS; 3844 break; 3845 case KVM_CAP_ADJUST_CLOCK: 3846 r = KVM_CLOCK_TSC_STABLE; 3847 break; 3848 case KVM_CAP_X86_DISABLE_EXITS: 3849 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | 3850 KVM_X86_DISABLE_EXITS_CSTATE; 3851 if(kvm_can_mwait_in_guest()) 3852 r |= KVM_X86_DISABLE_EXITS_MWAIT; 3853 break; 3854 case KVM_CAP_X86_SMM: 3855 /* SMBASE is usually relocated above 1M on modern chipsets, 3856 * and SMM handlers might indeed rely on 4G segment limits, 3857 * so do not report SMM to be available if real mode is 3858 * emulated via vm86 mode. Still, do not go to great lengths 3859 * to avoid userspace's usage of the feature, because it is a 3860 * fringe case that is not enabled except via specific settings 3861 * of the module parameters. 3862 */ 3863 r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); 3864 break; 3865 case KVM_CAP_VAPIC: 3866 r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); 3867 break; 3868 case KVM_CAP_NR_VCPUS: 3869 r = KVM_SOFT_MAX_VCPUS; 3870 break; 3871 case KVM_CAP_MAX_VCPUS: 3872 r = KVM_MAX_VCPUS; 3873 break; 3874 case KVM_CAP_MAX_VCPU_ID: 3875 r = KVM_MAX_VCPU_ID; 3876 break; 3877 case KVM_CAP_PV_MMU: /* obsolete */ 3878 r = 0; 3879 break; 3880 case KVM_CAP_MCE: 3881 r = KVM_MAX_MCE_BANKS; 3882 break; 3883 case KVM_CAP_XCRS: 3884 r = boot_cpu_has(X86_FEATURE_XSAVE); 3885 break; 3886 case KVM_CAP_TSC_CONTROL: 3887 r = kvm_has_tsc_control; 3888 break; 3889 case KVM_CAP_X2APIC_API: 3890 r = KVM_X2APIC_API_VALID_FLAGS; 3891 break; 3892 case KVM_CAP_NESTED_STATE: 3893 r = kvm_x86_ops.nested_ops->get_state ? 
3894 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; 3895 break; 3896 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: 3897 r = kvm_x86_ops.enable_direct_tlbflush != NULL; 3898 break; 3899 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: 3900 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; 3901 break; 3902 case KVM_CAP_SMALLER_MAXPHYADDR: 3903 r = (int) allow_smaller_maxphyaddr; 3904 break; 3905 case KVM_CAP_STEAL_TIME: 3906 r = sched_info_on(); 3907 break; 3908 case KVM_CAP_X86_BUS_LOCK_EXIT: 3909 if (kvm_has_bus_lock_exit) 3910 r = KVM_BUS_LOCK_DETECTION_OFF | 3911 KVM_BUS_LOCK_DETECTION_EXIT; 3912 else 3913 r = 0; 3914 break; 3915 default: 3916 break; 3917 } 3918 return r; 3919 3920 } 3921 3922 long kvm_arch_dev_ioctl(struct file *filp, 3923 unsigned int ioctl, unsigned long arg) 3924 { 3925 void __user *argp = (void __user *)arg; 3926 long r; 3927 3928 switch (ioctl) { 3929 case KVM_GET_MSR_INDEX_LIST: { 3930 struct kvm_msr_list __user *user_msr_list = argp; 3931 struct kvm_msr_list msr_list; 3932 unsigned n; 3933 3934 r = -EFAULT; 3935 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) 3936 goto out; 3937 n = msr_list.nmsrs; 3938 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; 3939 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) 3940 goto out; 3941 r = -E2BIG; 3942 if (n < msr_list.nmsrs) 3943 goto out; 3944 r = -EFAULT; 3945 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 3946 num_msrs_to_save * sizeof(u32))) 3947 goto out; 3948 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 3949 &emulated_msrs, 3950 num_emulated_msrs * sizeof(u32))) 3951 goto out; 3952 r = 0; 3953 break; 3954 } 3955 case KVM_GET_SUPPORTED_CPUID: 3956 case KVM_GET_EMULATED_CPUID: { 3957 struct kvm_cpuid2 __user *cpuid_arg = argp; 3958 struct kvm_cpuid2 cpuid; 3959 3960 r = -EFAULT; 3961 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 3962 goto out; 3963 3964 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, 3965 ioctl); 3966 if (r) 3967 goto out; 3968 3969 r = -EFAULT; 3970 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) 3971 goto out; 3972 r = 0; 3973 break; 3974 } 3975 case KVM_X86_GET_MCE_CAP_SUPPORTED: 3976 r = -EFAULT; 3977 if (copy_to_user(argp, &kvm_mce_cap_supported, 3978 sizeof(kvm_mce_cap_supported))) 3979 goto out; 3980 r = 0; 3981 break; 3982 case KVM_GET_MSR_FEATURE_INDEX_LIST: { 3983 struct kvm_msr_list __user *user_msr_list = argp; 3984 struct kvm_msr_list msr_list; 3985 unsigned int n; 3986 3987 r = -EFAULT; 3988 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) 3989 goto out; 3990 n = msr_list.nmsrs; 3991 msr_list.nmsrs = num_msr_based_features; 3992 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) 3993 goto out; 3994 r = -E2BIG; 3995 if (n < msr_list.nmsrs) 3996 goto out; 3997 r = -EFAULT; 3998 if (copy_to_user(user_msr_list->indices, &msr_based_features, 3999 num_msr_based_features * sizeof(u32))) 4000 goto out; 4001 r = 0; 4002 break; 4003 } 4004 case KVM_GET_MSRS: 4005 r = msr_io(NULL, argp, do_get_msr_feature, 1); 4006 break; 4007 case KVM_GET_SUPPORTED_HV_CPUID: 4008 r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); 4009 break; 4010 default: 4011 r = -EINVAL; 4012 break; 4013 } 4014 out: 4015 return r; 4016 } 4017 4018 static void wbinvd_ipi(void *garbage) 4019 { 4020 wbinvd(); 4021 } 4022 4023 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 4024 { 4025 return kvm_arch_has_noncoherent_dma(vcpu->kvm); 4026 } 4027 4028 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 4029 { 4030 /* Address WBINVD may 
be executed by guest */ 4031 if (need_emulate_wbinvd(vcpu)) { 4032 if (static_call(kvm_x86_has_wbinvd_exit)()) 4033 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 4034 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 4035 smp_call_function_single(vcpu->cpu, 4036 wbinvd_ipi, NULL, 1); 4037 } 4038 4039 static_call(kvm_x86_vcpu_load)(vcpu, cpu); 4040 4041 /* Save host pkru register if supported */ 4042 vcpu->arch.host_pkru = read_pkru(); 4043 4044 /* Apply any externally detected TSC adjustments (due to suspend) */ 4045 if (unlikely(vcpu->arch.tsc_offset_adjustment)) { 4046 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); 4047 vcpu->arch.tsc_offset_adjustment = 0; 4048 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 4049 } 4050 4051 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { 4052 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 4053 rdtsc() - vcpu->arch.last_host_tsc; 4054 if (tsc_delta < 0) 4055 mark_tsc_unstable("KVM discovered backwards TSC"); 4056 4057 if (kvm_check_tsc_unstable()) { 4058 u64 offset = kvm_compute_tsc_offset(vcpu, 4059 vcpu->arch.last_guest_tsc); 4060 kvm_vcpu_write_tsc_offset(vcpu, offset); 4061 vcpu->arch.tsc_catchup = 1; 4062 } 4063 4064 if (kvm_lapic_hv_timer_in_use(vcpu)) 4065 kvm_lapic_restart_hv_timer(vcpu); 4066 4067 /* 4068 * On a host with synchronized TSC, there is no need to update 4069 * kvmclock on vcpu->cpu migration 4070 */ 4071 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 4072 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 4073 if (vcpu->cpu != cpu) 4074 kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); 4075 vcpu->cpu = cpu; 4076 } 4077 4078 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 4079 } 4080 4081 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) 4082 { 4083 struct kvm_host_map map; 4084 struct kvm_steal_time *st; 4085 4086 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 4087 return; 4088 4089 if (vcpu->arch.st.preempted) 4090 return; 4091 4092 if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, 4093 &vcpu->arch.st.cache, true)) 4094 return; 4095 4096 st = map.hva + 4097 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); 4098 4099 st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; 4100 4101 kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); 4102 } 4103 4104 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 4105 { 4106 int idx; 4107 4108 if (vcpu->preempted && !vcpu->arch.guest_state_protected) 4109 vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); 4110 4111 /* 4112 * Take the srcu lock as memslots will be accessed to check the gfn 4113 * cache generation against the memslots generation. 4114 */ 4115 idx = srcu_read_lock(&vcpu->kvm->srcu); 4116 if (kvm_xen_msr_enabled(vcpu->kvm)) 4117 kvm_xen_runstate_set_preempted(vcpu); 4118 else 4119 kvm_steal_time_set_preempted(vcpu); 4120 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4121 4122 static_call(kvm_x86_vcpu_put)(vcpu); 4123 vcpu->arch.last_host_tsc = rdtsc(); 4124 /* 4125 * If userspace has set any breakpoints or watchpoints, dr6 is restored 4126 * on every vmexit, but if not, we might have a stale dr6 from the 4127 * guest. do_debug expects dr6 to be cleared after it runs, do the same. 
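 * Hence dr6 is cleared unconditionally via set_debugreg() below.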
4128 */ 4129 set_debugreg(0, 6); 4130 } 4131 4132 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 4133 struct kvm_lapic_state *s) 4134 { 4135 if (vcpu->arch.apicv_active) 4136 static_call(kvm_x86_sync_pir_to_irr)(vcpu); 4137 4138 return kvm_apic_get_state(vcpu, s); 4139 } 4140 4141 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 4142 struct kvm_lapic_state *s) 4143 { 4144 int r; 4145 4146 r = kvm_apic_set_state(vcpu, s); 4147 if (r) 4148 return r; 4149 update_cr8_intercept(vcpu); 4150 4151 return 0; 4152 } 4153 4154 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) 4155 { 4156 /* 4157 * We can accept userspace's request for interrupt injection 4158 * as long as we have a place to store the interrupt number. 4159 * The actual injection will happen when the CPU is able to 4160 * deliver the interrupt. 4161 */ 4162 if (kvm_cpu_has_extint(vcpu)) 4163 return false; 4164 4165 /* Acknowledging ExtINT does not happen if LINT0 is masked. */ 4166 return (!lapic_in_kernel(vcpu) || 4167 kvm_apic_accept_pic_intr(vcpu)); 4168 } 4169 4170 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) 4171 { 4172 return kvm_arch_interrupt_allowed(vcpu) && 4173 kvm_cpu_accept_dm_intr(vcpu); 4174 } 4175 4176 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 4177 struct kvm_interrupt *irq) 4178 { 4179 if (irq->irq >= KVM_NR_INTERRUPTS) 4180 return -EINVAL; 4181 4182 if (!irqchip_in_kernel(vcpu->kvm)) { 4183 kvm_queue_interrupt(vcpu, irq->irq, false); 4184 kvm_make_request(KVM_REQ_EVENT, vcpu); 4185 return 0; 4186 } 4187 4188 /* 4189 * With in-kernel LAPIC, we only use this to inject EXTINT, so 4190 * fail for in-kernel 8259. 4191 */ 4192 if (pic_in_kernel(vcpu->kvm)) 4193 return -ENXIO; 4194 4195 if (vcpu->arch.pending_external_vector != -1) 4196 return -EEXIST; 4197 4198 vcpu->arch.pending_external_vector = irq->irq; 4199 kvm_make_request(KVM_REQ_EVENT, vcpu); 4200 return 0; 4201 } 4202 4203 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 4204 { 4205 kvm_inject_nmi(vcpu); 4206 4207 return 0; 4208 } 4209 4210 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) 4211 { 4212 kvm_make_request(KVM_REQ_SMI, vcpu); 4213 4214 return 0; 4215 } 4216 4217 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 4218 struct kvm_tpr_access_ctl *tac) 4219 { 4220 if (tac->flags) 4221 return -EINVAL; 4222 vcpu->arch.tpr_access_reporting = !!tac->enabled; 4223 return 0; 4224 } 4225 4226 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 4227 u64 mcg_cap) 4228 { 4229 int r; 4230 unsigned bank_num = mcg_cap & 0xff, bank; 4231 4232 r = -EINVAL; 4233 if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) 4234 goto out; 4235 if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) 4236 goto out; 4237 r = 0; 4238 vcpu->arch.mcg_cap = mcg_cap; 4239 /* Init IA32_MCG_CTL to all 1s */ 4240 if (mcg_cap & MCG_CTL_P) 4241 vcpu->arch.mcg_ctl = ~(u64)0; 4242 /* Init IA32_MCi_CTL to all 1s */ 4243 for (bank = 0; bank < bank_num; bank++) 4244 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 4245 4246 static_call(kvm_x86_setup_mce)(vcpu); 4247 out: 4248 return r; 4249 } 4250 4251 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 4252 struct kvm_x86_mce *mce) 4253 { 4254 u64 mcg_cap = vcpu->arch.mcg_cap; 4255 unsigned bank_num = mcg_cap & 0xff; 4256 u64 *banks = vcpu->arch.mce_banks; 4257 4258 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 4259 return -EINVAL; 4260 /* 4261 * if IA32_MCG_CTL is not all 1s, the uncorrected error 4262 * reporting is disabled 4263 
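 * In that case the #MC is dropped: it is neither recorded in the banks
 * nor injected into the guest.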
*/ 4264 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 4265 vcpu->arch.mcg_ctl != ~(u64)0) 4266 return 0; 4267 banks += 4 * mce->bank; 4268 /* 4269 * if IA32_MCi_CTL is not all 1s, the uncorrected error 4270 * reporting is disabled for the bank 4271 */ 4272 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 4273 return 0; 4274 if (mce->status & MCI_STATUS_UC) { 4275 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 4276 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 4277 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4278 return 0; 4279 } 4280 if (banks[1] & MCI_STATUS_VAL) 4281 mce->status |= MCI_STATUS_OVER; 4282 banks[2] = mce->addr; 4283 banks[3] = mce->misc; 4284 vcpu->arch.mcg_status = mce->mcg_status; 4285 banks[1] = mce->status; 4286 kvm_queue_exception(vcpu, MC_VECTOR); 4287 } else if (!(banks[1] & MCI_STATUS_VAL) 4288 || !(banks[1] & MCI_STATUS_UC)) { 4289 if (banks[1] & MCI_STATUS_VAL) 4290 mce->status |= MCI_STATUS_OVER; 4291 banks[2] = mce->addr; 4292 banks[3] = mce->misc; 4293 banks[1] = mce->status; 4294 } else 4295 banks[1] |= MCI_STATUS_OVER; 4296 return 0; 4297 } 4298 4299 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 4300 struct kvm_vcpu_events *events) 4301 { 4302 process_nmi(vcpu); 4303 4304 if (kvm_check_request(KVM_REQ_SMI, vcpu)) 4305 process_smi(vcpu); 4306 4307 /* 4308 * In guest mode, payload delivery should be deferred, 4309 * so that the L1 hypervisor can intercept #PF before 4310 * CR2 is modified (or intercept #DB before DR6 is 4311 * modified under nVMX). Unless the per-VM capability, 4312 * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of 4313 * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we 4314 * opportunistically defer the exception payload, deliver it if the 4315 * capability hasn't been requested before processing a 4316 * KVM_GET_VCPU_EVENTS. 4317 */ 4318 if (!vcpu->kvm->arch.exception_payload_enabled && 4319 vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) 4320 kvm_deliver_exception_payload(vcpu); 4321 4322 /* 4323 * The API doesn't provide the instruction length for software 4324 * exceptions, so don't report them. As long as the guest RIP 4325 * isn't advanced, we should expect to encounter the exception 4326 * again. 4327 */ 4328 if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { 4329 events->exception.injected = 0; 4330 events->exception.pending = 0; 4331 } else { 4332 events->exception.injected = vcpu->arch.exception.injected; 4333 events->exception.pending = vcpu->arch.exception.pending; 4334 /* 4335 * For ABI compatibility, deliberately conflate 4336 * pending and injected exceptions when 4337 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. 
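 * Old userspace then still observes the exception via the 'injected' flag.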
4338 */ 4339 if (!vcpu->kvm->arch.exception_payload_enabled) 4340 events->exception.injected |= 4341 vcpu->arch.exception.pending; 4342 } 4343 events->exception.nr = vcpu->arch.exception.nr; 4344 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 4345 events->exception.error_code = vcpu->arch.exception.error_code; 4346 events->exception_has_payload = vcpu->arch.exception.has_payload; 4347 events->exception_payload = vcpu->arch.exception.payload; 4348 4349 events->interrupt.injected = 4350 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; 4351 events->interrupt.nr = vcpu->arch.interrupt.nr; 4352 events->interrupt.soft = 0; 4353 events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); 4354 4355 events->nmi.injected = vcpu->arch.nmi_injected; 4356 events->nmi.pending = vcpu->arch.nmi_pending != 0; 4357 events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); 4358 events->nmi.pad = 0; 4359 4360 events->sipi_vector = 0; /* never valid when reporting to user space */ 4361 4362 events->smi.smm = is_smm(vcpu); 4363 events->smi.pending = vcpu->arch.smi_pending; 4364 events->smi.smm_inside_nmi = 4365 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); 4366 events->smi.latched_init = kvm_lapic_latched_init(vcpu); 4367 4368 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 4369 | KVM_VCPUEVENT_VALID_SHADOW 4370 | KVM_VCPUEVENT_VALID_SMM); 4371 if (vcpu->kvm->arch.exception_payload_enabled) 4372 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; 4373 4374 memset(&events->reserved, 0, sizeof(events->reserved)); 4375 } 4376 4377 static void kvm_smm_changed(struct kvm_vcpu *vcpu); 4378 4379 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 4380 struct kvm_vcpu_events *events) 4381 { 4382 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 4383 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 4384 | KVM_VCPUEVENT_VALID_SHADOW 4385 | KVM_VCPUEVENT_VALID_SMM 4386 | KVM_VCPUEVENT_VALID_PAYLOAD)) 4387 return -EINVAL; 4388 4389 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { 4390 if (!vcpu->kvm->arch.exception_payload_enabled) 4391 return -EINVAL; 4392 if (events->exception.pending) 4393 events->exception.injected = 0; 4394 else 4395 events->exception_has_payload = 0; 4396 } else { 4397 events->exception.pending = 0; 4398 events->exception_has_payload = 0; 4399 } 4400 4401 if ((events->exception.injected || events->exception.pending) && 4402 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) 4403 return -EINVAL; 4404 4405 /* INITs are latched while in SMM */ 4406 if (events->flags & KVM_VCPUEVENT_VALID_SMM && 4407 (events->smi.smm || events->smi.pending) && 4408 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4409 return -EINVAL; 4410 4411 process_nmi(vcpu); 4412 vcpu->arch.exception.injected = events->exception.injected; 4413 vcpu->arch.exception.pending = events->exception.pending; 4414 vcpu->arch.exception.nr = events->exception.nr; 4415 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 4416 vcpu->arch.exception.error_code = events->exception.error_code; 4417 vcpu->arch.exception.has_payload = events->exception_has_payload; 4418 vcpu->arch.exception.payload = events->exception_payload; 4419 4420 vcpu->arch.interrupt.injected = events->interrupt.injected; 4421 vcpu->arch.interrupt.nr = events->interrupt.nr; 4422 vcpu->arch.interrupt.soft = events->interrupt.soft; 4423 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 4424 static_call(kvm_x86_set_interrupt_shadow)(vcpu, 4425 events->interrupt.shadow); 4426 4427 
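/* NMI state: nmi_pending is only overwritten when userspace marks it valid. */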
vcpu->arch.nmi_injected = events->nmi.injected; 4428 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 4429 vcpu->arch.nmi_pending = events->nmi.pending; 4430 static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); 4431 4432 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && 4433 lapic_in_kernel(vcpu)) 4434 vcpu->arch.apic->sipi_vector = events->sipi_vector; 4435 4436 if (events->flags & KVM_VCPUEVENT_VALID_SMM) { 4437 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { 4438 if (events->smi.smm) 4439 vcpu->arch.hflags |= HF_SMM_MASK; 4440 else 4441 vcpu->arch.hflags &= ~HF_SMM_MASK; 4442 kvm_smm_changed(vcpu); 4443 } 4444 4445 vcpu->arch.smi_pending = events->smi.pending; 4446 4447 if (events->smi.smm) { 4448 if (events->smi.smm_inside_nmi) 4449 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; 4450 else 4451 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; 4452 } 4453 4454 if (lapic_in_kernel(vcpu)) { 4455 if (events->smi.latched_init) 4456 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); 4457 else 4458 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); 4459 } 4460 } 4461 4462 kvm_make_request(KVM_REQ_EVENT, vcpu); 4463 4464 return 0; 4465 } 4466 4467 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 4468 struct kvm_debugregs *dbgregs) 4469 { 4470 unsigned long val; 4471 4472 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 4473 kvm_get_dr(vcpu, 6, &val); 4474 dbgregs->dr6 = val; 4475 dbgregs->dr7 = vcpu->arch.dr7; 4476 dbgregs->flags = 0; 4477 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 4478 } 4479 4480 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 4481 struct kvm_debugregs *dbgregs) 4482 { 4483 if (dbgregs->flags) 4484 return -EINVAL; 4485 4486 if (!kvm_dr6_valid(dbgregs->dr6)) 4487 return -EINVAL; 4488 if (!kvm_dr7_valid(dbgregs->dr7)) 4489 return -EINVAL; 4490 4491 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 4492 kvm_update_dr0123(vcpu); 4493 vcpu->arch.dr6 = dbgregs->dr6; 4494 vcpu->arch.dr7 = dbgregs->dr7; 4495 kvm_update_dr7(vcpu); 4496 4497 return 0; 4498 } 4499 4500 #define XSTATE_COMPACTION_ENABLED (1ULL << 63) 4501 4502 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) 4503 { 4504 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; 4505 u64 xstate_bv = xsave->header.xfeatures; 4506 u64 valid; 4507 4508 /* 4509 * Copy legacy XSAVE area, to avoid complications with CPUID 4510 * leaves 0 and 1 in the loop below. 4511 */ 4512 memcpy(dest, xsave, XSAVE_HDR_OFFSET); 4513 4514 /* Set XSTATE_BV */ 4515 xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; 4516 *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; 4517 4518 /* 4519 * Copy each region from the possibly compacted offset to the 4520 * non-compacted offset. 
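 * The non-compacted offsets and sizes are taken from CPUID leaf 0xD
 * (see the cpuid_count() call below).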
4521 */ 4522 valid = xstate_bv & ~XFEATURE_MASK_FPSSE; 4523 while (valid) { 4524 u64 xfeature_mask = valid & -valid; 4525 int xfeature_nr = fls64(xfeature_mask) - 1; 4526 void *src = get_xsave_addr(xsave, xfeature_nr); 4527 4528 if (src) { 4529 u32 size, offset, ecx, edx; 4530 cpuid_count(XSTATE_CPUID, xfeature_nr, 4531 &size, &offset, &ecx, &edx); 4532 if (xfeature_nr == XFEATURE_PKRU) 4533 memcpy(dest + offset, &vcpu->arch.pkru, 4534 sizeof(vcpu->arch.pkru)); 4535 else 4536 memcpy(dest + offset, src, size); 4537 4538 } 4539 4540 valid -= xfeature_mask; 4541 } 4542 } 4543 4544 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) 4545 { 4546 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; 4547 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); 4548 u64 valid; 4549 4550 /* 4551 * Copy legacy XSAVE area, to avoid complications with CPUID 4552 * leaves 0 and 1 in the loop below. 4553 */ 4554 memcpy(xsave, src, XSAVE_HDR_OFFSET); 4555 4556 /* Set XSTATE_BV and possibly XCOMP_BV. */ 4557 xsave->header.xfeatures = xstate_bv; 4558 if (boot_cpu_has(X86_FEATURE_XSAVES)) 4559 xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; 4560 4561 /* 4562 * Copy each region from the non-compacted offset to the 4563 * possibly compacted offset. 4564 */ 4565 valid = xstate_bv & ~XFEATURE_MASK_FPSSE; 4566 while (valid) { 4567 u64 xfeature_mask = valid & -valid; 4568 int xfeature_nr = fls64(xfeature_mask) - 1; 4569 void *dest = get_xsave_addr(xsave, xfeature_nr); 4570 4571 if (dest) { 4572 u32 size, offset, ecx, edx; 4573 cpuid_count(XSTATE_CPUID, xfeature_nr, 4574 &size, &offset, &ecx, &edx); 4575 if (xfeature_nr == XFEATURE_PKRU) 4576 memcpy(&vcpu->arch.pkru, src + offset, 4577 sizeof(vcpu->arch.pkru)); 4578 else 4579 memcpy(dest, src + offset, size); 4580 } 4581 4582 valid -= xfeature_mask; 4583 } 4584 } 4585 4586 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 4587 struct kvm_xsave *guest_xsave) 4588 { 4589 if (!vcpu->arch.guest_fpu) 4590 return; 4591 4592 if (boot_cpu_has(X86_FEATURE_XSAVE)) { 4593 memset(guest_xsave, 0, sizeof(struct kvm_xsave)); 4594 fill_xsave((u8 *) guest_xsave->region, vcpu); 4595 } else { 4596 memcpy(guest_xsave->region, 4597 &vcpu->arch.guest_fpu->state.fxsave, 4598 sizeof(struct fxregs_state)); 4599 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 4600 XFEATURE_MASK_FPSSE; 4601 } 4602 } 4603 4604 #define XSAVE_MXCSR_OFFSET 24 4605 4606 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 4607 struct kvm_xsave *guest_xsave) 4608 { 4609 u64 xstate_bv; 4610 u32 mxcsr; 4611 4612 if (!vcpu->arch.guest_fpu) 4613 return 0; 4614 4615 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 4616 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; 4617 4618 if (boot_cpu_has(X86_FEATURE_XSAVE)) { 4619 /* 4620 * Here we allow setting states that are not present in 4621 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility 4622 * with old userspace. 
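 * Feature bits that KVM itself does not support (outside supported_xcr0)
 * are still rejected below.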
4623 */ 4624 if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) 4625 return -EINVAL; 4626 load_xsave(vcpu, (u8 *)guest_xsave->region); 4627 } else { 4628 if (xstate_bv & ~XFEATURE_MASK_FPSSE || 4629 mxcsr & ~mxcsr_feature_mask) 4630 return -EINVAL; 4631 memcpy(&vcpu->arch.guest_fpu->state.fxsave, 4632 guest_xsave->region, sizeof(struct fxregs_state)); 4633 } 4634 return 0; 4635 } 4636 4637 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 4638 struct kvm_xcrs *guest_xcrs) 4639 { 4640 if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 4641 guest_xcrs->nr_xcrs = 0; 4642 return; 4643 } 4644 4645 guest_xcrs->nr_xcrs = 1; 4646 guest_xcrs->flags = 0; 4647 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 4648 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 4649 } 4650 4651 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 4652 struct kvm_xcrs *guest_xcrs) 4653 { 4654 int i, r = 0; 4655 4656 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 4657 return -EINVAL; 4658 4659 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 4660 return -EINVAL; 4661 4662 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 4663 /* Only support XCR0 currently */ 4664 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { 4665 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 4666 guest_xcrs->xcrs[i].value); 4667 break; 4668 } 4669 if (r) 4670 r = -EINVAL; 4671 return r; 4672 } 4673 4674 /* 4675 * kvm_set_guest_paused() indicates to the guest kernel that it has been 4676 * stopped by the hypervisor. This function will be called from the host only. 4677 * EINVAL is returned when the host attempts to set the flag for a guest that 4678 * does not support pv clocks. 4679 */ 4680 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 4681 { 4682 if (!vcpu->arch.pv_time_enabled) 4683 return -EINVAL; 4684 vcpu->arch.pvclock_set_guest_stopped_request = true; 4685 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 4686 return 0; 4687 } 4688 4689 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, 4690 struct kvm_enable_cap *cap) 4691 { 4692 int r; 4693 uint16_t vmcs_version; 4694 void __user *user_ptr; 4695 4696 if (cap->flags) 4697 return -EINVAL; 4698 4699 switch (cap->cap) { 4700 case KVM_CAP_HYPERV_SYNIC2: 4701 if (cap->args[0]) 4702 return -EINVAL; 4703 fallthrough; 4704 4705 case KVM_CAP_HYPERV_SYNIC: 4706 if (!irqchip_in_kernel(vcpu->kvm)) 4707 return -EINVAL; 4708 return kvm_hv_activate_synic(vcpu, cap->cap == 4709 KVM_CAP_HYPERV_SYNIC2); 4710 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: 4711 if (!kvm_x86_ops.nested_ops->enable_evmcs) 4712 return -ENOTTY; 4713 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); 4714 if (!r) { 4715 user_ptr = (void __user *)(uintptr_t)cap->args[0]; 4716 if (copy_to_user(user_ptr, &vmcs_version, 4717 sizeof(vmcs_version))) 4718 r = -EFAULT; 4719 } 4720 return r; 4721 case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: 4722 if (!kvm_x86_ops.enable_direct_tlbflush) 4723 return -ENOTTY; 4724 4725 return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); 4726 4727 case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: 4728 vcpu->arch.pv_cpuid.enforce = cap->args[0]; 4729 if (vcpu->arch.pv_cpuid.enforce) 4730 kvm_update_pv_runtime(vcpu); 4731 4732 return 0; 4733 default: 4734 return -EINVAL; 4735 } 4736 } 4737 4738 long kvm_arch_vcpu_ioctl(struct file *filp, 4739 unsigned int ioctl, unsigned long arg) 4740 { 4741 struct kvm_vcpu *vcpu = filp->private_data; 4742 void __user *argp = (void __user *)arg; 4743 int r; 4744 union { 4745 struct kvm_lapic_state *lapic; 4746 struct kvm_xsave *xsave; 4747 
struct kvm_xcrs *xcrs; 4748 void *buffer; 4749 } u; 4750 4751 vcpu_load(vcpu); 4752 4753 u.buffer = NULL; 4754 switch (ioctl) { 4755 case KVM_GET_LAPIC: { 4756 r = -EINVAL; 4757 if (!lapic_in_kernel(vcpu)) 4758 goto out; 4759 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), 4760 GFP_KERNEL_ACCOUNT); 4761 4762 r = -ENOMEM; 4763 if (!u.lapic) 4764 goto out; 4765 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 4766 if (r) 4767 goto out; 4768 r = -EFAULT; 4769 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 4770 goto out; 4771 r = 0; 4772 break; 4773 } 4774 case KVM_SET_LAPIC: { 4775 r = -EINVAL; 4776 if (!lapic_in_kernel(vcpu)) 4777 goto out; 4778 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 4779 if (IS_ERR(u.lapic)) { 4780 r = PTR_ERR(u.lapic); 4781 goto out_nofree; 4782 } 4783 4784 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 4785 break; 4786 } 4787 case KVM_INTERRUPT: { 4788 struct kvm_interrupt irq; 4789 4790 r = -EFAULT; 4791 if (copy_from_user(&irq, argp, sizeof(irq))) 4792 goto out; 4793 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 4794 break; 4795 } 4796 case KVM_NMI: { 4797 r = kvm_vcpu_ioctl_nmi(vcpu); 4798 break; 4799 } 4800 case KVM_SMI: { 4801 r = kvm_vcpu_ioctl_smi(vcpu); 4802 break; 4803 } 4804 case KVM_SET_CPUID: { 4805 struct kvm_cpuid __user *cpuid_arg = argp; 4806 struct kvm_cpuid cpuid; 4807 4808 r = -EFAULT; 4809 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 4810 goto out; 4811 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 4812 break; 4813 } 4814 case KVM_SET_CPUID2: { 4815 struct kvm_cpuid2 __user *cpuid_arg = argp; 4816 struct kvm_cpuid2 cpuid; 4817 4818 r = -EFAULT; 4819 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 4820 goto out; 4821 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 4822 cpuid_arg->entries); 4823 break; 4824 } 4825 case KVM_GET_CPUID2: { 4826 struct kvm_cpuid2 __user *cpuid_arg = argp; 4827 struct kvm_cpuid2 cpuid; 4828 4829 r = -EFAULT; 4830 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 4831 goto out; 4832 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 4833 cpuid_arg->entries); 4834 if (r) 4835 goto out; 4836 r = -EFAULT; 4837 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) 4838 goto out; 4839 r = 0; 4840 break; 4841 } 4842 case KVM_GET_MSRS: { 4843 int idx = srcu_read_lock(&vcpu->kvm->srcu); 4844 r = msr_io(vcpu, argp, do_get_msr, 1); 4845 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4846 break; 4847 } 4848 case KVM_SET_MSRS: { 4849 int idx = srcu_read_lock(&vcpu->kvm->srcu); 4850 r = msr_io(vcpu, argp, do_set_msr, 0); 4851 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4852 break; 4853 } 4854 case KVM_TPR_ACCESS_REPORTING: { 4855 struct kvm_tpr_access_ctl tac; 4856 4857 r = -EFAULT; 4858 if (copy_from_user(&tac, argp, sizeof(tac))) 4859 goto out; 4860 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 4861 if (r) 4862 goto out; 4863 r = -EFAULT; 4864 if (copy_to_user(argp, &tac, sizeof(tac))) 4865 goto out; 4866 r = 0; 4867 break; 4868 }; 4869 case KVM_SET_VAPIC_ADDR: { 4870 struct kvm_vapic_addr va; 4871 int idx; 4872 4873 r = -EINVAL; 4874 if (!lapic_in_kernel(vcpu)) 4875 goto out; 4876 r = -EFAULT; 4877 if (copy_from_user(&va, argp, sizeof(va))) 4878 goto out; 4879 idx = srcu_read_lock(&vcpu->kvm->srcu); 4880 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 4881 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4882 break; 4883 } 4884 case KVM_X86_SETUP_MCE: { 4885 u64 mcg_cap; 4886 4887 r = -EFAULT; 4888 if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap))) 4889 goto out; 4890 r = 
kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 4891 break; 4892 } 4893 case KVM_X86_SET_MCE: { 4894 struct kvm_x86_mce mce; 4895 4896 r = -EFAULT; 4897 if (copy_from_user(&mce, argp, sizeof(mce))) 4898 goto out; 4899 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 4900 break; 4901 } 4902 case KVM_GET_VCPU_EVENTS: { 4903 struct kvm_vcpu_events events; 4904 4905 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 4906 4907 r = -EFAULT; 4908 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 4909 break; 4910 r = 0; 4911 break; 4912 } 4913 case KVM_SET_VCPU_EVENTS: { 4914 struct kvm_vcpu_events events; 4915 4916 r = -EFAULT; 4917 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 4918 break; 4919 4920 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 4921 break; 4922 } 4923 case KVM_GET_DEBUGREGS: { 4924 struct kvm_debugregs dbgregs; 4925 4926 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); 4927 4928 r = -EFAULT; 4929 if (copy_to_user(argp, &dbgregs, 4930 sizeof(struct kvm_debugregs))) 4931 break; 4932 r = 0; 4933 break; 4934 } 4935 case KVM_SET_DEBUGREGS: { 4936 struct kvm_debugregs dbgregs; 4937 4938 r = -EFAULT; 4939 if (copy_from_user(&dbgregs, argp, 4940 sizeof(struct kvm_debugregs))) 4941 break; 4942 4943 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 4944 break; 4945 } 4946 case KVM_GET_XSAVE: { 4947 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); 4948 r = -ENOMEM; 4949 if (!u.xsave) 4950 break; 4951 4952 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 4953 4954 r = -EFAULT; 4955 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 4956 break; 4957 r = 0; 4958 break; 4959 } 4960 case KVM_SET_XSAVE: { 4961 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 4962 if (IS_ERR(u.xsave)) { 4963 r = PTR_ERR(u.xsave); 4964 goto out_nofree; 4965 } 4966 4967 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 4968 break; 4969 } 4970 case KVM_GET_XCRS: { 4971 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); 4972 r = -ENOMEM; 4973 if (!u.xcrs) 4974 break; 4975 4976 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 4977 4978 r = -EFAULT; 4979 if (copy_to_user(argp, u.xcrs, 4980 sizeof(struct kvm_xcrs))) 4981 break; 4982 r = 0; 4983 break; 4984 } 4985 case KVM_SET_XCRS: { 4986 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 4987 if (IS_ERR(u.xcrs)) { 4988 r = PTR_ERR(u.xcrs); 4989 goto out_nofree; 4990 } 4991 4992 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 4993 break; 4994 } 4995 case KVM_SET_TSC_KHZ: { 4996 u32 user_tsc_khz; 4997 4998 r = -EINVAL; 4999 user_tsc_khz = (u32)arg; 5000 5001 if (kvm_has_tsc_control && 5002 user_tsc_khz >= kvm_max_guest_tsc_khz) 5003 goto out; 5004 5005 if (user_tsc_khz == 0) 5006 user_tsc_khz = tsc_khz; 5007 5008 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) 5009 r = 0; 5010 5011 goto out; 5012 } 5013 case KVM_GET_TSC_KHZ: { 5014 r = vcpu->arch.virtual_tsc_khz; 5015 goto out; 5016 } 5017 case KVM_KVMCLOCK_CTRL: { 5018 r = kvm_set_guest_paused(vcpu); 5019 goto out; 5020 } 5021 case KVM_ENABLE_CAP: { 5022 struct kvm_enable_cap cap; 5023 5024 r = -EFAULT; 5025 if (copy_from_user(&cap, argp, sizeof(cap))) 5026 goto out; 5027 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); 5028 break; 5029 } 5030 case KVM_GET_NESTED_STATE: { 5031 struct kvm_nested_state __user *user_kvm_nested_state = argp; 5032 u32 user_data_size; 5033 5034 r = -EINVAL; 5035 if (!kvm_x86_ops.nested_ops->get_state) 5036 break; 5037 5038 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); 5039 r = -EFAULT; 5040 if 
(get_user(user_data_size, &user_kvm_nested_state->size)) 5041 break; 5042 5043 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, 5044 user_data_size); 5045 if (r < 0) 5046 break; 5047 5048 if (r > user_data_size) { 5049 if (put_user(r, &user_kvm_nested_state->size)) 5050 r = -EFAULT; 5051 else 5052 r = -E2BIG; 5053 break; 5054 } 5055 5056 r = 0; 5057 break; 5058 } 5059 case KVM_SET_NESTED_STATE: { 5060 struct kvm_nested_state __user *user_kvm_nested_state = argp; 5061 struct kvm_nested_state kvm_state; 5062 int idx; 5063 5064 r = -EINVAL; 5065 if (!kvm_x86_ops.nested_ops->set_state) 5066 break; 5067 5068 r = -EFAULT; 5069 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) 5070 break; 5071 5072 r = -EINVAL; 5073 if (kvm_state.size < sizeof(kvm_state)) 5074 break; 5075 5076 if (kvm_state.flags & 5077 ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE 5078 | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING 5079 | KVM_STATE_NESTED_GIF_SET)) 5080 break; 5081 5082 /* nested_run_pending implies guest_mode. */ 5083 if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING) 5084 && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) 5085 break; 5086 5087 idx = srcu_read_lock(&vcpu->kvm->srcu); 5088 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); 5089 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5090 break; 5091 } 5092 case KVM_GET_SUPPORTED_HV_CPUID: 5093 r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); 5094 break; 5095 #ifdef CONFIG_KVM_XEN 5096 case KVM_XEN_VCPU_GET_ATTR: { 5097 struct kvm_xen_vcpu_attr xva; 5098 5099 r = -EFAULT; 5100 if (copy_from_user(&xva, argp, sizeof(xva))) 5101 goto out; 5102 r = kvm_xen_vcpu_get_attr(vcpu, &xva); 5103 if (!r && copy_to_user(argp, &xva, sizeof(xva))) 5104 r = -EFAULT; 5105 break; 5106 } 5107 case KVM_XEN_VCPU_SET_ATTR: { 5108 struct kvm_xen_vcpu_attr xva; 5109 5110 r = -EFAULT; 5111 if (copy_from_user(&xva, argp, sizeof(xva))) 5112 goto out; 5113 r = kvm_xen_vcpu_set_attr(vcpu, &xva); 5114 break; 5115 } 5116 #endif 5117 default: 5118 r = -EINVAL; 5119 } 5120 out: 5121 kfree(u.buffer); 5122 out_nofree: 5123 vcpu_put(vcpu); 5124 return r; 5125 } 5126 5127 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) 5128 { 5129 return VM_FAULT_SIGBUS; 5130 } 5131 5132 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 5133 { 5134 int ret; 5135 5136 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 5137 return -EINVAL; 5138 ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); 5139 return ret; 5140 } 5141 5142 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 5143 u64 ident_addr) 5144 { 5145 return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); 5146 } 5147 5148 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 5149 unsigned long kvm_nr_mmu_pages) 5150 { 5151 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 5152 return -EINVAL; 5153 5154 mutex_lock(&kvm->slots_lock); 5155 5156 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 5157 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 5158 5159 mutex_unlock(&kvm->slots_lock); 5160 return 0; 5161 } 5162 5163 static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 5164 { 5165 return kvm->arch.n_max_mmu_pages; 5166 } 5167 5168 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 5169 { 5170 struct kvm_pic *pic = kvm->arch.vpic; 5171 int r; 5172 5173 r = 0; 5174 switch (chip->chip_id) { 5175 case KVM_IRQCHIP_PIC_MASTER: 5176 
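/* pics[0] holds the master PIC state, pics[1] the slave. */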
memcpy(&chip->chip.pic, &pic->pics[0], 5177 sizeof(struct kvm_pic_state)); 5178 break; 5179 case KVM_IRQCHIP_PIC_SLAVE: 5180 memcpy(&chip->chip.pic, &pic->pics[1], 5181 sizeof(struct kvm_pic_state)); 5182 break; 5183 case KVM_IRQCHIP_IOAPIC: 5184 kvm_get_ioapic(kvm, &chip->chip.ioapic); 5185 break; 5186 default: 5187 r = -EINVAL; 5188 break; 5189 } 5190 return r; 5191 } 5192 5193 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 5194 { 5195 struct kvm_pic *pic = kvm->arch.vpic; 5196 int r; 5197 5198 r = 0; 5199 switch (chip->chip_id) { 5200 case KVM_IRQCHIP_PIC_MASTER: 5201 spin_lock(&pic->lock); 5202 memcpy(&pic->pics[0], &chip->chip.pic, 5203 sizeof(struct kvm_pic_state)); 5204 spin_unlock(&pic->lock); 5205 break; 5206 case KVM_IRQCHIP_PIC_SLAVE: 5207 spin_lock(&pic->lock); 5208 memcpy(&pic->pics[1], &chip->chip.pic, 5209 sizeof(struct kvm_pic_state)); 5210 spin_unlock(&pic->lock); 5211 break; 5212 case KVM_IRQCHIP_IOAPIC: 5213 kvm_set_ioapic(kvm, &chip->chip.ioapic); 5214 break; 5215 default: 5216 r = -EINVAL; 5217 break; 5218 } 5219 kvm_pic_update_irq(pic); 5220 return r; 5221 } 5222 5223 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 5224 { 5225 struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; 5226 5227 BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); 5228 5229 mutex_lock(&kps->lock); 5230 memcpy(ps, &kps->channels, sizeof(*ps)); 5231 mutex_unlock(&kps->lock); 5232 return 0; 5233 } 5234 5235 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 5236 { 5237 int i; 5238 struct kvm_pit *pit = kvm->arch.vpit; 5239 5240 mutex_lock(&pit->pit_state.lock); 5241 memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); 5242 for (i = 0; i < 3; i++) 5243 kvm_pit_load_count(pit, i, ps->channels[i].count, 0); 5244 mutex_unlock(&pit->pit_state.lock); 5245 return 0; 5246 } 5247 5248 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 5249 { 5250 mutex_lock(&kvm->arch.vpit->pit_state.lock); 5251 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 5252 sizeof(ps->channels)); 5253 ps->flags = kvm->arch.vpit->pit_state.flags; 5254 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 5255 memset(&ps->reserved, 0, sizeof(ps->reserved)); 5256 return 0; 5257 } 5258 5259 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 5260 { 5261 int start = 0; 5262 int i; 5263 u32 prev_legacy, cur_legacy; 5264 struct kvm_pit *pit = kvm->arch.vpit; 5265 5266 mutex_lock(&pit->pit_state.lock); 5267 prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 5268 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 5269 if (!prev_legacy && cur_legacy) 5270 start = 1; 5271 memcpy(&pit->pit_state.channels, &ps->channels, 5272 sizeof(pit->pit_state.channels)); 5273 pit->pit_state.flags = ps->flags; 5274 for (i = 0; i < 3; i++) 5275 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, 5276 start && i == 0); 5277 mutex_unlock(&pit->pit_state.lock); 5278 return 0; 5279 } 5280 5281 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 5282 struct kvm_reinject_control *control) 5283 { 5284 struct kvm_pit *pit = kvm->arch.vpit; 5285 5286 /* pit->pit_state.lock was overloaded to prevent userspace from getting 5287 * an inconsistent state after running multiple KVM_REINJECT_CONTROL 5288 * ioctls in parallel. Use a separate lock if that ioctl isn't rare. 
5289 */ 5290 mutex_lock(&pit->pit_state.lock); 5291 kvm_pit_set_reinject(pit, control->pit_reinject); 5292 mutex_unlock(&pit->pit_state.lock); 5293 5294 return 0; 5295 } 5296 5297 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 5298 { 5299 5300 /* 5301 * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called 5302 * before reporting dirty_bitmap to userspace. KVM flushes the buffers 5303 * on all VM-Exits, thus we only need to kick running vCPUs to force a 5304 * VM-Exit. 5305 */ 5306 struct kvm_vcpu *vcpu; 5307 int i; 5308 5309 kvm_for_each_vcpu(i, vcpu, kvm) 5310 kvm_vcpu_kick(vcpu); 5311 } 5312 5313 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 5314 bool line_status) 5315 { 5316 if (!irqchip_in_kernel(kvm)) 5317 return -ENXIO; 5318 5319 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 5320 irq_event->irq, irq_event->level, 5321 line_status); 5322 return 0; 5323 } 5324 5325 int kvm_vm_ioctl_enable_cap(struct kvm *kvm, 5326 struct kvm_enable_cap *cap) 5327 { 5328 int r; 5329 5330 if (cap->flags) 5331 return -EINVAL; 5332 5333 switch (cap->cap) { 5334 case KVM_CAP_DISABLE_QUIRKS: 5335 kvm->arch.disabled_quirks = cap->args[0]; 5336 r = 0; 5337 break; 5338 case KVM_CAP_SPLIT_IRQCHIP: { 5339 mutex_lock(&kvm->lock); 5340 r = -EINVAL; 5341 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) 5342 goto split_irqchip_unlock; 5343 r = -EEXIST; 5344 if (irqchip_in_kernel(kvm)) 5345 goto split_irqchip_unlock; 5346 if (kvm->created_vcpus) 5347 goto split_irqchip_unlock; 5348 r = kvm_setup_empty_irq_routing(kvm); 5349 if (r) 5350 goto split_irqchip_unlock; 5351 /* Pairs with irqchip_in_kernel. */ 5352 smp_wmb(); 5353 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; 5354 kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; 5355 r = 0; 5356 split_irqchip_unlock: 5357 mutex_unlock(&kvm->lock); 5358 break; 5359 } 5360 case KVM_CAP_X2APIC_API: 5361 r = -EINVAL; 5362 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) 5363 break; 5364 5365 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) 5366 kvm->arch.x2apic_format = true; 5367 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) 5368 kvm->arch.x2apic_broadcast_quirk_disabled = true; 5369 5370 r = 0; 5371 break; 5372 case KVM_CAP_X86_DISABLE_EXITS: 5373 r = -EINVAL; 5374 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) 5375 break; 5376 5377 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && 5378 kvm_can_mwait_in_guest()) 5379 kvm->arch.mwait_in_guest = true; 5380 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) 5381 kvm->arch.hlt_in_guest = true; 5382 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) 5383 kvm->arch.pause_in_guest = true; 5384 if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) 5385 kvm->arch.cstate_in_guest = true; 5386 r = 0; 5387 break; 5388 case KVM_CAP_MSR_PLATFORM_INFO: 5389 kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; 5390 r = 0; 5391 break; 5392 case KVM_CAP_EXCEPTION_PAYLOAD: 5393 kvm->arch.exception_payload_enabled = cap->args[0]; 5394 r = 0; 5395 break; 5396 case KVM_CAP_X86_USER_SPACE_MSR: 5397 kvm->arch.user_space_msr_mask = cap->args[0]; 5398 r = 0; 5399 break; 5400 case KVM_CAP_X86_BUS_LOCK_EXIT: 5401 r = -EINVAL; 5402 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) 5403 break; 5404 5405 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && 5406 (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) 5407 break; 5408 5409 if (kvm_has_bus_lock_exit && 5410 cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) 5411 kvm->arch.bus_lock_detection_enabled = true; 5412 
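/* KVM_BUS_LOCK_DETECTION_OFF requires no additional state. */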
r = 0; 5413 break; 5414 #ifdef CONFIG_X86_SGX_KVM 5415 case KVM_CAP_SGX_ATTRIBUTE: { 5416 unsigned long allowed_attributes = 0; 5417 5418 r = sgx_set_attribute(&allowed_attributes, cap->args[0]); 5419 if (r) 5420 break; 5421 5422 /* KVM only supports the PROVISIONKEY privileged attribute. */ 5423 if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && 5424 !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) 5425 kvm->arch.sgx_provisioning_allowed = true; 5426 else 5427 r = -EINVAL; 5428 break; 5429 } 5430 #endif 5431 case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: 5432 r = -EINVAL; 5433 if (kvm_x86_ops.vm_copy_enc_context_from) 5434 r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); 5435 return r; 5436 default: 5437 r = -EINVAL; 5438 break; 5439 } 5440 return r; 5441 } 5442 5443 static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) 5444 { 5445 struct kvm_x86_msr_filter *msr_filter; 5446 5447 msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); 5448 if (!msr_filter) 5449 return NULL; 5450 5451 msr_filter->default_allow = default_allow; 5452 return msr_filter; 5453 } 5454 5455 static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) 5456 { 5457 u32 i; 5458 5459 if (!msr_filter) 5460 return; 5461 5462 for (i = 0; i < msr_filter->count; i++) 5463 kfree(msr_filter->ranges[i].bitmap); 5464 5465 kfree(msr_filter); 5466 } 5467 5468 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, 5469 struct kvm_msr_filter_range *user_range) 5470 { 5471 struct msr_bitmap_range range; 5472 unsigned long *bitmap = NULL; 5473 size_t bitmap_size; 5474 int r; 5475 5476 if (!user_range->nmsrs) 5477 return 0; 5478 5479 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); 5480 if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) 5481 return -EINVAL; 5482 5483 bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); 5484 if (IS_ERR(bitmap)) 5485 return PTR_ERR(bitmap); 5486 5487 range = (struct msr_bitmap_range) { 5488 .flags = user_range->flags, 5489 .base = user_range->base, 5490 .nmsrs = user_range->nmsrs, 5491 .bitmap = bitmap, 5492 }; 5493 5494 if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { 5495 r = -EINVAL; 5496 goto err; 5497 } 5498 5499 if (!range.flags) { 5500 r = -EINVAL; 5501 goto err; 5502 } 5503 5504 /* Everything ok, add this range identifier. 
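 * to the filter under construction; the caller publishes the complete
 * filter under kvm->lock once every range has been validated.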
*/ 5505 msr_filter->ranges[msr_filter->count] = range; 5506 msr_filter->count++; 5507 5508 return 0; 5509 err: 5510 kfree(bitmap); 5511 return r; 5512 } 5513 5514 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) 5515 { 5516 struct kvm_msr_filter __user *user_msr_filter = argp; 5517 struct kvm_x86_msr_filter *new_filter, *old_filter; 5518 struct kvm_msr_filter filter; 5519 bool default_allow; 5520 bool empty = true; 5521 int r = 0; 5522 u32 i; 5523 5524 if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) 5525 return -EFAULT; 5526 5527 for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) 5528 empty &= !filter.ranges[i].nmsrs; 5529 5530 default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); 5531 if (empty && !default_allow) 5532 return -EINVAL; 5533 5534 new_filter = kvm_alloc_msr_filter(default_allow); 5535 if (!new_filter) 5536 return -ENOMEM; 5537 5538 for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { 5539 r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); 5540 if (r) { 5541 kvm_free_msr_filter(new_filter); 5542 return r; 5543 } 5544 } 5545 5546 mutex_lock(&kvm->lock); 5547 5548 /* The per-VM filter is protected by kvm->lock... */ 5549 old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); 5550 5551 rcu_assign_pointer(kvm->arch.msr_filter, new_filter); 5552 synchronize_srcu(&kvm->srcu); 5553 5554 kvm_free_msr_filter(old_filter); 5555 5556 kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); 5557 mutex_unlock(&kvm->lock); 5558 5559 return 0; 5560 } 5561 5562 long kvm_arch_vm_ioctl(struct file *filp, 5563 unsigned int ioctl, unsigned long arg) 5564 { 5565 struct kvm *kvm = filp->private_data; 5566 void __user *argp = (void __user *)arg; 5567 int r = -ENOTTY; 5568 /* 5569 * This union makes it completely explicit to gcc-3.x 5570 * that these two variables' stack usage should be 5571 * combined, not added together. 5572 */ 5573 union { 5574 struct kvm_pit_state ps; 5575 struct kvm_pit_state2 ps2; 5576 struct kvm_pit_config pit_config; 5577 } u; 5578 5579 switch (ioctl) { 5580 case KVM_SET_TSS_ADDR: 5581 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 5582 break; 5583 case KVM_SET_IDENTITY_MAP_ADDR: { 5584 u64 ident_addr; 5585 5586 mutex_lock(&kvm->lock); 5587 r = -EINVAL; 5588 if (kvm->created_vcpus) 5589 goto set_identity_unlock; 5590 r = -EFAULT; 5591 if (copy_from_user(&ident_addr, argp, sizeof(ident_addr))) 5592 goto set_identity_unlock; 5593 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 5594 set_identity_unlock: 5595 mutex_unlock(&kvm->lock); 5596 break; 5597 } 5598 case KVM_SET_NR_MMU_PAGES: 5599 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 5600 break; 5601 case KVM_GET_NR_MMU_PAGES: 5602 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 5603 break; 5604 case KVM_CREATE_IRQCHIP: { 5605 mutex_lock(&kvm->lock); 5606 5607 r = -EEXIST; 5608 if (irqchip_in_kernel(kvm)) 5609 goto create_irqchip_unlock; 5610 5611 r = -EINVAL; 5612 if (kvm->created_vcpus) 5613 goto create_irqchip_unlock; 5614 5615 r = kvm_pic_init(kvm); 5616 if (r) 5617 goto create_irqchip_unlock; 5618 5619 r = kvm_ioapic_init(kvm); 5620 if (r) { 5621 kvm_pic_destroy(kvm); 5622 goto create_irqchip_unlock; 5623 } 5624 5625 r = kvm_setup_default_irq_routing(kvm); 5626 if (r) { 5627 kvm_ioapic_destroy(kvm); 5628 kvm_pic_destroy(kvm); 5629 goto create_irqchip_unlock; 5630 } 5631 /* Write kvm->irq_routing before enabling irqchip_in_kernel. 
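 * The smp_wmb() below provides that ordering.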
*/ 5632 smp_wmb(); 5633 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; 5634 create_irqchip_unlock: 5635 mutex_unlock(&kvm->lock); 5636 break; 5637 } 5638 case KVM_CREATE_PIT: 5639 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 5640 goto create_pit; 5641 case KVM_CREATE_PIT2: 5642 r = -EFAULT; 5643 if (copy_from_user(&u.pit_config, argp, 5644 sizeof(struct kvm_pit_config))) 5645 goto out; 5646 create_pit: 5647 mutex_lock(&kvm->lock); 5648 r = -EEXIST; 5649 if (kvm->arch.vpit) 5650 goto create_pit_unlock; 5651 r = -ENOMEM; 5652 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 5653 if (kvm->arch.vpit) 5654 r = 0; 5655 create_pit_unlock: 5656 mutex_unlock(&kvm->lock); 5657 break; 5658 case KVM_GET_IRQCHIP: { 5659 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 5660 struct kvm_irqchip *chip; 5661 5662 chip = memdup_user(argp, sizeof(*chip)); 5663 if (IS_ERR(chip)) { 5664 r = PTR_ERR(chip); 5665 goto out; 5666 } 5667 5668 r = -ENXIO; 5669 if (!irqchip_kernel(kvm)) 5670 goto get_irqchip_out; 5671 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 5672 if (r) 5673 goto get_irqchip_out; 5674 r = -EFAULT; 5675 if (copy_to_user(argp, chip, sizeof(*chip))) 5676 goto get_irqchip_out; 5677 r = 0; 5678 get_irqchip_out: 5679 kfree(chip); 5680 break; 5681 } 5682 case KVM_SET_IRQCHIP: { 5683 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 5684 struct kvm_irqchip *chip; 5685 5686 chip = memdup_user(argp, sizeof(*chip)); 5687 if (IS_ERR(chip)) { 5688 r = PTR_ERR(chip); 5689 goto out; 5690 } 5691 5692 r = -ENXIO; 5693 if (!irqchip_kernel(kvm)) 5694 goto set_irqchip_out; 5695 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 5696 set_irqchip_out: 5697 kfree(chip); 5698 break; 5699 } 5700 case KVM_GET_PIT: { 5701 r = -EFAULT; 5702 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 5703 goto out; 5704 r = -ENXIO; 5705 if (!kvm->arch.vpit) 5706 goto out; 5707 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 5708 if (r) 5709 goto out; 5710 r = -EFAULT; 5711 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 5712 goto out; 5713 r = 0; 5714 break; 5715 } 5716 case KVM_SET_PIT: { 5717 r = -EFAULT; 5718 if (copy_from_user(&u.ps, argp, sizeof(u.ps))) 5719 goto out; 5720 mutex_lock(&kvm->lock); 5721 r = -ENXIO; 5722 if (!kvm->arch.vpit) 5723 goto set_pit_out; 5724 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 5725 set_pit_out: 5726 mutex_unlock(&kvm->lock); 5727 break; 5728 } 5729 case KVM_GET_PIT2: { 5730 r = -ENXIO; 5731 if (!kvm->arch.vpit) 5732 goto out; 5733 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 5734 if (r) 5735 goto out; 5736 r = -EFAULT; 5737 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 5738 goto out; 5739 r = 0; 5740 break; 5741 } 5742 case KVM_SET_PIT2: { 5743 r = -EFAULT; 5744 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 5745 goto out; 5746 mutex_lock(&kvm->lock); 5747 r = -ENXIO; 5748 if (!kvm->arch.vpit) 5749 goto set_pit2_out; 5750 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 5751 set_pit2_out: 5752 mutex_unlock(&kvm->lock); 5753 break; 5754 } 5755 case KVM_REINJECT_CONTROL: { 5756 struct kvm_reinject_control control; 5757 r = -EFAULT; 5758 if (copy_from_user(&control, argp, sizeof(control))) 5759 goto out; 5760 r = -ENXIO; 5761 if (!kvm->arch.vpit) 5762 goto out; 5763 r = kvm_vm_ioctl_reinject(kvm, &control); 5764 break; 5765 } 5766 case KVM_SET_BOOT_CPU_ID: 5767 r = 0; 5768 mutex_lock(&kvm->lock); 5769 if (kvm->created_vcpus) 5770 r = -EBUSY; 5771 else 5772 kvm->arch.bsp_vcpu_id = arg; 5773 mutex_unlock(&kvm->lock); 5774 break; 5775 #ifdef CONFIG_KVM_XEN 5776 case KVM_XEN_HVM_CONFIG: { 5777 struct kvm_xen_hvm_config xhc; 
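/* Hand the hypercall-page/MSR configuration over to the Xen emulation code. */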
5778 r = -EFAULT; 5779 if (copy_from_user(&xhc, argp, sizeof(xhc))) 5780 goto out; 5781 r = kvm_xen_hvm_config(kvm, &xhc); 5782 break; 5783 } 5784 case KVM_XEN_HVM_GET_ATTR: { 5785 struct kvm_xen_hvm_attr xha; 5786 5787 r = -EFAULT; 5788 if (copy_from_user(&xha, argp, sizeof(xha))) 5789 goto out; 5790 r = kvm_xen_hvm_get_attr(kvm, &xha); 5791 if (!r && copy_to_user(argp, &xha, sizeof(xha))) 5792 r = -EFAULT; 5793 break; 5794 } 5795 case KVM_XEN_HVM_SET_ATTR: { 5796 struct kvm_xen_hvm_attr xha; 5797 5798 r = -EFAULT; 5799 if (copy_from_user(&xha, argp, sizeof(xha))) 5800 goto out; 5801 r = kvm_xen_hvm_set_attr(kvm, &xha); 5802 break; 5803 } 5804 #endif 5805 case KVM_SET_CLOCK: { 5806 struct kvm_arch *ka = &kvm->arch; 5807 struct kvm_clock_data user_ns; 5808 u64 now_ns; 5809 5810 r = -EFAULT; 5811 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 5812 goto out; 5813 5814 r = -EINVAL; 5815 if (user_ns.flags) 5816 goto out; 5817 5818 r = 0; 5819 /* 5820 * TODO: userspace has to take care of races with VCPU_RUN, so 5821 * kvm_gen_update_masterclock() can be cut down to locked 5822 * pvclock_update_vm_gtod_copy(). 5823 */ 5824 kvm_gen_update_masterclock(kvm); 5825 5826 /* 5827 * This pairs with kvm_guest_time_update(): when masterclock is 5828 * in use, we use master_kernel_ns + kvmclock_offset to set 5829 * unsigned 'system_time' so if we use get_kvmclock_ns() (which 5830 * is slightly ahead) here we risk going negative on unsigned 5831 * 'system_time' when 'user_ns.clock' is very small. 5832 */ 5833 spin_lock_irq(&ka->pvclock_gtod_sync_lock); 5834 if (kvm->arch.use_master_clock) 5835 now_ns = ka->master_kernel_ns; 5836 else 5837 now_ns = get_kvmclock_base_ns(); 5838 ka->kvmclock_offset = user_ns.clock - now_ns; 5839 spin_unlock_irq(&ka->pvclock_gtod_sync_lock); 5840 5841 kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); 5842 break; 5843 } 5844 case KVM_GET_CLOCK: { 5845 struct kvm_clock_data user_ns; 5846 u64 now_ns; 5847 5848 now_ns = get_kvmclock_ns(kvm); 5849 user_ns.clock = now_ns; 5850 user_ns.flags = kvm->arch.use_master_clock ? 
KVM_CLOCK_TSC_STABLE : 0;
5851 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
5852
5853 r = -EFAULT;
5854 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
5855 goto out;
5856 r = 0;
5857 break;
5858 }
5859 case KVM_MEMORY_ENCRYPT_OP: {
5860 r = -ENOTTY;
5861 if (kvm_x86_ops.mem_enc_op)
5862 r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
5863 break;
5864 }
5865 case KVM_MEMORY_ENCRYPT_REG_REGION: {
5866 struct kvm_enc_region region;
5867
5868 r = -EFAULT;
5869 if (copy_from_user(&region, argp, sizeof(region)))
5870 goto out;
5871
5872 r = -ENOTTY;
5873 if (kvm_x86_ops.mem_enc_reg_region)
5874 r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
5875 break;
5876 }
5877 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
5878 struct kvm_enc_region region;
5879
5880 r = -EFAULT;
5881 if (copy_from_user(&region, argp, sizeof(region)))
5882 goto out;
5883
5884 r = -ENOTTY;
5885 if (kvm_x86_ops.mem_enc_unreg_region)
5886 r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
5887 break;
5888 }
5889 case KVM_HYPERV_EVENTFD: {
5890 struct kvm_hyperv_eventfd hvevfd;
5891
5892 r = -EFAULT;
5893 if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
5894 goto out;
5895 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
5896 break;
5897 }
5898 case KVM_SET_PMU_EVENT_FILTER:
5899 r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
5900 break;
5901 case KVM_X86_SET_MSR_FILTER:
5902 r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
5903 break;
5904 default:
5905 r = -ENOTTY;
5906 }
5907 out:
5908 return r;
5909 }
5910
5911 static void kvm_init_msr_list(void)
5912 {
5913 struct x86_pmu_capability x86_pmu;
5914 u32 dummy[2];
5915 unsigned i;
5916
5917 BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5918 "Please update the fixed PMCs in msrs_to_save_all[]");
5919
5920 perf_get_x86_pmu_capability(&x86_pmu);
5921
5922 num_msrs_to_save = 0;
5923 num_emulated_msrs = 0;
5924 num_msr_based_features = 0;
5925
5926 for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5927 if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
5928 continue;
5929
5930 /*
5931 * Even MSRs that are valid in the host may not be exposed
5932 * to the guests in some cases.
5933 */
5934 switch (msrs_to_save_all[i]) {
5935 case MSR_IA32_BNDCFGS:
5936 if (!kvm_mpx_supported())
5937 continue;
5938 break;
5939 case MSR_TSC_AUX:
5940 if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5941 continue;
5942 break;
5943 case MSR_IA32_UMWAIT_CONTROL:
5944 if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5945 continue;
5946 break;
5947 case MSR_IA32_RTIT_CTL:
5948 case MSR_IA32_RTIT_STATUS:
5949 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5950 continue;
5951 break;
5952 case MSR_IA32_RTIT_CR3_MATCH:
5953 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5954 !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5955 continue;
5956 break;
5957 case MSR_IA32_RTIT_OUTPUT_BASE:
5958 case MSR_IA32_RTIT_OUTPUT_MASK:
5959 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5960 (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5961 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5962 continue;
5963 break;
5964 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
5965 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5966 msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
5967 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5968 continue;
5969 break;
5970 case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
5971 if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
5972 min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5973 continue;
5974 break;
5975 case MSR_ARCH_PERFMON_EVENTSEL0 ...
MSR_ARCH_PERFMON_EVENTSEL0 + 17: 5976 if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= 5977 min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) 5978 continue; 5979 break; 5980 default: 5981 break; 5982 } 5983 5984 msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; 5985 } 5986 5987 for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { 5988 if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) 5989 continue; 5990 5991 emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; 5992 } 5993 5994 for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { 5995 struct kvm_msr_entry msr; 5996 5997 msr.index = msr_based_features_all[i]; 5998 if (kvm_get_msr_feature(&msr)) 5999 continue; 6000 6001 msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; 6002 } 6003 } 6004 6005 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 6006 const void *v) 6007 { 6008 int handled = 0; 6009 int n; 6010 6011 do { 6012 n = min(len, 8); 6013 if (!(lapic_in_kernel(vcpu) && 6014 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) 6015 && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) 6016 break; 6017 handled += n; 6018 addr += n; 6019 len -= n; 6020 v += n; 6021 } while (len); 6022 6023 return handled; 6024 } 6025 6026 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 6027 { 6028 int handled = 0; 6029 int n; 6030 6031 do { 6032 n = min(len, 8); 6033 if (!(lapic_in_kernel(vcpu) && 6034 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, 6035 addr, n, v)) 6036 && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) 6037 break; 6038 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); 6039 handled += n; 6040 addr += n; 6041 len -= n; 6042 v += n; 6043 } while (len); 6044 6045 return handled; 6046 } 6047 6048 static void kvm_set_segment(struct kvm_vcpu *vcpu, 6049 struct kvm_segment *var, int seg) 6050 { 6051 static_call(kvm_x86_set_segment)(vcpu, var, seg); 6052 } 6053 6054 void kvm_get_segment(struct kvm_vcpu *vcpu, 6055 struct kvm_segment *var, int seg) 6056 { 6057 static_call(kvm_x86_get_segment)(vcpu, var, seg); 6058 } 6059 6060 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, 6061 struct x86_exception *exception) 6062 { 6063 gpa_t t_gpa; 6064 6065 BUG_ON(!mmu_is_nested(vcpu)); 6066 6067 /* NPT walks are always user-walks */ 6068 access |= PFERR_USER_MASK; 6069 t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); 6070 6071 return t_gpa; 6072 } 6073 6074 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, 6075 struct x86_exception *exception) 6076 { 6077 u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; 6078 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 6079 } 6080 EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); 6081 6082 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, 6083 struct x86_exception *exception) 6084 { 6085 u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; 6086 access |= PFERR_FETCH_MASK; 6087 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 6088 } 6089 6090 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, 6091 struct x86_exception *exception) 6092 { 6093 u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? 
PFERR_USER_MASK : 0; 6094 access |= PFERR_WRITE_MASK; 6095 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 6096 } 6097 EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); 6098 6099 /* uses this to access any guest's mapped memory without checking CPL */ 6100 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, 6101 struct x86_exception *exception) 6102 { 6103 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); 6104 } 6105 6106 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 6107 struct kvm_vcpu *vcpu, u32 access, 6108 struct x86_exception *exception) 6109 { 6110 void *data = val; 6111 int r = X86EMUL_CONTINUE; 6112 6113 while (bytes) { 6114 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 6115 exception); 6116 unsigned offset = addr & (PAGE_SIZE-1); 6117 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 6118 int ret; 6119 6120 if (gpa == UNMAPPED_GVA) 6121 return X86EMUL_PROPAGATE_FAULT; 6122 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, 6123 offset, toread); 6124 if (ret < 0) { 6125 r = X86EMUL_IO_NEEDED; 6126 goto out; 6127 } 6128 6129 bytes -= toread; 6130 data += toread; 6131 addr += toread; 6132 } 6133 out: 6134 return r; 6135 } 6136 6137 /* used for instruction fetching */ 6138 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, 6139 gva_t addr, void *val, unsigned int bytes, 6140 struct x86_exception *exception) 6141 { 6142 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6143 u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; 6144 unsigned offset; 6145 int ret; 6146 6147 /* Inline kvm_read_guest_virt_helper for speed. */ 6148 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, 6149 exception); 6150 if (unlikely(gpa == UNMAPPED_GVA)) 6151 return X86EMUL_PROPAGATE_FAULT; 6152 6153 offset = addr & (PAGE_SIZE-1); 6154 if (WARN_ON(offset + bytes > PAGE_SIZE)) 6155 bytes = (unsigned)PAGE_SIZE - offset; 6156 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val, 6157 offset, bytes); 6158 if (unlikely(ret < 0)) 6159 return X86EMUL_IO_NEEDED; 6160 6161 return X86EMUL_CONTINUE; 6162 } 6163 6164 int kvm_read_guest_virt(struct kvm_vcpu *vcpu, 6165 gva_t addr, void *val, unsigned int bytes, 6166 struct x86_exception *exception) 6167 { 6168 u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; 6169 6170 /* 6171 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED 6172 * is returned, but our callers are not ready for that and they blindly 6173 * call kvm_inject_page_fault. Ensure that they at least do not leak 6174 * uninitialized kernel stack memory into cr2 and error code. 
6175 */ 6176 memset(exception, 0, sizeof(*exception)); 6177 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 6178 exception); 6179 } 6180 EXPORT_SYMBOL_GPL(kvm_read_guest_virt); 6181 6182 static int emulator_read_std(struct x86_emulate_ctxt *ctxt, 6183 gva_t addr, void *val, unsigned int bytes, 6184 struct x86_exception *exception, bool system) 6185 { 6186 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6187 u32 access = 0; 6188 6189 if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) 6190 access |= PFERR_USER_MASK; 6191 6192 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); 6193 } 6194 6195 static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, 6196 unsigned long addr, void *val, unsigned int bytes) 6197 { 6198 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6199 int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); 6200 6201 return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; 6202 } 6203 6204 static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 6205 struct kvm_vcpu *vcpu, u32 access, 6206 struct x86_exception *exception) 6207 { 6208 void *data = val; 6209 int r = X86EMUL_CONTINUE; 6210 6211 while (bytes) { 6212 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 6213 access, 6214 exception); 6215 unsigned offset = addr & (PAGE_SIZE-1); 6216 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 6217 int ret; 6218 6219 if (gpa == UNMAPPED_GVA) 6220 return X86EMUL_PROPAGATE_FAULT; 6221 ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); 6222 if (ret < 0) { 6223 r = X86EMUL_IO_NEEDED; 6224 goto out; 6225 } 6226 6227 bytes -= towrite; 6228 data += towrite; 6229 addr += towrite; 6230 } 6231 out: 6232 return r; 6233 } 6234 6235 static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, 6236 unsigned int bytes, struct x86_exception *exception, 6237 bool system) 6238 { 6239 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6240 u32 access = PFERR_WRITE_MASK; 6241 6242 if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) 6243 access |= PFERR_USER_MASK; 6244 6245 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 6246 access, exception); 6247 } 6248 6249 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, 6250 unsigned int bytes, struct x86_exception *exception) 6251 { 6252 /* kvm_write_guest_virt_system can pull in tons of pages. 
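 * Make sure the L1TF mitigation (when it is active) gets to flush the
 * L1D cache before the guest runs again; that is what the
 * l1tf_flush_l1d hint below requests.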
*/ 6253 vcpu->arch.l1tf_flush_l1d = true; 6254 6255 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 6256 PFERR_WRITE_MASK, exception); 6257 } 6258 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); 6259 6260 int handle_ud(struct kvm_vcpu *vcpu) 6261 { 6262 static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; 6263 int emul_type = EMULTYPE_TRAP_UD; 6264 char sig[5]; /* ud2; .ascii "kvm" */ 6265 struct x86_exception e; 6266 6267 if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0))) 6268 return 1; 6269 6270 if (force_emulation_prefix && 6271 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), 6272 sig, sizeof(sig), &e) == 0 && 6273 memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { 6274 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); 6275 emul_type = EMULTYPE_TRAP_UD_FORCED; 6276 } 6277 6278 return kvm_emulate_instruction(vcpu, emul_type); 6279 } 6280 EXPORT_SYMBOL_GPL(handle_ud); 6281 6282 static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 6283 gpa_t gpa, bool write) 6284 { 6285 /* For APIC access vmexit */ 6286 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 6287 return 1; 6288 6289 if (vcpu_match_mmio_gpa(vcpu, gpa)) { 6290 trace_vcpu_match_mmio(gva, gpa, write, true); 6291 return 1; 6292 } 6293 6294 return 0; 6295 } 6296 6297 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 6298 gpa_t *gpa, struct x86_exception *exception, 6299 bool write) 6300 { 6301 u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) 6302 | (write ? PFERR_WRITE_MASK : 0); 6303 6304 /* 6305 * currently PKRU is only applied to ept enabled guest so 6306 * there is no pkey in EPT page table for L1 guest or EPT 6307 * shadow page table for L2 guest. 6308 */ 6309 if (vcpu_match_mmio_gva(vcpu, gva) 6310 && !permission_fault(vcpu, vcpu->arch.walk_mmu, 6311 vcpu->arch.mmio_access, 0, access)) { 6312 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 6313 (gva & (PAGE_SIZE - 1)); 6314 trace_vcpu_match_mmio(gva, *gpa, write, false); 6315 return 1; 6316 } 6317 6318 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 6319 6320 if (*gpa == UNMAPPED_GVA) 6321 return -1; 6322 6323 return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); 6324 } 6325 6326 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 6327 const void *val, int bytes) 6328 { 6329 int ret; 6330 6331 ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); 6332 if (ret < 0) 6333 return 0; 6334 kvm_page_track_write(vcpu, gpa, val, bytes); 6335 return 1; 6336 } 6337 6338 struct read_write_emulator_ops { 6339 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, 6340 int bytes); 6341 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, 6342 void *val, int bytes); 6343 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, 6344 int bytes, void *val); 6345 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, 6346 void *val, int bytes); 6347 bool write; 6348 }; 6349 6350 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) 6351 { 6352 if (vcpu->mmio_read_completed) { 6353 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 6354 vcpu->mmio_fragments[0].gpa, val); 6355 vcpu->mmio_read_completed = 0; 6356 return 1; 6357 } 6358 6359 return 0; 6360 } 6361 6362 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 6363 void *val, int bytes) 6364 { 6365 return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes); 6366 } 6367 6368 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 6369 void *val, int bytes) 6370 { 6371 return 
emulator_write_phys(vcpu, gpa, val, bytes); 6372 } 6373 6374 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) 6375 { 6376 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); 6377 return vcpu_mmio_write(vcpu, gpa, bytes, val); 6378 } 6379 6380 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 6381 void *val, int bytes) 6382 { 6383 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); 6384 return X86EMUL_IO_NEEDED; 6385 } 6386 6387 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 6388 void *val, int bytes) 6389 { 6390 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; 6391 6392 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); 6393 return X86EMUL_CONTINUE; 6394 } 6395 6396 static const struct read_write_emulator_ops read_emultor = { 6397 .read_write_prepare = read_prepare, 6398 .read_write_emulate = read_emulate, 6399 .read_write_mmio = vcpu_mmio_read, 6400 .read_write_exit_mmio = read_exit_mmio, 6401 }; 6402 6403 static const struct read_write_emulator_ops write_emultor = { 6404 .read_write_emulate = write_emulate, 6405 .read_write_mmio = write_mmio, 6406 .read_write_exit_mmio = write_exit_mmio, 6407 .write = true, 6408 }; 6409 6410 static int emulator_read_write_onepage(unsigned long addr, void *val, 6411 unsigned int bytes, 6412 struct x86_exception *exception, 6413 struct kvm_vcpu *vcpu, 6414 const struct read_write_emulator_ops *ops) 6415 { 6416 gpa_t gpa; 6417 int handled, ret; 6418 bool write = ops->write; 6419 struct kvm_mmio_fragment *frag; 6420 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 6421 6422 /* 6423 * If the exit was due to a NPF we may already have a GPA. 6424 * If the GPA is present, use it to avoid the GVA to GPA table walk. 6425 * Note, this cannot be used on string operations since string 6426 * operation using rep will only have the initial GPA from the NPF 6427 * occurred. 6428 */ 6429 if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && 6430 (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { 6431 gpa = ctxt->gpa_val; 6432 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); 6433 } else { 6434 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 6435 if (ret < 0) 6436 return X86EMUL_PROPAGATE_FAULT; 6437 } 6438 6439 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes)) 6440 return X86EMUL_CONTINUE; 6441 6442 /* 6443 * Is this MMIO handled locally? 6444 */ 6445 handled = ops->read_write_mmio(vcpu, gpa, bytes, val); 6446 if (handled == bytes) 6447 return X86EMUL_CONTINUE; 6448 6449 gpa += handled; 6450 bytes -= handled; 6451 val += handled; 6452 6453 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); 6454 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; 6455 frag->gpa = gpa; 6456 frag->data = val; 6457 frag->len = bytes; 6458 return X86EMUL_CONTINUE; 6459 } 6460 6461 static int emulator_read_write(struct x86_emulate_ctxt *ctxt, 6462 unsigned long addr, 6463 void *val, unsigned int bytes, 6464 struct x86_exception *exception, 6465 const struct read_write_emulator_ops *ops) 6466 { 6467 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6468 gpa_t gpa; 6469 int rc; 6470 6471 if (ops->read_write_prepare && 6472 ops->read_write_prepare(vcpu, val, bytes)) 6473 return X86EMUL_CONTINUE; 6474 6475 vcpu->mmio_nr_fragments = 0; 6476 6477 /* Crossing a page boundary? 
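 * If so, split the access at the boundary: 'now = -addr & ~PAGE_MASK' is
 * the number of bytes left on the current page. For example, with 4KiB
 * pages, addr = 0x1ffe and bytes = 4 gives now = 2, so the first call
 * below emulates 2 bytes at 0x1ffe and the second the remaining 2 bytes
 * at 0x2000.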
*/ 6478 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 6479 int now; 6480 6481 now = -addr & ~PAGE_MASK; 6482 rc = emulator_read_write_onepage(addr, val, now, exception, 6483 vcpu, ops); 6484 6485 if (rc != X86EMUL_CONTINUE) 6486 return rc; 6487 addr += now; 6488 if (ctxt->mode != X86EMUL_MODE_PROT64) 6489 addr = (u32)addr; 6490 val += now; 6491 bytes -= now; 6492 } 6493 6494 rc = emulator_read_write_onepage(addr, val, bytes, exception, 6495 vcpu, ops); 6496 if (rc != X86EMUL_CONTINUE) 6497 return rc; 6498 6499 if (!vcpu->mmio_nr_fragments) 6500 return rc; 6501 6502 gpa = vcpu->mmio_fragments[0].gpa; 6503 6504 vcpu->mmio_needed = 1; 6505 vcpu->mmio_cur_fragment = 0; 6506 6507 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); 6508 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; 6509 vcpu->run->exit_reason = KVM_EXIT_MMIO; 6510 vcpu->run->mmio.phys_addr = gpa; 6511 6512 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); 6513 } 6514 6515 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 6516 unsigned long addr, 6517 void *val, 6518 unsigned int bytes, 6519 struct x86_exception *exception) 6520 { 6521 return emulator_read_write(ctxt, addr, val, bytes, 6522 exception, &read_emultor); 6523 } 6524 6525 static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, 6526 unsigned long addr, 6527 const void *val, 6528 unsigned int bytes, 6529 struct x86_exception *exception) 6530 { 6531 return emulator_read_write(ctxt, addr, (void *)val, bytes, 6532 exception, &write_emultor); 6533 } 6534 6535 #define CMPXCHG_TYPE(t, ptr, old, new) \ 6536 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 6537 6538 #ifdef CONFIG_X86_64 6539 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) 6540 #else 6541 # define CMPXCHG64(ptr, old, new) \ 6542 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 6543 #endif 6544 6545 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, 6546 unsigned long addr, 6547 const void *old, 6548 const void *new, 6549 unsigned int bytes, 6550 struct x86_exception *exception) 6551 { 6552 struct kvm_host_map map; 6553 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6554 u64 page_line_mask; 6555 gpa_t gpa; 6556 char *kaddr; 6557 bool exchanged; 6558 6559 /* guests cmpxchg8b have to be emulated atomically */ 6560 if (bytes > 8 || (bytes & (bytes - 1))) 6561 goto emul_write; 6562 6563 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 6564 6565 if (gpa == UNMAPPED_GVA || 6566 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 6567 goto emul_write; 6568 6569 /* 6570 * Emulate the atomic as a straight write to avoid #AC if SLD is 6571 * enabled in the host and the access splits a cache line. 
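 * (With split-lock detection present, page_line_mask below rejects any
 * cmpxchg that straddles a cache line; without it, only page-crossing
 * accesses are rejected. Either way the fallback is emul_write, i.e.
 * the exchange is emulated as a plain write.)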
6572 */ 6573 if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 6574 page_line_mask = ~(cache_line_size() - 1); 6575 else 6576 page_line_mask = PAGE_MASK; 6577 6578 if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) 6579 goto emul_write; 6580 6581 if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) 6582 goto emul_write; 6583 6584 kaddr = map.hva + offset_in_page(gpa); 6585 6586 switch (bytes) { 6587 case 1: 6588 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); 6589 break; 6590 case 2: 6591 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); 6592 break; 6593 case 4: 6594 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); 6595 break; 6596 case 8: 6597 exchanged = CMPXCHG64(kaddr, old, new); 6598 break; 6599 default: 6600 BUG(); 6601 } 6602 6603 kvm_vcpu_unmap(vcpu, &map, true); 6604 6605 if (!exchanged) 6606 return X86EMUL_CMPXCHG_FAILED; 6607 6608 kvm_page_track_write(vcpu, gpa, new, bytes); 6609 6610 return X86EMUL_CONTINUE; 6611 6612 emul_write: 6613 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 6614 6615 return emulator_write_emulated(ctxt, addr, new, bytes, exception); 6616 } 6617 6618 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 6619 { 6620 int r = 0, i; 6621 6622 for (i = 0; i < vcpu->arch.pio.count; i++) { 6623 if (vcpu->arch.pio.in) 6624 r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, 6625 vcpu->arch.pio.size, pd); 6626 else 6627 r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, 6628 vcpu->arch.pio.port, vcpu->arch.pio.size, 6629 pd); 6630 if (r) 6631 break; 6632 pd += vcpu->arch.pio.size; 6633 } 6634 return r; 6635 } 6636 6637 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, 6638 unsigned short port, void *val, 6639 unsigned int count, bool in) 6640 { 6641 vcpu->arch.pio.port = port; 6642 vcpu->arch.pio.in = in; 6643 vcpu->arch.pio.count = count; 6644 vcpu->arch.pio.size = size; 6645 6646 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 6647 vcpu->arch.pio.count = 0; 6648 return 1; 6649 } 6650 6651 vcpu->run->exit_reason = KVM_EXIT_IO; 6652 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 6653 vcpu->run->io.size = size; 6654 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 6655 vcpu->run->io.count = count; 6656 vcpu->run->io.port = port; 6657 6658 return 0; 6659 } 6660 6661 static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, 6662 unsigned short port, void *val, unsigned int count) 6663 { 6664 int ret; 6665 6666 if (vcpu->arch.pio.count) 6667 goto data_avail; 6668 6669 memset(vcpu->arch.pio_data, 0, size * count); 6670 6671 ret = emulator_pio_in_out(vcpu, size, port, val, count, true); 6672 if (ret) { 6673 data_avail: 6674 memcpy(val, vcpu->arch.pio_data, size * count); 6675 trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); 6676 vcpu->arch.pio.count = 0; 6677 return 1; 6678 } 6679 6680 return 0; 6681 } 6682 6683 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, 6684 int size, unsigned short port, void *val, 6685 unsigned int count) 6686 { 6687 return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); 6688 6689 } 6690 6691 static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, 6692 unsigned short port, const void *val, 6693 unsigned int count) 6694 { 6695 memcpy(vcpu->arch.pio_data, val, size * count); 6696 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); 6697 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); 6698 } 6699 6700 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, 6701 int size, unsigned short port, 6702 const void *val, unsigned int count) 6703 { 6704 return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); 6705 } 6706 6707 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 6708 { 6709 return static_call(kvm_x86_get_segment_base)(vcpu, seg); 6710 } 6711 6712 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) 6713 { 6714 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); 6715 } 6716 6717 static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) 6718 { 6719 if (!need_emulate_wbinvd(vcpu)) 6720 return X86EMUL_CONTINUE; 6721 6722 if (static_call(kvm_x86_has_wbinvd_exit)()) { 6723 int cpu = get_cpu(); 6724 6725 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 6726 on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, 6727 wbinvd_ipi, NULL, 1); 6728 put_cpu(); 6729 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 6730 } else 6731 wbinvd(); 6732 return X86EMUL_CONTINUE; 6733 } 6734 6735 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 6736 { 6737 kvm_emulate_wbinvd_noskip(vcpu); 6738 return kvm_skip_emulated_instruction(vcpu); 6739 } 6740 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 6741 6742 6743 6744 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) 6745 { 6746 kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); 6747 } 6748 6749 static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 6750 unsigned long *dest) 6751 { 6752 kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); 6753 } 6754 6755 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, 6756 unsigned long value) 6757 { 6758 6759 return kvm_set_dr(emul_to_vcpu(ctxt), dr, value); 6760 } 6761 6762 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 6763 { 6764 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 6765 } 6766 6767 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) 6768 { 6769 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6770 unsigned long value; 6771 6772 switch (cr) { 6773 case 0: 6774 value = kvm_read_cr0(vcpu); 6775 break; 6776 case 2: 6777 value = vcpu->arch.cr2; 6778 
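/* CR2 is read straight from the cached vcpu->arch.cr2 rather than via a
 * kvm_read_cr*() helper; emulator_set_cr() below writes it the same way. */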
break; 6779 case 3: 6780 value = kvm_read_cr3(vcpu); 6781 break; 6782 case 4: 6783 value = kvm_read_cr4(vcpu); 6784 break; 6785 case 8: 6786 value = kvm_get_cr8(vcpu); 6787 break; 6788 default: 6789 kvm_err("%s: unexpected cr %u\n", __func__, cr); 6790 return 0; 6791 } 6792 6793 return value; 6794 } 6795 6796 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) 6797 { 6798 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6799 int res = 0; 6800 6801 switch (cr) { 6802 case 0: 6803 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 6804 break; 6805 case 2: 6806 vcpu->arch.cr2 = val; 6807 break; 6808 case 3: 6809 res = kvm_set_cr3(vcpu, val); 6810 break; 6811 case 4: 6812 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 6813 break; 6814 case 8: 6815 res = kvm_set_cr8(vcpu, val); 6816 break; 6817 default: 6818 kvm_err("%s: unexpected cr %u\n", __func__, cr); 6819 res = -1; 6820 } 6821 6822 return res; 6823 } 6824 6825 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 6826 { 6827 return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); 6828 } 6829 6830 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 6831 { 6832 static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); 6833 } 6834 6835 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 6836 { 6837 static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); 6838 } 6839 6840 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 6841 { 6842 static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); 6843 } 6844 6845 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) 6846 { 6847 static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); 6848 } 6849 6850 static unsigned long emulator_get_cached_segment_base( 6851 struct x86_emulate_ctxt *ctxt, int seg) 6852 { 6853 return get_segment_base(emul_to_vcpu(ctxt), seg); 6854 } 6855 6856 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, 6857 struct desc_struct *desc, u32 *base3, 6858 int seg) 6859 { 6860 struct kvm_segment var; 6861 6862 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); 6863 *selector = var.selector; 6864 6865 if (var.unusable) { 6866 memset(desc, 0, sizeof(*desc)); 6867 if (base3) 6868 *base3 = 0; 6869 return false; 6870 } 6871 6872 if (var.g) 6873 var.limit >>= 12; 6874 set_desc_limit(desc, var.limit); 6875 set_desc_base(desc, (unsigned long)var.base); 6876 #ifdef CONFIG_X86_64 6877 if (base3) 6878 *base3 = var.base >> 32; 6879 #endif 6880 desc->type = var.type; 6881 desc->s = var.s; 6882 desc->dpl = var.dpl; 6883 desc->p = var.present; 6884 desc->avl = var.avl; 6885 desc->l = var.l; 6886 desc->d = var.db; 6887 desc->g = var.g; 6888 6889 return true; 6890 } 6891 6892 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, 6893 struct desc_struct *desc, u32 base3, 6894 int seg) 6895 { 6896 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6897 struct kvm_segment var; 6898 6899 var.selector = selector; 6900 var.base = get_desc_base(desc); 6901 #ifdef CONFIG_X86_64 6902 var.base |= ((u64)base3) << 32; 6903 #endif 6904 var.limit = get_desc_limit(desc); 6905 if (desc->g) 6906 var.limit = (var.limit << 12) | 0xfff; 6907 var.type = desc->type; 6908 var.dpl = desc->dpl; 6909 var.db = desc->d; 6910 var.s = desc->s; 6911 var.l = desc->l; 6912 var.g = desc->g; 6913 var.avl = desc->avl; 6914 var.present = desc->p; 6915 var.unusable = !var.present; 6916 var.padding = 0; 6917 6918 kvm_set_segment(vcpu, &var, seg); 
6919 return; 6920 } 6921 6922 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 6923 u32 msr_index, u64 *pdata) 6924 { 6925 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6926 int r; 6927 6928 r = kvm_get_msr(vcpu, msr_index, pdata); 6929 6930 if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { 6931 /* Bounce to user space */ 6932 return X86EMUL_IO_NEEDED; 6933 } 6934 6935 return r; 6936 } 6937 6938 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 6939 u32 msr_index, u64 data) 6940 { 6941 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6942 int r; 6943 6944 r = kvm_set_msr(vcpu, msr_index, data); 6945 6946 if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { 6947 /* Bounce to user space */ 6948 return X86EMUL_IO_NEEDED; 6949 } 6950 6951 return r; 6952 } 6953 6954 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) 6955 { 6956 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6957 6958 return vcpu->arch.smbase; 6959 } 6960 6961 static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) 6962 { 6963 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 6964 6965 vcpu->arch.smbase = smbase; 6966 } 6967 6968 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, 6969 u32 pmc) 6970 { 6971 return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); 6972 } 6973 6974 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 6975 u32 pmc, u64 *pdata) 6976 { 6977 return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); 6978 } 6979 6980 static void emulator_halt(struct x86_emulate_ctxt *ctxt) 6981 { 6982 emul_to_vcpu(ctxt)->arch.halt_request = 1; 6983 } 6984 6985 static int emulator_intercept(struct x86_emulate_ctxt *ctxt, 6986 struct x86_instruction_info *info, 6987 enum x86_intercept_stage stage) 6988 { 6989 return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, 6990 &ctxt->exception); 6991 } 6992 6993 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 6994 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, 6995 bool exact_only) 6996 { 6997 return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); 6998 } 6999 7000 static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) 7001 { 7002 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); 7003 } 7004 7005 static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) 7006 { 7007 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); 7008 } 7009 7010 static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) 7011 { 7012 return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); 7013 } 7014 7015 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) 7016 { 7017 return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); 7018 } 7019 7020 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) 7021 { 7022 kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val); 7023 } 7024 7025 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) 7026 { 7027 static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); 7028 } 7029 7030 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) 7031 { 7032 return emul_to_vcpu(ctxt)->arch.hflags; 7033 } 7034 7035 static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) 7036 { 7037 emul_to_vcpu(ctxt)->arch.hflags = emul_flags; 7038 } 7039 7040 static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, 7041 const char *smstate) 7042 { 7043 return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), 
smstate); 7044 } 7045 7046 static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) 7047 { 7048 kvm_smm_changed(emul_to_vcpu(ctxt)); 7049 } 7050 7051 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) 7052 { 7053 return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); 7054 } 7055 7056 static const struct x86_emulate_ops emulate_ops = { 7057 .read_gpr = emulator_read_gpr, 7058 .write_gpr = emulator_write_gpr, 7059 .read_std = emulator_read_std, 7060 .write_std = emulator_write_std, 7061 .read_phys = kvm_read_guest_phys_system, 7062 .fetch = kvm_fetch_guest_virt, 7063 .read_emulated = emulator_read_emulated, 7064 .write_emulated = emulator_write_emulated, 7065 .cmpxchg_emulated = emulator_cmpxchg_emulated, 7066 .invlpg = emulator_invlpg, 7067 .pio_in_emulated = emulator_pio_in_emulated, 7068 .pio_out_emulated = emulator_pio_out_emulated, 7069 .get_segment = emulator_get_segment, 7070 .set_segment = emulator_set_segment, 7071 .get_cached_segment_base = emulator_get_cached_segment_base, 7072 .get_gdt = emulator_get_gdt, 7073 .get_idt = emulator_get_idt, 7074 .set_gdt = emulator_set_gdt, 7075 .set_idt = emulator_set_idt, 7076 .get_cr = emulator_get_cr, 7077 .set_cr = emulator_set_cr, 7078 .cpl = emulator_get_cpl, 7079 .get_dr = emulator_get_dr, 7080 .set_dr = emulator_set_dr, 7081 .get_smbase = emulator_get_smbase, 7082 .set_smbase = emulator_set_smbase, 7083 .set_msr = emulator_set_msr, 7084 .get_msr = emulator_get_msr, 7085 .check_pmc = emulator_check_pmc, 7086 .read_pmc = emulator_read_pmc, 7087 .halt = emulator_halt, 7088 .wbinvd = emulator_wbinvd, 7089 .fix_hypercall = emulator_fix_hypercall, 7090 .intercept = emulator_intercept, 7091 .get_cpuid = emulator_get_cpuid, 7092 .guest_has_long_mode = emulator_guest_has_long_mode, 7093 .guest_has_movbe = emulator_guest_has_movbe, 7094 .guest_has_fxsr = emulator_guest_has_fxsr, 7095 .set_nmi_mask = emulator_set_nmi_mask, 7096 .get_hflags = emulator_get_hflags, 7097 .set_hflags = emulator_set_hflags, 7098 .pre_leave_smm = emulator_pre_leave_smm, 7099 .post_leave_smm = emulator_post_leave_smm, 7100 .set_xcr = emulator_set_xcr, 7101 }; 7102 7103 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 7104 { 7105 u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); 7106 /* 7107 * an sti; sti; sequence only disable interrupts for the first 7108 * instruction. So, if the last instruction, be it emulated or 7109 * not, left the system with the INT_STI flag enabled, it 7110 * means that the last instruction is an sti. We should not 7111 * leave the flag on in this case. 
The same goes for mov ss 7112 */ 7113 if (int_shadow & mask) 7114 mask = 0; 7115 if (unlikely(int_shadow || mask)) { 7116 static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); 7117 if (!mask) 7118 kvm_make_request(KVM_REQ_EVENT, vcpu); 7119 } 7120 } 7121 7122 static bool inject_emulated_exception(struct kvm_vcpu *vcpu) 7123 { 7124 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 7125 if (ctxt->exception.vector == PF_VECTOR) 7126 return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); 7127 7128 if (ctxt->exception.error_code_valid) 7129 kvm_queue_exception_e(vcpu, ctxt->exception.vector, 7130 ctxt->exception.error_code); 7131 else 7132 kvm_queue_exception(vcpu, ctxt->exception.vector); 7133 return false; 7134 } 7135 7136 static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) 7137 { 7138 struct x86_emulate_ctxt *ctxt; 7139 7140 ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); 7141 if (!ctxt) { 7142 pr_err("kvm: failed to allocate vcpu's emulator\n"); 7143 return NULL; 7144 } 7145 7146 ctxt->vcpu = vcpu; 7147 ctxt->ops = &emulate_ops; 7148 vcpu->arch.emulate_ctxt = ctxt; 7149 7150 return ctxt; 7151 } 7152 7153 static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 7154 { 7155 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 7156 int cs_db, cs_l; 7157 7158 static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); 7159 7160 ctxt->gpa_available = false; 7161 ctxt->eflags = kvm_get_rflags(vcpu); 7162 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; 7163 7164 ctxt->eip = kvm_rip_read(vcpu); 7165 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 7166 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : 7167 (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : 7168 cs_db ? X86EMUL_MODE_PROT32 : 7169 X86EMUL_MODE_PROT16; 7170 BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); 7171 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); 7172 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); 7173 7174 init_decode_cache(ctxt); 7175 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 7176 } 7177 7178 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) 7179 { 7180 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 7181 int ret; 7182 7183 init_emulate_ctxt(vcpu); 7184 7185 ctxt->op_bytes = 2; 7186 ctxt->ad_bytes = 2; 7187 ctxt->_eip = ctxt->eip + inc_eip; 7188 ret = emulate_int_real(ctxt, irq); 7189 7190 if (ret != X86EMUL_CONTINUE) { 7191 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 7192 } else { 7193 ctxt->eip = ctxt->_eip; 7194 kvm_rip_write(vcpu, ctxt->eip); 7195 kvm_set_rflags(vcpu, ctxt->eflags); 7196 } 7197 } 7198 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); 7199 7200 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) 7201 { 7202 ++vcpu->stat.insn_emulation_fail; 7203 trace_kvm_emulate_insn_failed(vcpu); 7204 7205 if (emulation_type & EMULTYPE_VMWARE_GP) { 7206 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 7207 return 1; 7208 } 7209 7210 if (emulation_type & EMULTYPE_SKIP) { 7211 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 7212 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 7213 vcpu->run->internal.ndata = 0; 7214 return 0; 7215 } 7216 7217 kvm_queue_exception(vcpu, UD_VECTOR); 7218 7219 if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { 7220 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 7221 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 7222 vcpu->run->internal.ndata = 0; 7223 return 0; 7224 } 7225 
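/* Emulation failed in guest mode or at CPL > 0: the #UD queued above is
 * all the guest gets; resume it instead of reporting an internal error. */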
7226 return 1; 7227 } 7228 7229 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 7230 bool write_fault_to_shadow_pgtable, 7231 int emulation_type) 7232 { 7233 gpa_t gpa = cr2_or_gpa; 7234 kvm_pfn_t pfn; 7235 7236 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 7237 return false; 7238 7239 if (WARN_ON_ONCE(is_guest_mode(vcpu)) || 7240 WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) 7241 return false; 7242 7243 if (!vcpu->arch.mmu->direct_map) { 7244 /* 7245 * Write permission should be allowed since only 7246 * write access need to be emulated. 7247 */ 7248 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 7249 7250 /* 7251 * If the mapping is invalid in guest, let cpu retry 7252 * it to generate fault. 7253 */ 7254 if (gpa == UNMAPPED_GVA) 7255 return true; 7256 } 7257 7258 /* 7259 * Do not retry the unhandleable instruction if it faults on the 7260 * readonly host memory, otherwise it will goto a infinite loop: 7261 * retry instruction -> write #PF -> emulation fail -> retry 7262 * instruction -> ... 7263 */ 7264 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 7265 7266 /* 7267 * If the instruction failed on the error pfn, it can not be fixed, 7268 * report the error to userspace. 7269 */ 7270 if (is_error_noslot_pfn(pfn)) 7271 return false; 7272 7273 kvm_release_pfn_clean(pfn); 7274 7275 /* The instructions are well-emulated on direct mmu. */ 7276 if (vcpu->arch.mmu->direct_map) { 7277 unsigned int indirect_shadow_pages; 7278 7279 write_lock(&vcpu->kvm->mmu_lock); 7280 indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; 7281 write_unlock(&vcpu->kvm->mmu_lock); 7282 7283 if (indirect_shadow_pages) 7284 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 7285 7286 return true; 7287 } 7288 7289 /* 7290 * if emulation was due to access to shadowed page table 7291 * and it failed try to unshadow page and re-enter the 7292 * guest to let CPU execute the instruction. 7293 */ 7294 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 7295 7296 /* 7297 * If the access faults on its page table, it can not 7298 * be fixed by unprotecting shadow page and it should 7299 * be reported to userspace. 7300 */ 7301 return !write_fault_to_shadow_pgtable; 7302 } 7303 7304 static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 7305 gpa_t cr2_or_gpa, int emulation_type) 7306 { 7307 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 7308 unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; 7309 7310 last_retry_eip = vcpu->arch.last_retry_eip; 7311 last_retry_addr = vcpu->arch.last_retry_addr; 7312 7313 /* 7314 * If the emulation is caused by #PF and it is non-page_table 7315 * writing instruction, it means the VM-EXIT is caused by shadow 7316 * page protected, we can zap the shadow page and retry this 7317 * instruction directly. 7318 * 7319 * Note: if the guest uses a non-page-table modifying instruction 7320 * on the PDE that points to the instruction, then we will unmap 7321 * the instruction and go to an infinite loop. So, we cache the 7322 * last retried eip and the last fault address, if we meet the eip 7323 * and the address again, we can break out of the potential infinite 7324 * loop. 
7325 */ 7326 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; 7327 7328 if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) 7329 return false; 7330 7331 if (WARN_ON_ONCE(is_guest_mode(vcpu)) || 7332 WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) 7333 return false; 7334 7335 if (x86_page_table_writing_insn(ctxt)) 7336 return false; 7337 7338 if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) 7339 return false; 7340 7341 vcpu->arch.last_retry_eip = ctxt->eip; 7342 vcpu->arch.last_retry_addr = cr2_or_gpa; 7343 7344 if (!vcpu->arch.mmu->direct_map) 7345 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); 7346 7347 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); 7348 7349 return true; 7350 } 7351 7352 static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 7353 static int complete_emulated_pio(struct kvm_vcpu *vcpu); 7354 7355 static void kvm_smm_changed(struct kvm_vcpu *vcpu) 7356 { 7357 if (!(vcpu->arch.hflags & HF_SMM_MASK)) { 7358 /* This is a good place to trace that we are exiting SMM. */ 7359 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); 7360 7361 /* Process a latched INIT or SMI, if any. */ 7362 kvm_make_request(KVM_REQ_EVENT, vcpu); 7363 } 7364 7365 kvm_mmu_reset_context(vcpu); 7366 } 7367 7368 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, 7369 unsigned long *db) 7370 { 7371 u32 dr6 = 0; 7372 int i; 7373 u32 enable, rwlen; 7374 7375 enable = dr7; 7376 rwlen = dr7 >> 16; 7377 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) 7378 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) 7379 dr6 |= (1 << i); 7380 return dr6; 7381 } 7382 7383 static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) 7384 { 7385 struct kvm_run *kvm_run = vcpu->run; 7386 7387 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 7388 kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW; 7389 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 7390 kvm_run->debug.arch.exception = DB_VECTOR; 7391 kvm_run->exit_reason = KVM_EXIT_DEBUG; 7392 return 0; 7393 } 7394 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); 7395 return 1; 7396 } 7397 7398 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) 7399 { 7400 unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); 7401 int r; 7402 7403 r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); 7404 if (unlikely(!r)) 7405 return 0; 7406 7407 /* 7408 * rflags is the old, "raw" value of the flags. The new value has 7409 * not been saved yet. 7410 * 7411 * This is correct even for TF set by the guest, because "the 7412 * processor will not generate this exception after the instruction 7413 * that sets the TF flag". 
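 * So if TF was set before this instruction, the single-step trap is owed
 * now: kvm_vcpu_do_singlestep() below either queues the #DB or, when
 * userspace is the one single-stepping, turns it into a KVM_EXIT_DEBUG.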
7414 */ 7415 if (unlikely(rflags & X86_EFLAGS_TF)) 7416 r = kvm_vcpu_do_singlestep(vcpu); 7417 return r; 7418 } 7419 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); 7420 7421 static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) 7422 { 7423 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && 7424 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { 7425 struct kvm_run *kvm_run = vcpu->run; 7426 unsigned long eip = kvm_get_linear_rip(vcpu); 7427 u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, 7428 vcpu->arch.guest_debug_dr7, 7429 vcpu->arch.eff_db); 7430 7431 if (dr6 != 0) { 7432 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 7433 kvm_run->debug.arch.pc = eip; 7434 kvm_run->debug.arch.exception = DB_VECTOR; 7435 kvm_run->exit_reason = KVM_EXIT_DEBUG; 7436 *r = 0; 7437 return true; 7438 } 7439 } 7440 7441 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && 7442 !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { 7443 unsigned long eip = kvm_get_linear_rip(vcpu); 7444 u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, 7445 vcpu->arch.dr7, 7446 vcpu->arch.db); 7447 7448 if (dr6 != 0) { 7449 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 7450 *r = 1; 7451 return true; 7452 } 7453 } 7454 7455 return false; 7456 } 7457 7458 static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) 7459 { 7460 switch (ctxt->opcode_len) { 7461 case 1: 7462 switch (ctxt->b) { 7463 case 0xe4: /* IN */ 7464 case 0xe5: 7465 case 0xec: 7466 case 0xed: 7467 case 0xe6: /* OUT */ 7468 case 0xe7: 7469 case 0xee: 7470 case 0xef: 7471 case 0x6c: /* INS */ 7472 case 0x6d: 7473 case 0x6e: /* OUTS */ 7474 case 0x6f: 7475 return true; 7476 } 7477 break; 7478 case 2: 7479 switch (ctxt->b) { 7480 case 0x33: /* RDPMC */ 7481 return true; 7482 } 7483 break; 7484 } 7485 7486 return false; 7487 } 7488 7489 /* 7490 * Decode to be emulated instruction. Return EMULATION_OK if success. 7491 */ 7492 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, 7493 void *insn, int insn_len) 7494 { 7495 int r = EMULATION_OK; 7496 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 7497 7498 init_emulate_ctxt(vcpu); 7499 7500 /* 7501 * We will reenter on the same instruction since we do not set 7502 * complete_userspace_io. This does not handle watchpoints yet, 7503 * those would be handled in the emulate_ops. 7504 */ 7505 if (!(emulation_type & EMULTYPE_SKIP) && 7506 kvm_vcpu_check_breakpoint(vcpu, &r)) 7507 return r; 7508 7509 ctxt->interruptibility = 0; 7510 ctxt->have_exception = false; 7511 ctxt->exception.vector = -1; 7512 ctxt->perm_ok = false; 7513 7514 ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; 7515 7516 r = x86_decode_insn(ctxt, insn, insn_len); 7517 7518 trace_kvm_emulate_insn_start(vcpu); 7519 ++vcpu->stat.insn_emulation; 7520 7521 return r; 7522 } 7523 EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); 7524 7525 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 7526 int emulation_type, void *insn, int insn_len) 7527 { 7528 int r; 7529 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 7530 bool writeback = true; 7531 bool write_fault_to_spt; 7532 7533 if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len))) 7534 return 1; 7535 7536 vcpu->arch.l1tf_flush_l1d = true; 7537 7538 /* 7539 * Clear write_fault_to_shadow_pgtable here to ensure it is 7540 * never reused. 
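 * (A local copy, write_fault_to_spt, carries the old value to the
 * reexecute_instruction() calls further down.)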
7541 */ 7542 write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; 7543 vcpu->arch.write_fault_to_shadow_pgtable = false; 7544 7545 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 7546 kvm_clear_exception_queue(vcpu); 7547 7548 r = x86_decode_emulated_instruction(vcpu, emulation_type, 7549 insn, insn_len); 7550 if (r != EMULATION_OK) { 7551 if ((emulation_type & EMULTYPE_TRAP_UD) || 7552 (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { 7553 kvm_queue_exception(vcpu, UD_VECTOR); 7554 return 1; 7555 } 7556 if (reexecute_instruction(vcpu, cr2_or_gpa, 7557 write_fault_to_spt, 7558 emulation_type)) 7559 return 1; 7560 if (ctxt->have_exception) { 7561 /* 7562 * #UD should result in just EMULATION_FAILED, and trap-like 7563 * exception should not be encountered during decode. 7564 */ 7565 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || 7566 exception_type(ctxt->exception.vector) == EXCPT_TRAP); 7567 inject_emulated_exception(vcpu); 7568 return 1; 7569 } 7570 return handle_emulation_failure(vcpu, emulation_type); 7571 } 7572 } 7573 7574 if ((emulation_type & EMULTYPE_VMWARE_GP) && 7575 !is_vmware_backdoor_opcode(ctxt)) { 7576 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 7577 return 1; 7578 } 7579 7580 /* 7581 * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks 7582 * for kvm_skip_emulated_instruction(). The caller is responsible for 7583 * updating interruptibility state and injecting single-step #DBs. 7584 */ 7585 if (emulation_type & EMULTYPE_SKIP) { 7586 kvm_rip_write(vcpu, ctxt->_eip); 7587 if (ctxt->eflags & X86_EFLAGS_RF) 7588 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); 7589 return 1; 7590 } 7591 7592 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) 7593 return 1; 7594 7595 /* this is needed for vmware backdoor interface to work since it 7596 changes registers values during IO operation */ 7597 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 7598 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 7599 emulator_invalidate_register_cache(ctxt); 7600 } 7601 7602 restart: 7603 if (emulation_type & EMULTYPE_PF) { 7604 /* Save the faulting GPA (cr2) in the address field */ 7605 ctxt->exception.address = cr2_or_gpa; 7606 7607 /* With shadow page tables, cr2 contains a GVA or nGPA. */ 7608 if (vcpu->arch.mmu->direct_map) { 7609 ctxt->gpa_available = true; 7610 ctxt->gpa_val = cr2_or_gpa; 7611 } 7612 } else { 7613 /* Sanitize the address out of an abundance of paranoia. */ 7614 ctxt->exception.address = 0; 7615 } 7616 7617 r = x86_emulate_insn(ctxt); 7618 7619 if (r == EMULATION_INTERCEPTED) 7620 return 1; 7621 7622 if (r == EMULATION_FAILED) { 7623 if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, 7624 emulation_type)) 7625 return 1; 7626 7627 return handle_emulation_failure(vcpu, emulation_type); 7628 } 7629 7630 if (ctxt->have_exception) { 7631 r = 1; 7632 if (inject_emulated_exception(vcpu)) 7633 return r; 7634 } else if (vcpu->arch.pio.count) { 7635 if (!vcpu->arch.pio.in) { 7636 /* FIXME: return into emulator if single-stepping. 
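 * Today the OUT data is already staged in pio_data and the KVM_EXIT_IO
 * is set up, so simply drop the pending count; no completion callback is
 * needed once userspace finishes the exit.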
*/ 7637 vcpu->arch.pio.count = 0; 7638 } else { 7639 writeback = false; 7640 vcpu->arch.complete_userspace_io = complete_emulated_pio; 7641 } 7642 r = 0; 7643 } else if (vcpu->mmio_needed) { 7644 ++vcpu->stat.mmio_exits; 7645 7646 if (!vcpu->mmio_is_write) 7647 writeback = false; 7648 r = 0; 7649 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 7650 } else if (r == EMULATION_RESTART) 7651 goto restart; 7652 else 7653 r = 1; 7654 7655 if (writeback) { 7656 unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); 7657 toggle_interruptibility(vcpu, ctxt->interruptibility); 7658 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 7659 if (!ctxt->have_exception || 7660 exception_type(ctxt->exception.vector) == EXCPT_TRAP) { 7661 kvm_rip_write(vcpu, ctxt->eip); 7662 if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) 7663 r = kvm_vcpu_do_singlestep(vcpu); 7664 if (kvm_x86_ops.update_emulated_instruction) 7665 static_call(kvm_x86_update_emulated_instruction)(vcpu); 7666 __kvm_set_rflags(vcpu, ctxt->eflags); 7667 } 7668 7669 /* 7670 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will 7671 * do nothing, and it will be requested again as soon as 7672 * the shadow expires. But we still need to check here, 7673 * because POPF has no interrupt shadow. 7674 */ 7675 if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) 7676 kvm_make_request(KVM_REQ_EVENT, vcpu); 7677 } else 7678 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 7679 7680 return r; 7681 } 7682 7683 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) 7684 { 7685 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); 7686 } 7687 EXPORT_SYMBOL_GPL(kvm_emulate_instruction); 7688 7689 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, 7690 void *insn, int insn_len) 7691 { 7692 return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); 7693 } 7694 EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); 7695 7696 static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) 7697 { 7698 vcpu->arch.pio.count = 0; 7699 return 1; 7700 } 7701 7702 static int complete_fast_pio_out(struct kvm_vcpu *vcpu) 7703 { 7704 vcpu->arch.pio.count = 0; 7705 7706 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) 7707 return 1; 7708 7709 return kvm_skip_emulated_instruction(vcpu); 7710 } 7711 7712 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, 7713 unsigned short port) 7714 { 7715 unsigned long val = kvm_rax_read(vcpu); 7716 int ret = emulator_pio_out(vcpu, size, port, &val, 1); 7717 7718 if (ret) 7719 return ret; 7720 7721 /* 7722 * Workaround userspace that relies on old KVM behavior of %rip being 7723 * incremented prior to exiting to userspace to handle "OUT 0x7e". 
7724 */ 7725 if (port == 0x7e && 7726 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) { 7727 vcpu->arch.complete_userspace_io = 7728 complete_fast_pio_out_port_0x7e; 7729 kvm_skip_emulated_instruction(vcpu); 7730 } else { 7731 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); 7732 vcpu->arch.complete_userspace_io = complete_fast_pio_out; 7733 } 7734 return 0; 7735 } 7736 7737 static int complete_fast_pio_in(struct kvm_vcpu *vcpu) 7738 { 7739 unsigned long val; 7740 7741 /* We should only ever be called with arch.pio.count equal to 1 */ 7742 BUG_ON(vcpu->arch.pio.count != 1); 7743 7744 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { 7745 vcpu->arch.pio.count = 0; 7746 return 1; 7747 } 7748 7749 /* For size less than 4 we merge, else we zero extend */ 7750 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; 7751 7752 /* 7753 * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform 7754 * the copy and tracing 7755 */ 7756 emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); 7757 kvm_rax_write(vcpu, val); 7758 7759 return kvm_skip_emulated_instruction(vcpu); 7760 } 7761 7762 static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, 7763 unsigned short port) 7764 { 7765 unsigned long val; 7766 int ret; 7767 7768 /* For size less than 4 we merge, else we zero extend */ 7769 val = (size < 4) ? kvm_rax_read(vcpu) : 0; 7770 7771 ret = emulator_pio_in(vcpu, size, port, &val, 1); 7772 if (ret) { 7773 kvm_rax_write(vcpu, val); 7774 return ret; 7775 } 7776 7777 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); 7778 vcpu->arch.complete_userspace_io = complete_fast_pio_in; 7779 7780 return 0; 7781 } 7782 7783 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) 7784 { 7785 int ret; 7786 7787 if (in) 7788 ret = kvm_fast_pio_in(vcpu, size, port); 7789 else 7790 ret = kvm_fast_pio_out(vcpu, size, port); 7791 return ret && kvm_skip_emulated_instruction(vcpu); 7792 } 7793 EXPORT_SYMBOL_GPL(kvm_fast_pio); 7794 7795 static int kvmclock_cpu_down_prep(unsigned int cpu) 7796 { 7797 __this_cpu_write(cpu_tsc_khz, 0); 7798 return 0; 7799 } 7800 7801 static void tsc_khz_changed(void *data) 7802 { 7803 struct cpufreq_freqs *freq = data; 7804 unsigned long khz = 0; 7805 7806 if (data) 7807 khz = freq->new; 7808 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 7809 khz = cpufreq_quick_get(raw_smp_processor_id()); 7810 if (!khz) 7811 khz = tsc_khz; 7812 __this_cpu_write(cpu_tsc_khz, khz); 7813 } 7814 7815 #ifdef CONFIG_X86_64 7816 static void kvm_hyperv_tsc_notifier(void) 7817 { 7818 struct kvm *kvm; 7819 struct kvm_vcpu *vcpu; 7820 int cpu; 7821 unsigned long flags; 7822 7823 mutex_lock(&kvm_lock); 7824 list_for_each_entry(kvm, &vm_list, vm_list) 7825 kvm_make_mclock_inprogress_request(kvm); 7826 7827 hyperv_stop_tsc_emulation(); 7828 7829 /* TSC frequency always matches when on Hyper-V */ 7830 for_each_present_cpu(cpu) 7831 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 7832 kvm_max_guest_tsc_khz = tsc_khz; 7833 7834 list_for_each_entry(kvm, &vm_list, vm_list) { 7835 struct kvm_arch *ka = &kvm->arch; 7836 7837 spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); 7838 pvclock_update_vm_gtod_copy(kvm); 7839 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); 7840 7841 kvm_for_each_vcpu(cpu, vcpu, kvm) 7842 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 7843 7844 kvm_for_each_vcpu(cpu, vcpu, kvm) 7845 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); 7846 } 7847 mutex_unlock(&kvm_lock); 7848 } 7849 #endif 7850 7851 
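/*
 * Rough call chain, for orientation only (may oversimplify), when the host
 * changes a CPU's frequency and the TSC is not constant:
 *
 *	cpufreq transition notifier
 *	    -> kvmclock_cpufreq_notifier()        PRECHANGE on speed-up,
 *	                                          POSTCHANGE on slow-down
 *	        -> __kvmclock_cpufreq_notifier()  once per CPU in the policy
 *	            -> tsc_khz_changed() via IPI  refreshes cpu_tsc_khz
 *	            -> KVM_REQ_CLOCK_UPDATE       for vCPUs resident there
 */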
static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) 7852 { 7853 struct kvm *kvm; 7854 struct kvm_vcpu *vcpu; 7855 int i, send_ipi = 0; 7856 7857 /* 7858 * We allow guests to temporarily run on slowing clocks, 7859 * provided we notify them after, or to run on accelerating 7860 * clocks, provided we notify them before. Thus time never 7861 * goes backwards. 7862 * 7863 * However, we have a problem. We can't atomically update 7864 * the frequency of a given CPU from this function; it is 7865 * merely a notifier, which can be called from any CPU. 7866 * Changing the TSC frequency at arbitrary points in time 7867 * requires a recomputation of local variables related to 7868 * the TSC for each VCPU. We must flag these local variables 7869 * to be updated and be sure the update takes place with the 7870 * new frequency before any guests proceed. 7871 * 7872 * Unfortunately, the combination of hotplug CPU and frequency 7873 * change creates an intractable locking scenario; the order 7874 * of when these callouts happen is undefined with respect to 7875 * CPU hotplug, and they can race with each other. As such, 7876 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is 7877 * undefined; you can actually have a CPU frequency change take 7878 * place in between the computation of X and the setting of the 7879 * variable. To protect against this problem, all updates of 7880 * the per_cpu tsc_khz variable are done in an interrupt 7881 * protected IPI, and all callers wishing to update the value 7882 * must wait for a synchronous IPI to complete (which is trivial 7883 * if the caller is on the CPU already). This establishes the 7884 * necessary total order on variable updates. 7885 * 7886 * Note that because a guest time update may take place 7887 * anytime after the setting of the VCPU's request bit, the 7888 * correct TSC value must be set before the request. However, 7889 * to ensure the update actually makes it to any guest which 7890 * starts running in hardware virtualization between the set 7891 * and the acquisition of the spinlock, we must also ping the 7892 * CPU after setting the request bit. 7893 * 7894 */ 7895 7896 smp_call_function_single(cpu, tsc_khz_changed, freq, 1); 7897 7898 mutex_lock(&kvm_lock); 7899 list_for_each_entry(kvm, &vm_list, vm_list) { 7900 kvm_for_each_vcpu(i, vcpu, kvm) { 7901 if (vcpu->cpu != cpu) 7902 continue; 7903 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 7904 if (vcpu->cpu != raw_smp_processor_id()) 7905 send_ipi = 1; 7906 } 7907 } 7908 mutex_unlock(&kvm_lock); 7909 7910 if (freq->old < freq->new && send_ipi) { 7911 /* 7912 * We upscale the frequency. Must make sure the guest 7913 * doesn't see old kvmclock values while running with 7914 * the new frequency, otherwise we risk the guest seeing 7915 * time go backwards. 7916 * 7917 * In case we update the frequency for another cpu 7918 * (which might be in guest context) send an interrupt 7919 * to kick the cpu out of guest context. Next time 7920 * guest context is entered kvmclock will be updated, 7921 * so the guest will not see stale values.
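		 *
		 * In other words (rough summary): the smp_call_function_single()
		 * below simply re-runs tsc_khz_changed() on the target CPU; the
		 * IPI it generates also bounces any vCPU currently in guest mode
		 * on that CPU back to the host, so the KVM_REQ_CLOCK_UPDATE queued
		 * above is serviced before the next guest entry.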
7922 */ 7923 smp_call_function_single(cpu, tsc_khz_changed, freq, 1); 7924 } 7925 } 7926 7927 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 7928 void *data) 7929 { 7930 struct cpufreq_freqs *freq = data; 7931 int cpu; 7932 7933 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 7934 return 0; 7935 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 7936 return 0; 7937 7938 for_each_cpu(cpu, freq->policy->cpus) 7939 __kvmclock_cpufreq_notifier(freq, cpu); 7940 7941 return 0; 7942 } 7943 7944 static struct notifier_block kvmclock_cpufreq_notifier_block = { 7945 .notifier_call = kvmclock_cpufreq_notifier 7946 }; 7947 7948 static int kvmclock_cpu_online(unsigned int cpu) 7949 { 7950 tsc_khz_changed(NULL); 7951 return 0; 7952 } 7953 7954 static void kvm_timer_init(void) 7955 { 7956 max_tsc_khz = tsc_khz; 7957 7958 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 7959 #ifdef CONFIG_CPU_FREQ 7960 struct cpufreq_policy *policy; 7961 int cpu; 7962 7963 cpu = get_cpu(); 7964 policy = cpufreq_cpu_get(cpu); 7965 if (policy) { 7966 if (policy->cpuinfo.max_freq) 7967 max_tsc_khz = policy->cpuinfo.max_freq; 7968 cpufreq_cpu_put(policy); 7969 } 7970 put_cpu(); 7971 #endif 7972 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 7973 CPUFREQ_TRANSITION_NOTIFIER); 7974 } 7975 7976 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", 7977 kvmclock_cpu_online, kvmclock_cpu_down_prep); 7978 } 7979 7980 DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 7981 EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); 7982 7983 int kvm_is_in_guest(void) 7984 { 7985 return __this_cpu_read(current_vcpu) != NULL; 7986 } 7987 7988 static int kvm_is_user_mode(void) 7989 { 7990 int user_mode = 3; 7991 7992 if (__this_cpu_read(current_vcpu)) 7993 user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); 7994 7995 return user_mode != 0; 7996 } 7997 7998 static unsigned long kvm_get_guest_ip(void) 7999 { 8000 unsigned long ip = 0; 8001 8002 if (__this_cpu_read(current_vcpu)) 8003 ip = kvm_rip_read(__this_cpu_read(current_vcpu)); 8004 8005 return ip; 8006 } 8007 8008 static void kvm_handle_intel_pt_intr(void) 8009 { 8010 struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); 8011 8012 kvm_make_request(KVM_REQ_PMI, vcpu); 8013 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8014 (unsigned long *)&vcpu->arch.pmu.global_status); 8015 } 8016 8017 static struct perf_guest_info_callbacks kvm_guest_cbs = { 8018 .is_in_guest = kvm_is_in_guest, 8019 .is_user_mode = kvm_is_user_mode, 8020 .get_guest_ip = kvm_get_guest_ip, 8021 .handle_intel_pt_intr = kvm_handle_intel_pt_intr, 8022 }; 8023 8024 #ifdef CONFIG_X86_64 8025 static void pvclock_gtod_update_fn(struct work_struct *work) 8026 { 8027 struct kvm *kvm; 8028 8029 struct kvm_vcpu *vcpu; 8030 int i; 8031 8032 mutex_lock(&kvm_lock); 8033 list_for_each_entry(kvm, &vm_list, vm_list) 8034 kvm_for_each_vcpu(i, vcpu, kvm) 8035 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 8036 atomic_set(&kvm_guest_has_master_clock, 0); 8037 mutex_unlock(&kvm_lock); 8038 } 8039 8040 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); 8041 8042 /* 8043 * Notification about pvclock gtod data update. 
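 *
 * Rough summary, for orientation only: this runs from the kernel's
 * pvclock_gtod notifier chain whenever the timekeeper is updated. It only
 * refreshes KVM's copy of the gtod data; when the host clocksource is not
 * TSC-based, the per-vCPU KVM_REQ_MASTERCLOCK_UPDATE fan-out is deferred to
 * pvclock_gtod_work above so the notifier itself stays cheap.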
8044 */ 8045 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, 8046 void *priv) 8047 { 8048 struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 8049 struct timekeeper *tk = priv; 8050 8051 update_pvclock_gtod(tk); 8052 8053 /* disable master clock if host does not trust, or does not 8054 * use, TSC based clocksource. 8055 */ 8056 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && 8057 atomic_read(&kvm_guest_has_master_clock) != 0) 8058 queue_work(system_long_wq, &pvclock_gtod_work); 8059 8060 return 0; 8061 } 8062 8063 static struct notifier_block pvclock_gtod_notifier = { 8064 .notifier_call = pvclock_gtod_notify, 8065 }; 8066 #endif 8067 8068 int kvm_arch_init(void *opaque) 8069 { 8070 struct kvm_x86_init_ops *ops = opaque; 8071 int r; 8072 8073 if (kvm_x86_ops.hardware_enable) { 8074 printk(KERN_ERR "kvm: already loaded the other module\n"); 8075 r = -EEXIST; 8076 goto out; 8077 } 8078 8079 if (!ops->cpu_has_kvm_support()) { 8080 pr_err_ratelimited("kvm: no hardware support\n"); 8081 r = -EOPNOTSUPP; 8082 goto out; 8083 } 8084 if (ops->disabled_by_bios()) { 8085 pr_err_ratelimited("kvm: disabled by bios\n"); 8086 r = -EOPNOTSUPP; 8087 goto out; 8088 } 8089 8090 /* 8091 * KVM explicitly assumes that the guest has an FPU and 8092 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the 8093 * vCPU's FPU state as a fxregs_state struct. 8094 */ 8095 if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { 8096 printk(KERN_ERR "kvm: inadequate fpu\n"); 8097 r = -EOPNOTSUPP; 8098 goto out; 8099 } 8100 8101 r = -ENOMEM; 8102 x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), 8103 __alignof__(struct fpu), SLAB_ACCOUNT, 8104 NULL); 8105 if (!x86_fpu_cache) { 8106 printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); 8107 goto out; 8108 } 8109 8110 x86_emulator_cache = kvm_alloc_emulator_cache(); 8111 if (!x86_emulator_cache) { 8112 pr_err("kvm: failed to allocate cache for x86 emulator\n"); 8113 goto out_free_x86_fpu_cache; 8114 } 8115 8116 user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); 8117 if (!user_return_msrs) { 8118 printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); 8119 goto out_free_x86_emulator_cache; 8120 } 8121 8122 r = kvm_mmu_module_init(); 8123 if (r) 8124 goto out_free_percpu; 8125 8126 kvm_timer_init(); 8127 8128 perf_register_guest_info_callbacks(&kvm_guest_cbs); 8129 8130 if (boot_cpu_has(X86_FEATURE_XSAVE)) { 8131 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 8132 supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; 8133 } 8134 8135 if (pi_inject_timer == -1) 8136 pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); 8137 #ifdef CONFIG_X86_64 8138 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); 8139 8140 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) 8141 set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); 8142 #endif 8143 8144 return 0; 8145 8146 out_free_percpu: 8147 free_percpu(user_return_msrs); 8148 out_free_x86_emulator_cache: 8149 kmem_cache_destroy(x86_emulator_cache); 8150 out_free_x86_fpu_cache: 8151 kmem_cache_destroy(x86_fpu_cache); 8152 out: 8153 return r; 8154 } 8155 8156 void kvm_arch_exit(void) 8157 { 8158 #ifdef CONFIG_X86_64 8159 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) 8160 clear_hv_tscchange_cb(); 8161 #endif 8162 kvm_lapic_exit(); 8163 perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 8164 8165 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 8166 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 8167 
CPUFREQ_TRANSITION_NOTIFIER); 8168 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); 8169 #ifdef CONFIG_X86_64 8170 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); 8171 #endif 8172 kvm_x86_ops.hardware_enable = NULL; 8173 kvm_mmu_module_exit(); 8174 free_percpu(user_return_msrs); 8175 kmem_cache_destroy(x86_fpu_cache); 8176 #ifdef CONFIG_KVM_XEN 8177 static_key_deferred_flush(&kvm_xen_enabled); 8178 WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); 8179 #endif 8180 } 8181 8182 static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) 8183 { 8184 ++vcpu->stat.halt_exits; 8185 if (lapic_in_kernel(vcpu)) { 8186 vcpu->arch.mp_state = state; 8187 return 1; 8188 } else { 8189 vcpu->run->exit_reason = reason; 8190 return 0; 8191 } 8192 } 8193 8194 int kvm_vcpu_halt(struct kvm_vcpu *vcpu) 8195 { 8196 return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 8197 } 8198 EXPORT_SYMBOL_GPL(kvm_vcpu_halt); 8199 8200 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 8201 { 8202 int ret = kvm_skip_emulated_instruction(vcpu); 8203 /* 8204 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 8205 * KVM_EXIT_DEBUG here. 8206 */ 8207 return kvm_vcpu_halt(vcpu) && ret; 8208 } 8209 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 8210 8211 int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 8212 { 8213 int ret = kvm_skip_emulated_instruction(vcpu); 8214 8215 return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; 8216 } 8217 EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 8218 8219 #ifdef CONFIG_X86_64 8220 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, 8221 unsigned long clock_type) 8222 { 8223 struct kvm_clock_pairing clock_pairing; 8224 struct timespec64 ts; 8225 u64 cycle; 8226 int ret; 8227 8228 if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) 8229 return -KVM_EOPNOTSUPP; 8230 8231 if (!kvm_get_walltime_and_clockread(&ts, &cycle)) 8232 return -KVM_EOPNOTSUPP; 8233 8234 clock_pairing.sec = ts.tv_sec; 8235 clock_pairing.nsec = ts.tv_nsec; 8236 clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); 8237 clock_pairing.flags = 0; 8238 memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad)); 8239 8240 ret = 0; 8241 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, 8242 sizeof(struct kvm_clock_pairing))) 8243 ret = -KVM_EFAULT; 8244 8245 return ret; 8246 } 8247 #endif 8248 8249 /* 8250 * kvm_pv_kick_cpu_op: Kick a vcpu. 8251 * 8252 * @apicid - apicid of vcpu to be kicked. 
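 *
 * Rough background (may oversimplify): the kick is sent as an APIC_DM_REMRD
 * IPI, which the in-kernel local APIC treats as a PV wakeup (it sets the
 * target's pv_unhalted flag and kicks it out of HLT) rather than as a real
 * remote read. A guest typically reaches this path via something like
 * kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid) from its PV spinlock code.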
8253 */ 8254 static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) 8255 { 8256 struct kvm_lapic_irq lapic_irq; 8257 8258 lapic_irq.shorthand = APIC_DEST_NOSHORT; 8259 lapic_irq.dest_mode = APIC_DEST_PHYSICAL; 8260 lapic_irq.level = 0; 8261 lapic_irq.dest_id = apicid; 8262 lapic_irq.msi_redir_hint = false; 8263 8264 lapic_irq.delivery_mode = APIC_DM_REMRD; 8265 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); 8266 } 8267 8268 bool kvm_apicv_activated(struct kvm *kvm) 8269 { 8270 return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); 8271 } 8272 EXPORT_SYMBOL_GPL(kvm_apicv_activated); 8273 8274 void kvm_apicv_init(struct kvm *kvm, bool enable) 8275 { 8276 if (enable) 8277 clear_bit(APICV_INHIBIT_REASON_DISABLE, 8278 &kvm->arch.apicv_inhibit_reasons); 8279 else 8280 set_bit(APICV_INHIBIT_REASON_DISABLE, 8281 &kvm->arch.apicv_inhibit_reasons); 8282 } 8283 EXPORT_SYMBOL_GPL(kvm_apicv_init); 8284 8285 static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) 8286 { 8287 struct kvm_vcpu *target = NULL; 8288 struct kvm_apic_map *map; 8289 8290 vcpu->stat.directed_yield_attempted++; 8291 8292 rcu_read_lock(); 8293 map = rcu_dereference(vcpu->kvm->arch.apic_map); 8294 8295 if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) 8296 target = map->phys_map[dest_id]->vcpu; 8297 8298 rcu_read_unlock(); 8299 8300 if (!target || !READ_ONCE(target->ready)) 8301 goto no_yield; 8302 8303 /* Ignore requests to yield to self */ 8304 if (vcpu == target) 8305 goto no_yield; 8306 8307 if (kvm_vcpu_yield_to(target) <= 0) 8308 goto no_yield; 8309 8310 vcpu->stat.directed_yield_successful++; 8311 8312 no_yield: 8313 return; 8314 } 8315 8316 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 8317 { 8318 unsigned long nr, a0, a1, a2, a3, ret; 8319 int op_64_bit; 8320 8321 if (kvm_xen_hypercall_enabled(vcpu->kvm)) 8322 return kvm_xen_hypercall(vcpu); 8323 8324 if (kvm_hv_hypercall_enabled(vcpu)) 8325 return kvm_hv_hypercall(vcpu); 8326 8327 nr = kvm_rax_read(vcpu); 8328 a0 = kvm_rbx_read(vcpu); 8329 a1 = kvm_rcx_read(vcpu); 8330 a2 = kvm_rdx_read(vcpu); 8331 a3 = kvm_rsi_read(vcpu); 8332 8333 trace_kvm_hypercall(nr, a0, a1, a2, a3); 8334 8335 op_64_bit = is_64_bit_mode(vcpu); 8336 if (!op_64_bit) { 8337 nr &= 0xFFFFFFFF; 8338 a0 &= 0xFFFFFFFF; 8339 a1 &= 0xFFFFFFFF; 8340 a2 &= 0xFFFFFFFF; 8341 a3 &= 0xFFFFFFFF; 8342 } 8343 8344 if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { 8345 ret = -KVM_EPERM; 8346 goto out; 8347 } 8348 8349 ret = -KVM_ENOSYS; 8350 8351 switch (nr) { 8352 case KVM_HC_VAPIC_POLL_IRQ: 8353 ret = 0; 8354 break; 8355 case KVM_HC_KICK_CPU: 8356 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) 8357 break; 8358 8359 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); 8360 kvm_sched_yield(vcpu, a1); 8361 ret = 0; 8362 break; 8363 #ifdef CONFIG_X86_64 8364 case KVM_HC_CLOCK_PAIRING: 8365 ret = kvm_pv_clock_pairing(vcpu, a0, a1); 8366 break; 8367 #endif 8368 case KVM_HC_SEND_IPI: 8369 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) 8370 break; 8371 8372 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); 8373 break; 8374 case KVM_HC_SCHED_YIELD: 8375 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) 8376 break; 8377 8378 kvm_sched_yield(vcpu, a0); 8379 ret = 0; 8380 break; 8381 default: 8382 ret = -KVM_ENOSYS; 8383 break; 8384 } 8385 out: 8386 if (!op_64_bit) 8387 ret = (u32)ret; 8388 kvm_rax_write(vcpu, ret); 8389 8390 ++vcpu->stat.hypercalls; 8391 return kvm_skip_emulated_instruction(vcpu); 8392 } 8393 
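/*
 * Illustrative sketch, not used by this file: from the guest's point of view
 * a hypercall puts the number in RAX and up to four arguments in RBX, RCX,
 * RDX and RSI, executes VMCALL (Intel) or VMMCALL (AMD), and reads the result
 * back from RAX. The in-tree guest helpers in
 * arch/x86/include/asm/kvm_para.h look roughly like:
 *
 *	static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
 *	{
 *		long ret;
 *
 *		asm volatile(KVM_HYPERCALL
 *			     : "=a"(ret)
 *			     : "a"(nr), "b"(p1)
 *			     : "memory");
 *		return ret;
 *	}
 */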
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 8394 8395 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 8396 { 8397 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 8398 char instruction[3]; 8399 unsigned long rip = kvm_rip_read(vcpu); 8400 8401 static_call(kvm_x86_patch_hypercall)(vcpu, instruction); 8402 8403 return emulator_write_emulated(ctxt, rip, instruction, 3, 8404 &ctxt->exception); 8405 } 8406 8407 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 8408 { 8409 return vcpu->run->request_interrupt_window && 8410 likely(!pic_in_kernel(vcpu->kvm)); 8411 } 8412 8413 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 8414 { 8415 struct kvm_run *kvm_run = vcpu->run; 8416 8417 /* 8418 * if_flag is obsolete and useless, so do not bother 8419 * setting it for SEV-ES guests. Userspace can just 8420 * use kvm_run->ready_for_interrupt_injection. 8421 */ 8422 kvm_run->if_flag = !vcpu->arch.guest_state_protected 8423 && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 8424 8425 kvm_run->cr8 = kvm_get_cr8(vcpu); 8426 kvm_run->apic_base = kvm_get_apic_base(vcpu); 8427 kvm_run->ready_for_interrupt_injection = 8428 pic_in_kernel(vcpu->kvm) || 8429 kvm_vcpu_ready_for_interrupt_injection(vcpu); 8430 8431 if (is_smm(vcpu)) 8432 kvm_run->flags |= KVM_RUN_X86_SMM; 8433 } 8434 8435 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 8436 { 8437 int max_irr, tpr; 8438 8439 if (!kvm_x86_ops.update_cr8_intercept) 8440 return; 8441 8442 if (!lapic_in_kernel(vcpu)) 8443 return; 8444 8445 if (vcpu->arch.apicv_active) 8446 return; 8447 8448 if (!vcpu->arch.apic->vapic_addr) 8449 max_irr = kvm_lapic_find_highest_irr(vcpu); 8450 else 8451 max_irr = -1; 8452 8453 if (max_irr != -1) 8454 max_irr >>= 4; 8455 8456 tpr = kvm_lapic_get_cr8(vcpu); 8457 8458 static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); 8459 } 8460 8461 8462 int kvm_check_nested_events(struct kvm_vcpu *vcpu) 8463 { 8464 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 8465 return -EIO; 8466 8467 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 8468 kvm_x86_ops.nested_ops->triple_fault(vcpu); 8469 return 1; 8470 } 8471 8472 return kvm_x86_ops.nested_ops->check_events(vcpu); 8473 } 8474 8475 static void kvm_inject_exception(struct kvm_vcpu *vcpu) 8476 { 8477 if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) 8478 vcpu->arch.exception.error_code = false; 8479 static_call(kvm_x86_queue_exception)(vcpu); 8480 } 8481 8482 static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) 8483 { 8484 int r; 8485 bool can_inject = true; 8486 8487 /* try to reinject previous events if any */ 8488 8489 if (vcpu->arch.exception.injected) { 8490 kvm_inject_exception(vcpu); 8491 can_inject = false; 8492 } 8493 /* 8494 * Do not inject an NMI or interrupt if there is a pending 8495 * exception. Exceptions and interrupts are recognized at 8496 * instruction boundaries, i.e. the start of an instruction. 8497 * Trap-like exceptions, e.g. #DB, have higher priority than 8498 * NMIs and interrupts, i.e. traps are recognized before an 8499 * NMI/interrupt that's pending on the same instruction. 8500 * Fault-like exceptions, e.g. #GP and #PF, are the lowest 8501 * priority, but are only generated (pended) during instruction 8502 * execution, i.e. a pending fault-like exception means the 8503 * fault occurred on the *previous* instruction and must be 8504 * serviced prior to recognizing any new events in order to 8505 * fully complete the previous instruction. 
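	 *
	 * In short (rough summary): this is why the NMI/IRQ re-injection
	 * branch below runs only when no exception is pending -- a pending
	 * exception always describes the previous instruction and must be
	 * delivered before any new NMI or interrupt is recognized.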
8506 */ 8507 else if (!vcpu->arch.exception.pending) { 8508 if (vcpu->arch.nmi_injected) { 8509 static_call(kvm_x86_set_nmi)(vcpu); 8510 can_inject = false; 8511 } else if (vcpu->arch.interrupt.injected) { 8512 static_call(kvm_x86_set_irq)(vcpu); 8513 can_inject = false; 8514 } 8515 } 8516 8517 WARN_ON_ONCE(vcpu->arch.exception.injected && 8518 vcpu->arch.exception.pending); 8519 8520 /* 8521 * Call check_nested_events() even if we reinjected a previous event 8522 * in order for caller to determine if it should require immediate-exit 8523 * from L2 to L1 due to pending L1 events which require exit 8524 * from L2 to L1. 8525 */ 8526 if (is_guest_mode(vcpu)) { 8527 r = kvm_check_nested_events(vcpu); 8528 if (r < 0) 8529 goto busy; 8530 } 8531 8532 /* try to inject new event if pending */ 8533 if (vcpu->arch.exception.pending) { 8534 trace_kvm_inj_exception(vcpu->arch.exception.nr, 8535 vcpu->arch.exception.has_error_code, 8536 vcpu->arch.exception.error_code); 8537 8538 vcpu->arch.exception.pending = false; 8539 vcpu->arch.exception.injected = true; 8540 8541 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) 8542 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | 8543 X86_EFLAGS_RF); 8544 8545 if (vcpu->arch.exception.nr == DB_VECTOR) { 8546 kvm_deliver_exception_payload(vcpu); 8547 if (vcpu->arch.dr7 & DR7_GD) { 8548 vcpu->arch.dr7 &= ~DR7_GD; 8549 kvm_update_dr7(vcpu); 8550 } 8551 } 8552 8553 kvm_inject_exception(vcpu); 8554 can_inject = false; 8555 } 8556 8557 /* 8558 * Finally, inject interrupt events. If an event cannot be injected 8559 * due to architectural conditions (e.g. IF=0) a window-open exit 8560 * will re-request KVM_REQ_EVENT. Sometimes however an event is pending 8561 * and can architecturally be injected, but we cannot do it right now: 8562 * an interrupt could have arrived just now and we have to inject it 8563 * as a vmexit, or there could already an event in the queue, which is 8564 * indicated by can_inject. In that case we request an immediate exit 8565 * in order to make progress and get back here for another iteration. 8566 * The kvm_x86_ops hooks communicate this by returning -EBUSY. 8567 */ 8568 if (vcpu->arch.smi_pending) { 8569 r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; 8570 if (r < 0) 8571 goto busy; 8572 if (r) { 8573 vcpu->arch.smi_pending = false; 8574 ++vcpu->arch.smi_count; 8575 enter_smm(vcpu); 8576 can_inject = false; 8577 } else 8578 static_call(kvm_x86_enable_smi_window)(vcpu); 8579 } 8580 8581 if (vcpu->arch.nmi_pending) { 8582 r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; 8583 if (r < 0) 8584 goto busy; 8585 if (r) { 8586 --vcpu->arch.nmi_pending; 8587 vcpu->arch.nmi_injected = true; 8588 static_call(kvm_x86_set_nmi)(vcpu); 8589 can_inject = false; 8590 WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); 8591 } 8592 if (vcpu->arch.nmi_pending) 8593 static_call(kvm_x86_enable_nmi_window)(vcpu); 8594 } 8595 8596 if (kvm_cpu_has_injectable_intr(vcpu)) { 8597 r = can_inject ? 
static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; 8598 if (r < 0) 8599 goto busy; 8600 if (r) { 8601 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); 8602 static_call(kvm_x86_set_irq)(vcpu); 8603 WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); 8604 } 8605 if (kvm_cpu_has_injectable_intr(vcpu)) 8606 static_call(kvm_x86_enable_irq_window)(vcpu); 8607 } 8608 8609 if (is_guest_mode(vcpu) && 8610 kvm_x86_ops.nested_ops->hv_timer_pending && 8611 kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) 8612 *req_immediate_exit = true; 8613 8614 WARN_ON(vcpu->arch.exception.pending); 8615 return; 8616 8617 busy: 8618 *req_immediate_exit = true; 8619 return; 8620 } 8621 8622 static void process_nmi(struct kvm_vcpu *vcpu) 8623 { 8624 unsigned limit = 2; 8625 8626 /* 8627 * x86 is limited to one NMI running, and one NMI pending after it. 8628 * If an NMI is already in progress, limit further NMIs to just one. 8629 * Otherwise, allow two (and we'll inject the first one immediately). 8630 */ 8631 if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) 8632 limit = 1; 8633 8634 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); 8635 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); 8636 kvm_make_request(KVM_REQ_EVENT, vcpu); 8637 } 8638 8639 static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) 8640 { 8641 u32 flags = 0; 8642 flags |= seg->g << 23; 8643 flags |= seg->db << 22; 8644 flags |= seg->l << 21; 8645 flags |= seg->avl << 20; 8646 flags |= seg->present << 15; 8647 flags |= seg->dpl << 13; 8648 flags |= seg->s << 12; 8649 flags |= seg->type << 8; 8650 return flags; 8651 } 8652 8653 static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) 8654 { 8655 struct kvm_segment seg; 8656 int offset; 8657 8658 kvm_get_segment(vcpu, &seg, n); 8659 put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); 8660 8661 if (n < 3) 8662 offset = 0x7f84 + n * 12; 8663 else 8664 offset = 0x7f2c + (n - 3) * 12; 8665 8666 put_smstate(u32, buf, offset + 8, seg.base); 8667 put_smstate(u32, buf, offset + 4, seg.limit); 8668 put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); 8669 } 8670 8671 #ifdef CONFIG_X86_64 8672 static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) 8673 { 8674 struct kvm_segment seg; 8675 int offset; 8676 u16 flags; 8677 8678 kvm_get_segment(vcpu, &seg, n); 8679 offset = 0x7e00 + n * 16; 8680 8681 flags = enter_smm_get_segment_flags(&seg) >> 8; 8682 put_smstate(u16, buf, offset, seg.selector); 8683 put_smstate(u16, buf, offset + 2, flags); 8684 put_smstate(u32, buf, offset + 4, seg.limit); 8685 put_smstate(u64, buf, offset + 8, seg.base); 8686 } 8687 #endif 8688 8689 static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) 8690 { 8691 struct desc_ptr dt; 8692 struct kvm_segment seg; 8693 unsigned long val; 8694 int i; 8695 8696 put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); 8697 put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); 8698 put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); 8699 put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); 8700 8701 for (i = 0; i < 8; i++) 8702 put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); 8703 8704 kvm_get_dr(vcpu, 6, &val); 8705 put_smstate(u32, buf, 0x7fcc, (u32)val); 8706 kvm_get_dr(vcpu, 7, &val); 8707 put_smstate(u32, buf, 0x7fc8, (u32)val); 8708 8709 kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); 8710 put_smstate(u32, buf, 0x7fc4, seg.selector); 8711 put_smstate(u32, buf, 0x7f64, seg.base); 
8712 put_smstate(u32, buf, 0x7f60, seg.limit); 8713 put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); 8714 8715 kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); 8716 put_smstate(u32, buf, 0x7fc0, seg.selector); 8717 put_smstate(u32, buf, 0x7f80, seg.base); 8718 put_smstate(u32, buf, 0x7f7c, seg.limit); 8719 put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); 8720 8721 static_call(kvm_x86_get_gdt)(vcpu, &dt); 8722 put_smstate(u32, buf, 0x7f74, dt.address); 8723 put_smstate(u32, buf, 0x7f70, dt.size); 8724 8725 static_call(kvm_x86_get_idt)(vcpu, &dt); 8726 put_smstate(u32, buf, 0x7f58, dt.address); 8727 put_smstate(u32, buf, 0x7f54, dt.size); 8728 8729 for (i = 0; i < 6; i++) 8730 enter_smm_save_seg_32(vcpu, buf, i); 8731 8732 put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); 8733 8734 /* revision id */ 8735 put_smstate(u32, buf, 0x7efc, 0x00020000); 8736 put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); 8737 } 8738 8739 #ifdef CONFIG_X86_64 8740 static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) 8741 { 8742 struct desc_ptr dt; 8743 struct kvm_segment seg; 8744 unsigned long val; 8745 int i; 8746 8747 for (i = 0; i < 16; i++) 8748 put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); 8749 8750 put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); 8751 put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); 8752 8753 kvm_get_dr(vcpu, 6, &val); 8754 put_smstate(u64, buf, 0x7f68, val); 8755 kvm_get_dr(vcpu, 7, &val); 8756 put_smstate(u64, buf, 0x7f60, val); 8757 8758 put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); 8759 put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); 8760 put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); 8761 8762 put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); 8763 8764 /* revision id */ 8765 put_smstate(u32, buf, 0x7efc, 0x00020064); 8766 8767 put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); 8768 8769 kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); 8770 put_smstate(u16, buf, 0x7e90, seg.selector); 8771 put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); 8772 put_smstate(u32, buf, 0x7e94, seg.limit); 8773 put_smstate(u64, buf, 0x7e98, seg.base); 8774 8775 static_call(kvm_x86_get_idt)(vcpu, &dt); 8776 put_smstate(u32, buf, 0x7e84, dt.size); 8777 put_smstate(u64, buf, 0x7e88, dt.address); 8778 8779 kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); 8780 put_smstate(u16, buf, 0x7e70, seg.selector); 8781 put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); 8782 put_smstate(u32, buf, 0x7e74, seg.limit); 8783 put_smstate(u64, buf, 0x7e78, seg.base); 8784 8785 static_call(kvm_x86_get_gdt)(vcpu, &dt); 8786 put_smstate(u32, buf, 0x7e64, dt.size); 8787 put_smstate(u64, buf, 0x7e68, dt.address); 8788 8789 for (i = 0; i < 6; i++) 8790 enter_smm_save_seg_64(vcpu, buf, i); 8791 } 8792 #endif 8793 8794 static void enter_smm(struct kvm_vcpu *vcpu) 8795 { 8796 struct kvm_segment cs, ds; 8797 struct desc_ptr dt; 8798 char buf[512]; 8799 u32 cr0; 8800 8801 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); 8802 memset(buf, 0, 512); 8803 #ifdef CONFIG_X86_64 8804 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 8805 enter_smm_save_state_64(vcpu, buf); 8806 else 8807 #endif 8808 enter_smm_save_state_32(vcpu, buf); 8809 8810 /* 8811 * Give pre_enter_smm() a chance to make ISA-specific changes to the 8812 * vCPU state (e.g. leave guest mode) after we've saved the state into 8813 * the SMM state-save area. 
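	 *
	 * Rough background (may oversimplify): in practice the vendor hook
	 * forces a nested VM-exit if the vCPU is in guest mode and records
	 * enough state for RSM to undo it; all this function guarantees is
	 * the ordering: state saved to buf, then the hook, then HF_SMM_MASK.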
8814 */ 8815 static_call(kvm_x86_pre_enter_smm)(vcpu, buf); 8816 8817 vcpu->arch.hflags |= HF_SMM_MASK; 8818 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); 8819 8820 if (static_call(kvm_x86_get_nmi_mask)(vcpu)) 8821 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; 8822 else 8823 static_call(kvm_x86_set_nmi_mask)(vcpu, true); 8824 8825 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); 8826 kvm_rip_write(vcpu, 0x8000); 8827 8828 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); 8829 static_call(kvm_x86_set_cr0)(vcpu, cr0); 8830 vcpu->arch.cr0 = cr0; 8831 8832 static_call(kvm_x86_set_cr4)(vcpu, 0); 8833 8834 /* Undocumented: IDT limit is set to zero on entry to SMM. */ 8835 dt.address = dt.size = 0; 8836 static_call(kvm_x86_set_idt)(vcpu, &dt); 8837 8838 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 8839 8840 cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; 8841 cs.base = vcpu->arch.smbase; 8842 8843 ds.selector = 0; 8844 ds.base = 0; 8845 8846 cs.limit = ds.limit = 0xffffffff; 8847 cs.type = ds.type = 0x3; 8848 cs.dpl = ds.dpl = 0; 8849 cs.db = ds.db = 0; 8850 cs.s = ds.s = 1; 8851 cs.l = ds.l = 0; 8852 cs.g = ds.g = 1; 8853 cs.avl = ds.avl = 0; 8854 cs.present = ds.present = 1; 8855 cs.unusable = ds.unusable = 0; 8856 cs.padding = ds.padding = 0; 8857 8858 kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); 8859 kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); 8860 kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); 8861 kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); 8862 kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); 8863 kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); 8864 8865 #ifdef CONFIG_X86_64 8866 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 8867 static_call(kvm_x86_set_efer)(vcpu, 0); 8868 #endif 8869 8870 kvm_update_cpuid_runtime(vcpu); 8871 kvm_mmu_reset_context(vcpu); 8872 } 8873 8874 static void process_smi(struct kvm_vcpu *vcpu) 8875 { 8876 vcpu->arch.smi_pending = true; 8877 kvm_make_request(KVM_REQ_EVENT, vcpu); 8878 } 8879 8880 void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, 8881 unsigned long *vcpu_bitmap) 8882 { 8883 cpumask_var_t cpus; 8884 8885 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 8886 8887 kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, 8888 NULL, vcpu_bitmap, cpus); 8889 8890 free_cpumask_var(cpus); 8891 } 8892 8893 void kvm_make_scan_ioapic_request(struct kvm *kvm) 8894 { 8895 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); 8896 } 8897 8898 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) 8899 { 8900 if (!lapic_in_kernel(vcpu)) 8901 return; 8902 8903 vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); 8904 kvm_apic_update_apicv(vcpu); 8905 static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); 8906 } 8907 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); 8908 8909 /* 8910 * NOTE: Do not hold any lock prior to calling this. 8911 * 8912 * In particular, kvm_request_apicv_update() expects kvm->srcu not to be 8913 * locked, because it calls __x86_set_memory_region() which does 8914 * synchronize_srcu(&kvm->srcu). 
8915 */ 8916 void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) 8917 { 8918 struct kvm_vcpu *except; 8919 unsigned long old, new, expected; 8920 8921 if (!kvm_x86_ops.check_apicv_inhibit_reasons || 8922 !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) 8923 return; 8924 8925 old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); 8926 do { 8927 expected = new = old; 8928 if (activate) 8929 __clear_bit(bit, &new); 8930 else 8931 __set_bit(bit, &new); 8932 if (new == old) 8933 break; 8934 old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); 8935 } while (old != expected); 8936 8937 if (!!old == !!new) 8938 return; 8939 8940 trace_kvm_apicv_update_request(activate, bit); 8941 if (kvm_x86_ops.pre_update_apicv_exec_ctrl) 8942 static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); 8943 8944 /* 8945 * Sending request to update APICV for all other vcpus, 8946 * while update the calling vcpu immediately instead of 8947 * waiting for another #VMEXIT to handle the request. 8948 */ 8949 except = kvm_get_running_vcpu(); 8950 kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, 8951 except); 8952 if (except) 8953 kvm_vcpu_update_apicv(except); 8954 } 8955 EXPORT_SYMBOL_GPL(kvm_request_apicv_update); 8956 8957 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 8958 { 8959 if (!kvm_apic_present(vcpu)) 8960 return; 8961 8962 bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); 8963 8964 if (irqchip_split(vcpu->kvm)) 8965 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); 8966 else { 8967 if (vcpu->arch.apicv_active) 8968 static_call(kvm_x86_sync_pir_to_irr)(vcpu); 8969 if (ioapic_in_kernel(vcpu->kvm)) 8970 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); 8971 } 8972 8973 if (is_guest_mode(vcpu)) 8974 vcpu->arch.load_eoi_exitmap_pending = true; 8975 else 8976 kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); 8977 } 8978 8979 static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) 8980 { 8981 u64 eoi_exit_bitmap[4]; 8982 8983 if (!kvm_apic_hw_enabled(vcpu->arch.apic)) 8984 return; 8985 8986 if (to_hv_vcpu(vcpu)) 8987 bitmap_or((ulong *)eoi_exit_bitmap, 8988 vcpu->arch.ioapic_handled_vectors, 8989 to_hv_synic(vcpu)->vec_bitmap, 256); 8990 8991 static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); 8992 } 8993 8994 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 8995 unsigned long start, unsigned long end) 8996 { 8997 unsigned long apic_address; 8998 8999 /* 9000 * The physical address of apic access page is stored in the VMCS. 9001 * Update it when it becomes invalid. 9002 */ 9003 apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 9004 if (start <= apic_address && apic_address < end) 9005 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); 9006 } 9007 9008 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) 9009 { 9010 if (!lapic_in_kernel(vcpu)) 9011 return; 9012 9013 if (!kvm_x86_ops.set_apic_access_page_addr) 9014 return; 9015 9016 static_call(kvm_x86_set_apic_access_page_addr)(vcpu); 9017 } 9018 9019 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) 9020 { 9021 smp_send_reschedule(vcpu->cpu); 9022 } 9023 EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); 9024 9025 /* 9026 * Returns 1 to let vcpu_run() continue the guest execution loop without 9027 * exiting to the userspace. Otherwise, the value will be returned to the 9028 * userspace. 
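 *
 * In rough terms: the convention used below is r > 0 to keep looping in
 * vcpu_run(), r == 0 to exit to userspace with vcpu->run->exit_reason
 * already filled in, and r < 0 for an error that is propagated to the
 * KVM_RUN ioctl.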
9029 */ 9030 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 9031 { 9032 int r; 9033 bool req_int_win = 9034 dm_request_for_irq_injection(vcpu) && 9035 kvm_cpu_accept_dm_intr(vcpu); 9036 fastpath_t exit_fastpath; 9037 9038 bool req_immediate_exit = false; 9039 9040 /* Forbid vmenter if vcpu dirty ring is soft-full */ 9041 if (unlikely(vcpu->kvm->dirty_ring_size && 9042 kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { 9043 vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; 9044 trace_kvm_dirty_ring_exit(vcpu); 9045 r = 0; 9046 goto out; 9047 } 9048 9049 if (kvm_request_pending(vcpu)) { 9050 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 9051 if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { 9052 r = 0; 9053 goto out; 9054 } 9055 } 9056 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 9057 kvm_mmu_unload(vcpu); 9058 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 9059 __kvm_migrate_timers(vcpu); 9060 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) 9061 kvm_gen_update_masterclock(vcpu->kvm); 9062 if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) 9063 kvm_gen_kvmclock_update(vcpu); 9064 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 9065 r = kvm_guest_time_update(vcpu); 9066 if (unlikely(r)) 9067 goto out; 9068 } 9069 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 9070 kvm_mmu_sync_roots(vcpu); 9071 if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) 9072 kvm_mmu_load_pgd(vcpu); 9073 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { 9074 kvm_vcpu_flush_tlb_all(vcpu); 9075 9076 /* Flushing all ASIDs flushes the current ASID... */ 9077 kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 9078 } 9079 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 9080 kvm_vcpu_flush_tlb_current(vcpu); 9081 if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) 9082 kvm_vcpu_flush_tlb_guest(vcpu); 9083 9084 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { 9085 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 9086 r = 0; 9087 goto out; 9088 } 9089 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 9090 if (is_guest_mode(vcpu)) { 9091 kvm_x86_ops.nested_ops->triple_fault(vcpu); 9092 } else { 9093 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 9094 vcpu->mmio_needed = 0; 9095 r = 0; 9096 goto out; 9097 } 9098 } 9099 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { 9100 /* Page is swapped out. 
Do synthetic halt */ 9101 vcpu->arch.apf.halted = true; 9102 r = 1; 9103 goto out; 9104 } 9105 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) 9106 record_steal_time(vcpu); 9107 if (kvm_check_request(KVM_REQ_SMI, vcpu)) 9108 process_smi(vcpu); 9109 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 9110 process_nmi(vcpu); 9111 if (kvm_check_request(KVM_REQ_PMU, vcpu)) 9112 kvm_pmu_handle_event(vcpu); 9113 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 9114 kvm_pmu_deliver_pmi(vcpu); 9115 if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { 9116 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); 9117 if (test_bit(vcpu->arch.pending_ioapic_eoi, 9118 vcpu->arch.ioapic_handled_vectors)) { 9119 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; 9120 vcpu->run->eoi.vector = 9121 vcpu->arch.pending_ioapic_eoi; 9122 r = 0; 9123 goto out; 9124 } 9125 } 9126 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) 9127 vcpu_scan_ioapic(vcpu); 9128 if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu)) 9129 vcpu_load_eoi_exitmap(vcpu); 9130 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) 9131 kvm_vcpu_reload_apic_access_page(vcpu); 9132 if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { 9133 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 9134 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; 9135 r = 0; 9136 goto out; 9137 } 9138 if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { 9139 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 9140 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; 9141 r = 0; 9142 goto out; 9143 } 9144 if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { 9145 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 9146 9147 vcpu->run->exit_reason = KVM_EXIT_HYPERV; 9148 vcpu->run->hyperv = hv_vcpu->exit; 9149 r = 0; 9150 goto out; 9151 } 9152 9153 /* 9154 * KVM_REQ_HV_STIMER has to be processed after 9155 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers 9156 * depend on the guest clock being up-to-date 9157 */ 9158 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) 9159 kvm_hv_process_stimers(vcpu); 9160 if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) 9161 kvm_vcpu_update_apicv(vcpu); 9162 if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) 9163 kvm_check_async_pf_completion(vcpu); 9164 if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) 9165 static_call(kvm_x86_msr_filter_changed)(vcpu); 9166 9167 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) 9168 static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); 9169 } 9170 9171 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || 9172 kvm_xen_has_interrupt(vcpu)) { 9173 ++vcpu->stat.req_event; 9174 kvm_apic_accept_events(vcpu); 9175 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 9176 r = 1; 9177 goto out; 9178 } 9179 9180 inject_pending_event(vcpu, &req_immediate_exit); 9181 if (req_int_win) 9182 static_call(kvm_x86_enable_irq_window)(vcpu); 9183 9184 if (kvm_lapic_enabled(vcpu)) { 9185 update_cr8_intercept(vcpu); 9186 kvm_lapic_sync_to_vapic(vcpu); 9187 } 9188 } 9189 9190 r = kvm_mmu_reload(vcpu); 9191 if (unlikely(r)) { 9192 goto cancel_injection; 9193 } 9194 9195 preempt_disable(); 9196 9197 static_call(kvm_x86_prepare_guest_switch)(vcpu); 9198 9199 /* 9200 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt 9201 * IPI are then delayed after guest entry, which ensures that they 9202 * result in virtual interrupt delivery. 
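	 *
	 * In other words (rough summary): once IRQs are off and vcpu->mode is
	 * IN_GUEST_MODE, a posted-interrupt notification IPI sent by another
	 * CPU is not consumed by the host; it is delivered after VM entry,
	 * where APICv/AVIC-capable hardware turns it into a virtual interrupt
	 * instead of a VM exit.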
9203 */ 9204 local_irq_disable(); 9205 vcpu->mode = IN_GUEST_MODE; 9206 9207 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 9208 9209 /* 9210 * 1) We should set ->mode before checking ->requests. Please see 9211 * the comment in kvm_vcpu_exiting_guest_mode(). 9212 * 9213 * 2) For APICv, we should set ->mode before checking PID.ON. This 9214 * pairs with the memory barrier implicit in pi_test_and_set_on 9215 * (see vmx_deliver_posted_interrupt). 9216 * 9217 * 3) This also orders the write to mode from any reads to the page 9218 * tables done while the VCPU is running. Please see the comment 9219 * in kvm_flush_remote_tlbs. 9220 */ 9221 smp_mb__after_srcu_read_unlock(); 9222 9223 /* 9224 * This handles the case where a posted interrupt was 9225 * notified with kvm_vcpu_kick. 9226 */ 9227 if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) 9228 static_call(kvm_x86_sync_pir_to_irr)(vcpu); 9229 9230 if (kvm_vcpu_exit_request(vcpu)) { 9231 vcpu->mode = OUTSIDE_GUEST_MODE; 9232 smp_wmb(); 9233 local_irq_enable(); 9234 preempt_enable(); 9235 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 9236 r = 1; 9237 goto cancel_injection; 9238 } 9239 9240 if (req_immediate_exit) { 9241 kvm_make_request(KVM_REQ_EVENT, vcpu); 9242 static_call(kvm_x86_request_immediate_exit)(vcpu); 9243 } 9244 9245 fpregs_assert_state_consistent(); 9246 if (test_thread_flag(TIF_NEED_FPU_LOAD)) 9247 switch_fpu_return(); 9248 9249 if (unlikely(vcpu->arch.switch_db_regs)) { 9250 set_debugreg(0, 7); 9251 set_debugreg(vcpu->arch.eff_db[0], 0); 9252 set_debugreg(vcpu->arch.eff_db[1], 1); 9253 set_debugreg(vcpu->arch.eff_db[2], 2); 9254 set_debugreg(vcpu->arch.eff_db[3], 3); 9255 set_debugreg(vcpu->arch.dr6, 6); 9256 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; 9257 } 9258 9259 for (;;) { 9260 exit_fastpath = static_call(kvm_x86_run)(vcpu); 9261 if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) 9262 break; 9263 9264 if (unlikely(kvm_vcpu_exit_request(vcpu))) { 9265 exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; 9266 break; 9267 } 9268 9269 if (vcpu->arch.apicv_active) 9270 static_call(kvm_x86_sync_pir_to_irr)(vcpu); 9271 } 9272 9273 /* 9274 * Do this here before restoring debug registers on the host. And 9275 * since we do this before handling the vmexit, a DR access vmexit 9276 * can (a) read the correct value of the debug registers, (b) set 9277 * KVM_DEBUGREG_WONT_EXIT again. 9278 */ 9279 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { 9280 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); 9281 static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); 9282 kvm_update_dr0123(vcpu); 9283 kvm_update_dr7(vcpu); 9284 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; 9285 } 9286 9287 /* 9288 * If the guest has used debug registers, at least dr7 9289 * will be disabled while returning to the host. 9290 * If we don't have active breakpoints in the host, we don't 9291 * care about the messed up debug address registers. But if 9292 * we have some of them active, restore the old state. 9293 */ 9294 if (hw_breakpoint_active()) 9295 hw_breakpoint_restore(); 9296 9297 vcpu->arch.last_vmentry_cpu = vcpu->cpu; 9298 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); 9299 9300 vcpu->mode = OUTSIDE_GUEST_MODE; 9301 smp_wmb(); 9302 9303 static_call(kvm_x86_handle_exit_irqoff)(vcpu); 9304 9305 /* 9306 * Consume any pending interrupts, including the possible source of 9307 * VM-Exit on SVM and any ticks that occur between VM-Exit and now. 
9308 * An instruction is required after local_irq_enable() to fully unblock 9309 * interrupts on processors that implement an interrupt shadow, the 9310 * stat.exits increment will do nicely. 9311 */ 9312 kvm_before_interrupt(vcpu); 9313 local_irq_enable(); 9314 ++vcpu->stat.exits; 9315 local_irq_disable(); 9316 kvm_after_interrupt(vcpu); 9317 9318 /* 9319 * Wait until after servicing IRQs to account guest time so that any 9320 * ticks that occurred while running the guest are properly accounted 9321 * to the guest. Waiting until IRQs are enabled degrades the accuracy 9322 * of accounting via context tracking, but the loss of accuracy is 9323 * acceptable for all known use cases. 9324 */ 9325 vtime_account_guest_exit(); 9326 9327 if (lapic_in_kernel(vcpu)) { 9328 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; 9329 if (delta != S64_MIN) { 9330 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); 9331 vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; 9332 } 9333 } 9334 9335 local_irq_enable(); 9336 preempt_enable(); 9337 9338 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 9339 9340 /* 9341 * Profile KVM exit RIPs: 9342 */ 9343 if (unlikely(prof_on == KVM_PROFILING)) { 9344 unsigned long rip = kvm_rip_read(vcpu); 9345 profile_hit(KVM_PROFILING, (void *)rip); 9346 } 9347 9348 if (unlikely(vcpu->arch.tsc_always_catchup)) 9349 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 9350 9351 if (vcpu->arch.apic_attention) 9352 kvm_lapic_sync_from_vapic(vcpu); 9353 9354 r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); 9355 return r; 9356 9357 cancel_injection: 9358 if (req_immediate_exit) 9359 kvm_make_request(KVM_REQ_EVENT, vcpu); 9360 static_call(kvm_x86_cancel_injection)(vcpu); 9361 if (unlikely(vcpu->arch.apic_attention)) 9362 kvm_lapic_sync_from_vapic(vcpu); 9363 out: 9364 return r; 9365 } 9366 9367 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) 9368 { 9369 if (!kvm_arch_vcpu_runnable(vcpu) && 9370 (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { 9371 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 9372 kvm_vcpu_block(vcpu); 9373 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 9374 9375 if (kvm_x86_ops.post_block) 9376 static_call(kvm_x86_post_block)(vcpu); 9377 9378 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) 9379 return 1; 9380 } 9381 9382 kvm_apic_accept_events(vcpu); 9383 switch(vcpu->arch.mp_state) { 9384 case KVM_MP_STATE_HALTED: 9385 case KVM_MP_STATE_AP_RESET_HOLD: 9386 vcpu->arch.pv.pv_unhalted = false; 9387 vcpu->arch.mp_state = 9388 KVM_MP_STATE_RUNNABLE; 9389 fallthrough; 9390 case KVM_MP_STATE_RUNNABLE: 9391 vcpu->arch.apf.halted = false; 9392 break; 9393 case KVM_MP_STATE_INIT_RECEIVED: 9394 break; 9395 default: 9396 return -EINTR; 9397 } 9398 return 1; 9399 } 9400 9401 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 9402 { 9403 if (is_guest_mode(vcpu)) 9404 kvm_check_nested_events(vcpu); 9405 9406 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 9407 !vcpu->arch.apf.halted); 9408 } 9409 9410 static int vcpu_run(struct kvm_vcpu *vcpu) 9411 { 9412 int r; 9413 struct kvm *kvm = vcpu->kvm; 9414 9415 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 9416 vcpu->arch.l1tf_flush_l1d = true; 9417 9418 for (;;) { 9419 if (kvm_vcpu_running(vcpu)) { 9420 r = vcpu_enter_guest(vcpu); 9421 } else { 9422 r = vcpu_block(kvm, vcpu); 9423 } 9424 9425 if (r <= 0) 9426 break; 9427 9428 kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); 9429 if (kvm_cpu_has_pending_timer(vcpu)) 9430 
kvm_inject_pending_timer_irqs(vcpu); 9431 9432 if (dm_request_for_irq_injection(vcpu) && 9433 kvm_vcpu_ready_for_interrupt_injection(vcpu)) { 9434 r = 0; 9435 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 9436 ++vcpu->stat.request_irq_exits; 9437 break; 9438 } 9439 9440 if (__xfer_to_guest_mode_work_pending()) { 9441 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 9442 r = xfer_to_guest_mode_handle_work(vcpu); 9443 if (r) 9444 return r; 9445 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 9446 } 9447 } 9448 9449 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 9450 9451 return r; 9452 } 9453 9454 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) 9455 { 9456 int r; 9457 9458 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 9459 r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 9460 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 9461 return r; 9462 } 9463 9464 static int complete_emulated_pio(struct kvm_vcpu *vcpu) 9465 { 9466 BUG_ON(!vcpu->arch.pio.count); 9467 9468 return complete_emulated_io(vcpu); 9469 } 9470 9471 /* 9472 * Implements the following, as a state machine: 9473 * 9474 * read: 9475 * for each fragment 9476 * for each mmio piece in the fragment 9477 * write gpa, len 9478 * exit 9479 * copy data 9480 * execute insn 9481 * 9482 * write: 9483 * for each fragment 9484 * for each mmio piece in the fragment 9485 * write gpa, len 9486 * copy data 9487 * exit 9488 */ 9489 static int complete_emulated_mmio(struct kvm_vcpu *vcpu) 9490 { 9491 struct kvm_run *run = vcpu->run; 9492 struct kvm_mmio_fragment *frag; 9493 unsigned len; 9494 9495 BUG_ON(!vcpu->mmio_needed); 9496 9497 /* Complete previous fragment */ 9498 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; 9499 len = min(8u, frag->len); 9500 if (!vcpu->mmio_is_write) 9501 memcpy(frag->data, run->mmio.data, len); 9502 9503 if (frag->len <= 8) { 9504 /* Switch to the next fragment. */ 9505 frag++; 9506 vcpu->mmio_cur_fragment++; 9507 } else { 9508 /* Go forward to the next mmio piece. */ 9509 frag->data += len; 9510 frag->gpa += len; 9511 frag->len -= len; 9512 } 9513 9514 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { 9515 vcpu->mmio_needed = 0; 9516 9517 /* FIXME: return into emulator if single-stepping. */ 9518 if (vcpu->mmio_is_write) 9519 return 1; 9520 vcpu->mmio_read_completed = 1; 9521 return complete_emulated_io(vcpu); 9522 } 9523 9524 run->exit_reason = KVM_EXIT_MMIO; 9525 run->mmio.phys_addr = frag->gpa; 9526 if (vcpu->mmio_is_write) 9527 memcpy(run->mmio.data, frag->data, min(8u, frag->len)); 9528 run->mmio.len = min(8u, frag->len); 9529 run->mmio.is_write = vcpu->mmio_is_write; 9530 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 9531 return 0; 9532 } 9533 9534 static void kvm_save_current_fpu(struct fpu *fpu) 9535 { 9536 /* 9537 * If the target FPU state is not resident in the CPU registers, just 9538 * memcpy() from current, else save CPU state directly to the target. 9539 */ 9540 if (test_thread_flag(TIF_NEED_FPU_LOAD)) 9541 memcpy(&fpu->state, ¤t->thread.fpu.state, 9542 fpu_kernel_xstate_size); 9543 else 9544 copy_fpregs_to_fpstate(fpu); 9545 } 9546 9547 /* Swap (qemu) user FPU context for the guest FPU context. */ 9548 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 9549 { 9550 fpregs_lock(); 9551 9552 kvm_save_current_fpu(vcpu->arch.user_fpu); 9553 9554 /* 9555 * Guests with protected state can't have it set by the hypervisor, 9556 * so skip trying to set it. 9557 */ 9558 if (vcpu->arch.guest_fpu) 9559 /* PKRU is separately restored in kvm_x86_ops.run. 
*/ 9560 __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, 9561 ~XFEATURE_MASK_PKRU); 9562 9563 fpregs_mark_activate(); 9564 fpregs_unlock(); 9565 9566 trace_kvm_fpu(1); 9567 } 9568 9569 /* When vcpu_run ends, restore user space FPU context. */ 9570 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 9571 { 9572 fpregs_lock(); 9573 9574 /* 9575 * Guests with protected state can't have it read by the hypervisor, 9576 * so skip trying to save it. 9577 */ 9578 if (vcpu->arch.guest_fpu) 9579 kvm_save_current_fpu(vcpu->arch.guest_fpu); 9580 9581 copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); 9582 9583 fpregs_mark_activate(); 9584 fpregs_unlock(); 9585 9586 ++vcpu->stat.fpu_reload; 9587 trace_kvm_fpu(0); 9588 } 9589 9590 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) 9591 { 9592 struct kvm_run *kvm_run = vcpu->run; 9593 int r; 9594 9595 vcpu_load(vcpu); 9596 kvm_sigset_activate(vcpu); 9597 kvm_run->flags = 0; 9598 kvm_load_guest_fpu(vcpu); 9599 9600 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 9601 if (kvm_run->immediate_exit) { 9602 r = -EINTR; 9603 goto out; 9604 } 9605 kvm_vcpu_block(vcpu); 9606 kvm_apic_accept_events(vcpu); 9607 kvm_clear_request(KVM_REQ_UNHALT, vcpu); 9608 r = -EAGAIN; 9609 if (signal_pending(current)) { 9610 r = -EINTR; 9611 kvm_run->exit_reason = KVM_EXIT_INTR; 9612 ++vcpu->stat.signal_exits; 9613 } 9614 goto out; 9615 } 9616 9617 if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { 9618 r = -EINVAL; 9619 goto out; 9620 } 9621 9622 if (kvm_run->kvm_dirty_regs) { 9623 r = sync_regs(vcpu); 9624 if (r != 0) 9625 goto out; 9626 } 9627 9628 /* re-sync apic's tpr */ 9629 if (!lapic_in_kernel(vcpu)) { 9630 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 9631 r = -EINVAL; 9632 goto out; 9633 } 9634 } 9635 9636 if (unlikely(vcpu->arch.complete_userspace_io)) { 9637 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; 9638 vcpu->arch.complete_userspace_io = NULL; 9639 r = cui(vcpu); 9640 if (r <= 0) 9641 goto out; 9642 } else 9643 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 9644 9645 if (kvm_run->immediate_exit) 9646 r = -EINTR; 9647 else 9648 r = vcpu_run(vcpu); 9649 9650 out: 9651 kvm_put_guest_fpu(vcpu); 9652 if (kvm_run->kvm_valid_regs) 9653 store_regs(vcpu); 9654 post_kvm_run_save(vcpu); 9655 kvm_sigset_deactivate(vcpu); 9656 9657 vcpu_put(vcpu); 9658 return r; 9659 } 9660 9661 static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 9662 { 9663 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { 9664 /* 9665 * We are here if userspace calls get_regs() in the middle of 9666 * instruction emulation. Registers state needs to be copied 9667 * back from emulation context to vcpu. 
Userspace shouldn't do 9668 * that usually, but some bad designed PV devices (vmware 9669 * backdoor interface) need this to work 9670 */ 9671 emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); 9672 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 9673 } 9674 regs->rax = kvm_rax_read(vcpu); 9675 regs->rbx = kvm_rbx_read(vcpu); 9676 regs->rcx = kvm_rcx_read(vcpu); 9677 regs->rdx = kvm_rdx_read(vcpu); 9678 regs->rsi = kvm_rsi_read(vcpu); 9679 regs->rdi = kvm_rdi_read(vcpu); 9680 regs->rsp = kvm_rsp_read(vcpu); 9681 regs->rbp = kvm_rbp_read(vcpu); 9682 #ifdef CONFIG_X86_64 9683 regs->r8 = kvm_r8_read(vcpu); 9684 regs->r9 = kvm_r9_read(vcpu); 9685 regs->r10 = kvm_r10_read(vcpu); 9686 regs->r11 = kvm_r11_read(vcpu); 9687 regs->r12 = kvm_r12_read(vcpu); 9688 regs->r13 = kvm_r13_read(vcpu); 9689 regs->r14 = kvm_r14_read(vcpu); 9690 regs->r15 = kvm_r15_read(vcpu); 9691 #endif 9692 9693 regs->rip = kvm_rip_read(vcpu); 9694 regs->rflags = kvm_get_rflags(vcpu); 9695 } 9696 9697 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 9698 { 9699 vcpu_load(vcpu); 9700 __get_regs(vcpu, regs); 9701 vcpu_put(vcpu); 9702 return 0; 9703 } 9704 9705 static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 9706 { 9707 vcpu->arch.emulate_regs_need_sync_from_vcpu = true; 9708 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 9709 9710 kvm_rax_write(vcpu, regs->rax); 9711 kvm_rbx_write(vcpu, regs->rbx); 9712 kvm_rcx_write(vcpu, regs->rcx); 9713 kvm_rdx_write(vcpu, regs->rdx); 9714 kvm_rsi_write(vcpu, regs->rsi); 9715 kvm_rdi_write(vcpu, regs->rdi); 9716 kvm_rsp_write(vcpu, regs->rsp); 9717 kvm_rbp_write(vcpu, regs->rbp); 9718 #ifdef CONFIG_X86_64 9719 kvm_r8_write(vcpu, regs->r8); 9720 kvm_r9_write(vcpu, regs->r9); 9721 kvm_r10_write(vcpu, regs->r10); 9722 kvm_r11_write(vcpu, regs->r11); 9723 kvm_r12_write(vcpu, regs->r12); 9724 kvm_r13_write(vcpu, regs->r13); 9725 kvm_r14_write(vcpu, regs->r14); 9726 kvm_r15_write(vcpu, regs->r15); 9727 #endif 9728 9729 kvm_rip_write(vcpu, regs->rip); 9730 kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); 9731 9732 vcpu->arch.exception.pending = false; 9733 9734 kvm_make_request(KVM_REQ_EVENT, vcpu); 9735 } 9736 9737 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 9738 { 9739 vcpu_load(vcpu); 9740 __set_regs(vcpu, regs); 9741 vcpu_put(vcpu); 9742 return 0; 9743 } 9744 9745 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 9746 { 9747 struct kvm_segment cs; 9748 9749 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 9750 *db = cs.db; 9751 *l = cs.l; 9752 } 9753 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 9754 9755 static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 9756 { 9757 struct desc_ptr dt; 9758 9759 if (vcpu->arch.guest_state_protected) 9760 goto skip_protected_regs; 9761 9762 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 9763 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 9764 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 9765 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 9766 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 9767 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 9768 9769 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 9770 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 9771 9772 static_call(kvm_x86_get_idt)(vcpu, &dt); 9773 sregs->idt.limit = dt.size; 9774 sregs->idt.base = dt.address; 9775 static_call(kvm_x86_get_gdt)(vcpu, &dt); 9776 sregs->gdt.limit = dt.size; 9777 sregs->gdt.base = dt.address; 9778 9779 sregs->cr2 = 
vcpu->arch.cr2; 9780 sregs->cr3 = kvm_read_cr3(vcpu); 9781 9782 skip_protected_regs: 9783 sregs->cr0 = kvm_read_cr0(vcpu); 9784 sregs->cr4 = kvm_read_cr4(vcpu); 9785 sregs->cr8 = kvm_get_cr8(vcpu); 9786 sregs->efer = vcpu->arch.efer; 9787 sregs->apic_base = kvm_get_apic_base(vcpu); 9788 9789 memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); 9790 9791 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) 9792 set_bit(vcpu->arch.interrupt.nr, 9793 (unsigned long *)sregs->interrupt_bitmap); 9794 } 9795 9796 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 9797 struct kvm_sregs *sregs) 9798 { 9799 vcpu_load(vcpu); 9800 __get_sregs(vcpu, sregs); 9801 vcpu_put(vcpu); 9802 return 0; 9803 } 9804 9805 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 9806 struct kvm_mp_state *mp_state) 9807 { 9808 vcpu_load(vcpu); 9809 if (kvm_mpx_supported()) 9810 kvm_load_guest_fpu(vcpu); 9811 9812 kvm_apic_accept_events(vcpu); 9813 if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || 9814 vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && 9815 vcpu->arch.pv.pv_unhalted) 9816 mp_state->mp_state = KVM_MP_STATE_RUNNABLE; 9817 else 9818 mp_state->mp_state = vcpu->arch.mp_state; 9819 9820 if (kvm_mpx_supported()) 9821 kvm_put_guest_fpu(vcpu); 9822 vcpu_put(vcpu); 9823 return 0; 9824 } 9825 9826 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 9827 struct kvm_mp_state *mp_state) 9828 { 9829 int ret = -EINVAL; 9830 9831 vcpu_load(vcpu); 9832 9833 if (!lapic_in_kernel(vcpu) && 9834 mp_state->mp_state != KVM_MP_STATE_RUNNABLE) 9835 goto out; 9836 9837 /* 9838 * KVM_MP_STATE_INIT_RECEIVED means the processor is in 9839 * INIT state; latched init should be reported using 9840 * KVM_SET_VCPU_EVENTS, so reject it here. 9841 */ 9842 if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && 9843 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || 9844 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) 9845 goto out; 9846 9847 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { 9848 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 9849 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); 9850 } else 9851 vcpu->arch.mp_state = mp_state->mp_state; 9852 kvm_make_request(KVM_REQ_EVENT, vcpu); 9853 9854 ret = 0; 9855 out: 9856 vcpu_put(vcpu); 9857 return ret; 9858 } 9859 9860 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, 9861 int reason, bool has_error_code, u32 error_code) 9862 { 9863 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 9864 int ret; 9865 9866 init_emulate_ctxt(vcpu); 9867 9868 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, 9869 has_error_code, error_code); 9870 if (ret) { 9871 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 9872 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 9873 vcpu->run->internal.ndata = 0; 9874 return 0; 9875 } 9876 9877 kvm_rip_write(vcpu, ctxt->eip); 9878 kvm_set_rflags(vcpu, ctxt->eflags); 9879 return 1; 9880 } 9881 EXPORT_SYMBOL_GPL(kvm_task_switch); 9882 9883 static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 9884 { 9885 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { 9886 /* 9887 * When EFER.LME and CR0.PG are set, the processor is in 9888 * 64-bit mode (though maybe in a 32-bit code segment). 9889 * CR4.PAE and EFER.LMA must be set. 
9890 */ 9891 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) 9892 return false; 9893 if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) 9894 return false; 9895 } else { 9896 /* 9897 * Not in 64-bit mode: EFER.LMA is clear and the code 9898 * segment cannot be 64-bit. 9899 */ 9900 if (sregs->efer & EFER_LMA || sregs->cs.l) 9901 return false; 9902 } 9903 9904 return kvm_is_valid_cr4(vcpu, sregs->cr4); 9905 } 9906 9907 static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 9908 { 9909 struct msr_data apic_base_msr; 9910 int mmu_reset_needed = 0; 9911 int pending_vec, max_bits, idx; 9912 struct desc_ptr dt; 9913 int ret = -EINVAL; 9914 9915 if (!kvm_is_valid_sregs(vcpu, sregs)) 9916 goto out; 9917 9918 apic_base_msr.data = sregs->apic_base; 9919 apic_base_msr.host_initiated = true; 9920 if (kvm_set_apic_base(vcpu, &apic_base_msr)) 9921 goto out; 9922 9923 if (vcpu->arch.guest_state_protected) 9924 goto skip_protected_regs; 9925 9926 dt.size = sregs->idt.limit; 9927 dt.address = sregs->idt.base; 9928 static_call(kvm_x86_set_idt)(vcpu, &dt); 9929 dt.size = sregs->gdt.limit; 9930 dt.address = sregs->gdt.base; 9931 static_call(kvm_x86_set_gdt)(vcpu, &dt); 9932 9933 vcpu->arch.cr2 = sregs->cr2; 9934 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; 9935 vcpu->arch.cr3 = sregs->cr3; 9936 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 9937 9938 kvm_set_cr8(vcpu, sregs->cr8); 9939 9940 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 9941 static_call(kvm_x86_set_efer)(vcpu, sregs->efer); 9942 9943 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 9944 static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); 9945 vcpu->arch.cr0 = sregs->cr0; 9946 9947 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 9948 static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); 9949 9950 idx = srcu_read_lock(&vcpu->kvm->srcu); 9951 if (is_pae_paging(vcpu)) { 9952 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 9953 mmu_reset_needed = 1; 9954 } 9955 srcu_read_unlock(&vcpu->kvm->srcu, idx); 9956 9957 if (mmu_reset_needed) 9958 kvm_mmu_reset_context(vcpu); 9959 9960 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 9961 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 9962 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 9963 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 9964 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 9965 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 9966 9967 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 9968 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 9969 9970 update_cr8_intercept(vcpu); 9971 9972 /* Older userspace won't unhalt the vcpu on reset. 
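 * If the incoming state still matches the architectural reset vector (BSP in
 * real mode with CS selector 0xf000, CS base 0xffff0000 and RIP 0xfff0), mark
 * the vcpu runnable here instead.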
*/ 9973 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 9974 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 9975 !is_protmode(vcpu)) 9976 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 9977 9978 skip_protected_regs: 9979 max_bits = KVM_NR_INTERRUPTS; 9980 pending_vec = find_first_bit( 9981 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 9982 if (pending_vec < max_bits) { 9983 kvm_queue_interrupt(vcpu, pending_vec, false); 9984 pr_debug("Set back pending irq %d\n", pending_vec); 9985 } 9986 9987 kvm_make_request(KVM_REQ_EVENT, vcpu); 9988 9989 ret = 0; 9990 out: 9991 return ret; 9992 } 9993 9994 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 9995 struct kvm_sregs *sregs) 9996 { 9997 int ret; 9998 9999 vcpu_load(vcpu); 10000 ret = __set_sregs(vcpu, sregs); 10001 vcpu_put(vcpu); 10002 return ret; 10003 } 10004 10005 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 10006 struct kvm_guest_debug *dbg) 10007 { 10008 unsigned long rflags; 10009 int i, r; 10010 10011 if (vcpu->arch.guest_state_protected) 10012 return -EINVAL; 10013 10014 vcpu_load(vcpu); 10015 10016 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 10017 r = -EBUSY; 10018 if (vcpu->arch.exception.pending) 10019 goto out; 10020 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 10021 kvm_queue_exception(vcpu, DB_VECTOR); 10022 else 10023 kvm_queue_exception(vcpu, BP_VECTOR); 10024 } 10025 10026 /* 10027 * Read rflags as long as potentially injected trace flags are still 10028 * filtered out. 10029 */ 10030 rflags = kvm_get_rflags(vcpu); 10031 10032 vcpu->guest_debug = dbg->control; 10033 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 10034 vcpu->guest_debug = 0; 10035 10036 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 10037 for (i = 0; i < KVM_NR_DB_REGS; ++i) 10038 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 10039 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; 10040 } else { 10041 for (i = 0; i < KVM_NR_DB_REGS; i++) 10042 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 10043 } 10044 kvm_update_dr7(vcpu); 10045 10046 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 10047 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 10048 get_segment_base(vcpu, VCPU_SREG_CS); 10049 10050 /* 10051 * Trigger an rflags update that will inject or remove the trace 10052 * flags. 10053 */ 10054 kvm_set_rflags(vcpu, rflags); 10055 10056 static_call(kvm_x86_update_exception_bitmap)(vcpu); 10057 10058 r = 0; 10059 10060 out: 10061 vcpu_put(vcpu); 10062 return r; 10063 } 10064 10065 /* 10066 * Translate a guest virtual address to a guest physical address. 
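 * (This backs the KVM_TRANSLATE vcpu ioctl.)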
10067 */ 10068 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 10069 struct kvm_translation *tr) 10070 { 10071 unsigned long vaddr = tr->linear_address; 10072 gpa_t gpa; 10073 int idx; 10074 10075 vcpu_load(vcpu); 10076 10077 idx = srcu_read_lock(&vcpu->kvm->srcu); 10078 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 10079 srcu_read_unlock(&vcpu->kvm->srcu, idx); 10080 tr->physical_address = gpa; 10081 tr->valid = gpa != UNMAPPED_GVA; 10082 tr->writeable = 1; 10083 tr->usermode = 0; 10084 10085 vcpu_put(vcpu); 10086 return 0; 10087 } 10088 10089 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 10090 { 10091 struct fxregs_state *fxsave; 10092 10093 if (!vcpu->arch.guest_fpu) 10094 return 0; 10095 10096 vcpu_load(vcpu); 10097 10098 fxsave = &vcpu->arch.guest_fpu->state.fxsave; 10099 memcpy(fpu->fpr, fxsave->st_space, 128); 10100 fpu->fcw = fxsave->cwd; 10101 fpu->fsw = fxsave->swd; 10102 fpu->ftwx = fxsave->twd; 10103 fpu->last_opcode = fxsave->fop; 10104 fpu->last_ip = fxsave->rip; 10105 fpu->last_dp = fxsave->rdp; 10106 memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space)); 10107 10108 vcpu_put(vcpu); 10109 return 0; 10110 } 10111 10112 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 10113 { 10114 struct fxregs_state *fxsave; 10115 10116 if (!vcpu->arch.guest_fpu) 10117 return 0; 10118 10119 vcpu_load(vcpu); 10120 10121 fxsave = &vcpu->arch.guest_fpu->state.fxsave; 10122 10123 memcpy(fxsave->st_space, fpu->fpr, 128); 10124 fxsave->cwd = fpu->fcw; 10125 fxsave->swd = fpu->fsw; 10126 fxsave->twd = fpu->ftwx; 10127 fxsave->fop = fpu->last_opcode; 10128 fxsave->rip = fpu->last_ip; 10129 fxsave->rdp = fpu->last_dp; 10130 memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space)); 10131 10132 vcpu_put(vcpu); 10133 return 0; 10134 } 10135 10136 static void store_regs(struct kvm_vcpu *vcpu) 10137 { 10138 BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES); 10139 10140 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS) 10141 __get_regs(vcpu, &vcpu->run->s.regs.regs); 10142 10143 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS) 10144 __get_sregs(vcpu, &vcpu->run->s.regs.sregs); 10145 10146 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS) 10147 kvm_vcpu_ioctl_x86_get_vcpu_events( 10148 vcpu, &vcpu->run->s.regs.events); 10149 } 10150 10151 static int sync_regs(struct kvm_vcpu *vcpu) 10152 { 10153 if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) 10154 return -EINVAL; 10155 10156 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { 10157 __set_regs(vcpu, &vcpu->run->s.regs.regs); 10158 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; 10159 } 10160 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { 10161 if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs)) 10162 return -EINVAL; 10163 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; 10164 } 10165 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { 10166 if (kvm_vcpu_ioctl_x86_set_vcpu_events( 10167 vcpu, &vcpu->run->s.regs.events)) 10168 return -EINVAL; 10169 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; 10170 } 10171 10172 return 0; 10173 } 10174 10175 static void fx_init(struct kvm_vcpu *vcpu) 10176 { 10177 if (!vcpu->arch.guest_fpu) 10178 return; 10179 10180 fpstate_init(&vcpu->arch.guest_fpu->state); 10181 if (boot_cpu_has(X86_FEATURE_XSAVES)) 10182 vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = 10183 host_xcr0 | XSTATE_COMPACTION_ENABLED; 10184 10185 /* 10186 * Ensure guest xcr0 is valid for loading 10187 */ 10188 
vcpu->arch.xcr0 = XFEATURE_MASK_FP; 10189 10190 vcpu->arch.cr0 |= X86_CR0_ET; 10191 } 10192 10193 void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) 10194 { 10195 if (vcpu->arch.guest_fpu) { 10196 kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); 10197 vcpu->arch.guest_fpu = NULL; 10198 } 10199 } 10200 EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); 10201 10202 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 10203 { 10204 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) 10205 pr_warn_once("kvm: SMP vm created on host with unstable TSC; " 10206 "guest TSC will not be reliable\n"); 10207 10208 return 0; 10209 } 10210 10211 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) 10212 { 10213 struct page *page; 10214 int r; 10215 10216 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 10217 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 10218 else 10219 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 10220 10221 kvm_set_tsc_khz(vcpu, max_tsc_khz); 10222 10223 r = kvm_mmu_create(vcpu); 10224 if (r < 0) 10225 return r; 10226 10227 if (irqchip_in_kernel(vcpu->kvm)) { 10228 r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); 10229 if (r < 0) 10230 goto fail_mmu_destroy; 10231 if (kvm_apicv_activated(vcpu->kvm)) 10232 vcpu->arch.apicv_active = true; 10233 } else 10234 static_branch_inc(&kvm_has_noapic_vcpu); 10235 10236 r = -ENOMEM; 10237 10238 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 10239 if (!page) 10240 goto fail_free_lapic; 10241 vcpu->arch.pio_data = page_address(page); 10242 10243 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 10244 GFP_KERNEL_ACCOUNT); 10245 if (!vcpu->arch.mce_banks) 10246 goto fail_free_pio_data; 10247 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 10248 10249 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, 10250 GFP_KERNEL_ACCOUNT)) 10251 goto fail_free_mce_banks; 10252 10253 if (!alloc_emulate_ctxt(vcpu)) 10254 goto free_wbinvd_dirty_mask; 10255 10256 vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, 10257 GFP_KERNEL_ACCOUNT); 10258 if (!vcpu->arch.user_fpu) { 10259 pr_err("kvm: failed to allocate userspace's fpu\n"); 10260 goto free_emulate_ctxt; 10261 } 10262 10263 vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, 10264 GFP_KERNEL_ACCOUNT); 10265 if (!vcpu->arch.guest_fpu) { 10266 pr_err("kvm: failed to allocate vcpu's fpu\n"); 10267 goto free_user_fpu; 10268 } 10269 fx_init(vcpu); 10270 10271 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 10272 vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); 10273 10274 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 10275 10276 kvm_async_pf_hash_reset(vcpu); 10277 kvm_pmu_init(vcpu); 10278 10279 vcpu->arch.pending_external_vector = -1; 10280 vcpu->arch.preempted_in_kernel = false; 10281 10282 r = static_call(kvm_x86_vcpu_create)(vcpu); 10283 if (r) 10284 goto free_guest_fpu; 10285 10286 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); 10287 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; 10288 kvm_vcpu_mtrr_init(vcpu); 10289 vcpu_load(vcpu); 10290 kvm_vcpu_reset(vcpu, false); 10291 kvm_init_mmu(vcpu, false); 10292 vcpu_put(vcpu); 10293 return 0; 10294 10295 free_guest_fpu: 10296 kvm_free_guest_fpu(vcpu); 10297 free_user_fpu: 10298 kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); 10299 free_emulate_ctxt: 10300 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); 10301 free_wbinvd_dirty_mask: 10302 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 10303 fail_free_mce_banks: 10304 
kfree(vcpu->arch.mce_banks); 10305 fail_free_pio_data: 10306 free_page((unsigned long)vcpu->arch.pio_data); 10307 fail_free_lapic: 10308 kvm_free_lapic(vcpu); 10309 fail_mmu_destroy: 10310 kvm_mmu_destroy(vcpu); 10311 return r; 10312 } 10313 10314 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 10315 { 10316 struct kvm *kvm = vcpu->kvm; 10317 10318 if (mutex_lock_killable(&vcpu->mutex)) 10319 return; 10320 vcpu_load(vcpu); 10321 kvm_synchronize_tsc(vcpu, 0); 10322 vcpu_put(vcpu); 10323 10324 /* poll control enabled by default */ 10325 vcpu->arch.msr_kvm_poll_control = 1; 10326 10327 mutex_unlock(&vcpu->mutex); 10328 10329 if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) 10330 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 10331 KVMCLOCK_SYNC_PERIOD); 10332 } 10333 10334 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 10335 { 10336 struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; 10337 int idx; 10338 10339 kvm_release_pfn(cache->pfn, cache->dirty, cache); 10340 10341 kvmclock_reset(vcpu); 10342 10343 static_call(kvm_x86_vcpu_free)(vcpu); 10344 10345 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); 10346 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 10347 kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); 10348 kvm_free_guest_fpu(vcpu); 10349 10350 kvm_hv_vcpu_uninit(vcpu); 10351 kvm_pmu_destroy(vcpu); 10352 kfree(vcpu->arch.mce_banks); 10353 kvm_free_lapic(vcpu); 10354 idx = srcu_read_lock(&vcpu->kvm->srcu); 10355 kvm_mmu_destroy(vcpu); 10356 srcu_read_unlock(&vcpu->kvm->srcu, idx); 10357 free_page((unsigned long)vcpu->arch.pio_data); 10358 kvfree(vcpu->arch.cpuid_entries); 10359 if (!lapic_in_kernel(vcpu)) 10360 static_branch_dec(&kvm_has_noapic_vcpu); 10361 } 10362 10363 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 10364 { 10365 kvm_lapic_reset(vcpu, init_event); 10366 10367 vcpu->arch.hflags = 0; 10368 10369 vcpu->arch.smi_pending = 0; 10370 vcpu->arch.smi_count = 0; 10371 atomic_set(&vcpu->arch.nmi_queued, 0); 10372 vcpu->arch.nmi_pending = 0; 10373 vcpu->arch.nmi_injected = false; 10374 kvm_clear_interrupt_queue(vcpu); 10375 kvm_clear_exception_queue(vcpu); 10376 10377 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 10378 kvm_update_dr0123(vcpu); 10379 vcpu->arch.dr6 = DR6_ACTIVE_LOW; 10380 vcpu->arch.dr7 = DR7_FIXED_1; 10381 kvm_update_dr7(vcpu); 10382 10383 vcpu->arch.cr2 = 0; 10384 10385 kvm_make_request(KVM_REQ_EVENT, vcpu); 10386 vcpu->arch.apf.msr_en_val = 0; 10387 vcpu->arch.apf.msr_int_val = 0; 10388 vcpu->arch.st.msr_val = 0; 10389 10390 kvmclock_reset(vcpu); 10391 10392 kvm_clear_async_pf_completion_queue(vcpu); 10393 kvm_async_pf_hash_reset(vcpu); 10394 vcpu->arch.apf.halted = false; 10395 10396 if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { 10397 void *mpx_state_buffer; 10398 10399 /* 10400 * To avoid have the INIT path from kvm_apic_has_events() that be 10401 * called with loaded FPU and does not let userspace fix the state. 
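 * That is: on INIT, temporarily put the guest FPU so the MPX regions in the
 * xsave buffer can be cleared, then reload it afterwards.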
10402 */ 10403 if (init_event) 10404 kvm_put_guest_fpu(vcpu); 10405 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, 10406 XFEATURE_BNDREGS); 10407 if (mpx_state_buffer) 10408 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); 10409 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, 10410 XFEATURE_BNDCSR); 10411 if (mpx_state_buffer) 10412 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); 10413 if (init_event) 10414 kvm_load_guest_fpu(vcpu); 10415 } 10416 10417 if (!init_event) { 10418 kvm_pmu_reset(vcpu); 10419 vcpu->arch.smbase = 0x30000; 10420 10421 vcpu->arch.msr_misc_features_enables = 0; 10422 10423 vcpu->arch.xcr0 = XFEATURE_MASK_FP; 10424 } 10425 10426 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 10427 vcpu->arch.regs_avail = ~0; 10428 vcpu->arch.regs_dirty = ~0; 10429 10430 vcpu->arch.ia32_xss = 0; 10431 10432 static_call(kvm_x86_vcpu_reset)(vcpu, init_event); 10433 } 10434 10435 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 10436 { 10437 struct kvm_segment cs; 10438 10439 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 10440 cs.selector = vector << 8; 10441 cs.base = vector << 12; 10442 kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); 10443 kvm_rip_write(vcpu, 0); 10444 } 10445 EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); 10446 10447 int kvm_arch_hardware_enable(void) 10448 { 10449 struct kvm *kvm; 10450 struct kvm_vcpu *vcpu; 10451 int i; 10452 int ret; 10453 u64 local_tsc; 10454 u64 max_tsc = 0; 10455 bool stable, backwards_tsc = false; 10456 10457 kvm_user_return_msr_cpu_online(); 10458 ret = static_call(kvm_x86_hardware_enable)(); 10459 if (ret != 0) 10460 return ret; 10461 10462 local_tsc = rdtsc(); 10463 stable = !kvm_check_tsc_unstable(); 10464 list_for_each_entry(kvm, &vm_list, vm_list) { 10465 kvm_for_each_vcpu(i, vcpu, kvm) { 10466 if (!stable && vcpu->cpu == smp_processor_id()) 10467 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 10468 if (stable && vcpu->arch.last_host_tsc > local_tsc) { 10469 backwards_tsc = true; 10470 if (vcpu->arch.last_host_tsc > max_tsc) 10471 max_tsc = vcpu->arch.last_host_tsc; 10472 } 10473 } 10474 } 10475 10476 /* 10477 * Sometimes, even reliable TSCs go backwards. This happens on 10478 * platforms that reset TSC during suspend or hibernate actions, but 10479 * maintain synchronization. We must compensate. Fortunately, we can 10480 * detect that condition here, which happens early in CPU bringup, 10481 * before any KVM threads can be running. Unfortunately, we can't 10482 * bring the TSCs fully up to date with real time, as we aren't yet far 10483 * enough into CPU bringup that we know how much real time has actually 10484 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot 10485 * variables that haven't been updated yet. 10486 * 10487 * So we simply find the maximum observed TSC above, then record the 10488 * adjustment to TSC in each VCPU. When the VCPU later gets loaded, 10489 * the adjustment will be applied. Note that we accumulate 10490 * adjustments, in case multiple suspend cycles happen before some VCPU 10491 * gets a chance to run again. In the event that no KVM threads get a 10492 * chance to run, we will miss the entire elapsed period, as we'll have 10493 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may 10494 * loose cycle time. This isn't too big a deal, since the loss will be 10495 * uniform across all VCPUs (not to mention the scenario is extremely 10496 * unlikely). 
It is possible that a second hibernate recovery happens 10497 * much faster than a first, causing the observed TSC here to be 10498 * smaller; this would require additional padding adjustment, which is 10499 * why we set last_host_tsc to the local tsc observed here. 10500 * 10501 * N.B. - this code below runs only on platforms with reliable TSC, 10502 * as that is the only way backwards_tsc is set above. Also note 10503 * that this runs for ALL vcpus, which is not a bug; all VCPUs should 10504 * have the same delta_cyc adjustment applied if backwards_tsc 10505 * is detected. Note further, this adjustment is only done once, 10506 * as we reset last_host_tsc on all VCPUs to stop this from being 10507 * called multiple times (one for each physical CPU bringup). 10508 * 10509 * Platforms with unreliable TSCs don't have to deal with this, they 10510 * will be compensated by the logic in vcpu_load, which sets the TSC to 10511 * catchup mode. This will catchup all VCPUs to real time, but cannot 10512 * guarantee that they stay in perfect synchronization. 10513 */ 10514 if (backwards_tsc) { 10515 u64 delta_cyc = max_tsc - local_tsc; 10516 list_for_each_entry(kvm, &vm_list, vm_list) { 10517 kvm->arch.backwards_tsc_observed = true; 10518 kvm_for_each_vcpu(i, vcpu, kvm) { 10519 vcpu->arch.tsc_offset_adjustment += delta_cyc; 10520 vcpu->arch.last_host_tsc = local_tsc; 10521 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 10522 } 10523 10524 /* 10525 * We have to disable TSC offset matching.. if you were 10526 * booting a VM while issuing an S4 host suspend.... 10527 * you may have some problem. Solving this issue is 10528 * left as an exercise to the reader. 10529 */ 10530 kvm->arch.last_tsc_nsec = 0; 10531 kvm->arch.last_tsc_write = 0; 10532 } 10533 10534 } 10535 return 0; 10536 } 10537 10538 void kvm_arch_hardware_disable(void) 10539 { 10540 static_call(kvm_x86_hardware_disable)(); 10541 drop_user_return_notifiers(); 10542 } 10543 10544 int kvm_arch_hardware_setup(void *opaque) 10545 { 10546 struct kvm_x86_init_ops *ops = opaque; 10547 int r; 10548 10549 rdmsrl_safe(MSR_EFER, &host_efer); 10550 10551 if (boot_cpu_has(X86_FEATURE_XSAVES)) 10552 rdmsrl(MSR_IA32_XSS, host_xss); 10553 10554 r = ops->hardware_setup(); 10555 if (r != 0) 10556 return r; 10557 10558 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); 10559 kvm_ops_static_call_update(); 10560 10561 if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) 10562 supported_xss = 0; 10563 10564 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) 10565 cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); 10566 #undef __kvm_cpu_cap_has 10567 10568 if (kvm_has_tsc_control) { 10569 /* 10570 * Make sure the user can only configure tsc_khz values that 10571 * fit into a signed integer. 10572 * A min value is not calculated because it will always 10573 * be 1 on all machines. 
10574 */ 10575 u64 max = min(0x7fffffffULL, 10576 __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); 10577 kvm_max_guest_tsc_khz = max; 10578 10579 kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; 10580 } 10581 10582 kvm_init_msr_list(); 10583 return 0; 10584 } 10585 10586 void kvm_arch_hardware_unsetup(void) 10587 { 10588 static_call(kvm_x86_hardware_unsetup)(); 10589 } 10590 10591 int kvm_arch_check_processor_compat(void *opaque) 10592 { 10593 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); 10594 struct kvm_x86_init_ops *ops = opaque; 10595 10596 WARN_ON(!irqs_disabled()); 10597 10598 if (__cr4_reserved_bits(cpu_has, c) != 10599 __cr4_reserved_bits(cpu_has, &boot_cpu_data)) 10600 return -EIO; 10601 10602 return ops->check_processor_compatibility(); 10603 } 10604 10605 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) 10606 { 10607 return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; 10608 } 10609 EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); 10610 10611 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) 10612 { 10613 return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; 10614 } 10615 10616 __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); 10617 EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); 10618 10619 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) 10620 { 10621 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 10622 10623 vcpu->arch.l1tf_flush_l1d = true; 10624 if (pmu->version && unlikely(pmu->event_count)) { 10625 pmu->need_cleanup = true; 10626 kvm_make_request(KVM_REQ_PMU, vcpu); 10627 } 10628 static_call(kvm_x86_sched_in)(vcpu, cpu); 10629 } 10630 10631 void kvm_arch_free_vm(struct kvm *kvm) 10632 { 10633 kfree(to_kvm_hv(kvm)->hv_pa_pg); 10634 vfree(kvm); 10635 } 10636 10637 10638 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 10639 { 10640 if (type) 10641 return -EINVAL; 10642 10643 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); 10644 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 10645 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); 10646 INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); 10647 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 10648 atomic_set(&kvm->arch.noncoherent_dma_count, 0); 10649 10650 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 10651 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 10652 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ 10653 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 10654 &kvm->arch.irq_sources_bitmap); 10655 10656 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 10657 mutex_init(&kvm->arch.apic_map_lock); 10658 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); 10659 10660 kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); 10661 pvclock_update_vm_gtod_copy(kvm); 10662 10663 kvm->arch.guest_can_read_msr_platform_info = true; 10664 10665 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); 10666 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); 10667 10668 kvm_hv_init_vm(kvm); 10669 kvm_page_track_init(kvm); 10670 kvm_mmu_init_vm(kvm); 10671 10672 return static_call(kvm_x86_vm_init)(kvm); 10673 } 10674 10675 int kvm_arch_post_init_vm(struct kvm *kvm) 10676 { 10677 return kvm_mmu_post_init_vm(kvm); 10678 } 10679 10680 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 10681 { 10682 vcpu_load(vcpu); 10683 kvm_mmu_unload(vcpu); 10684 vcpu_put(vcpu); 10685 } 10686 10687 static void kvm_free_vcpus(struct kvm *kvm) 10688 { 10689 unsigned int i; 10690 struct kvm_vcpu *vcpu; 10691 10692 /* 10693 * Unpin any mmu 
pages first. 10694 */ 10695 kvm_for_each_vcpu(i, vcpu, kvm) { 10696 kvm_clear_async_pf_completion_queue(vcpu); 10697 kvm_unload_vcpu_mmu(vcpu); 10698 } 10699 kvm_for_each_vcpu(i, vcpu, kvm) 10700 kvm_vcpu_destroy(vcpu); 10701 10702 mutex_lock(&kvm->lock); 10703 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) 10704 kvm->vcpus[i] = NULL; 10705 10706 atomic_set(&kvm->online_vcpus, 0); 10707 mutex_unlock(&kvm->lock); 10708 } 10709 10710 void kvm_arch_sync_events(struct kvm *kvm) 10711 { 10712 cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); 10713 cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); 10714 kvm_free_pit(kvm); 10715 } 10716 10717 #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) 10718 10719 /** 10720 * __x86_set_memory_region: Setup KVM internal memory slot 10721 * 10722 * @kvm: the kvm pointer to the VM. 10723 * @id: the slot ID to setup. 10724 * @gpa: the GPA to install the slot (unused when @size == 0). 10725 * @size: the size of the slot. Set to zero to uninstall a slot. 10726 * 10727 * This function helps to setup a KVM internal memory slot. Specify 10728 * @size > 0 to install a new slot, while @size == 0 to uninstall a 10729 * slot. The return code can be one of the following: 10730 * 10731 * HVA: on success (uninstall will return a bogus HVA) 10732 * -errno: on error 10733 * 10734 * The caller should always use IS_ERR() to check the return value 10735 * before use. Note, the KVM internal memory slots are guaranteed to 10736 * remain valid and unchanged until the VM is destroyed, i.e., the 10737 * GPA->HVA translation will not change. However, the HVA is a user 10738 * address, i.e. its accessibility is not guaranteed, and must be 10739 * accessed via __copy_{to,from}_user(). 10740 */ 10741 void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, 10742 u32 size) 10743 { 10744 int i, r; 10745 unsigned long hva, old_npages; 10746 struct kvm_memslots *slots = kvm_memslots(kvm); 10747 struct kvm_memory_slot *slot; 10748 10749 /* Called with kvm->slots_lock held. */ 10750 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) 10751 return ERR_PTR_USR(-EINVAL); 10752 10753 slot = id_to_memslot(slots, id); 10754 if (size) { 10755 if (slot && slot->npages) 10756 return ERR_PTR_USR(-EEXIST); 10757 10758 /* 10759 * MAP_SHARED to prevent internal slot pages from being moved 10760 * by fork()/COW. 
10761 */ 10762 hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, 10763 MAP_SHARED | MAP_ANONYMOUS, 0); 10764 if (IS_ERR((void *)hva)) 10765 return (void __user *)hva; 10766 } else { 10767 if (!slot || !slot->npages) 10768 return NULL; 10769 10770 old_npages = slot->npages; 10771 hva = slot->userspace_addr; 10772 } 10773 10774 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 10775 struct kvm_userspace_memory_region m; 10776 10777 m.slot = id | (i << 16); 10778 m.flags = 0; 10779 m.guest_phys_addr = gpa; 10780 m.userspace_addr = hva; 10781 m.memory_size = size; 10782 r = __kvm_set_memory_region(kvm, &m); 10783 if (r < 0) 10784 return ERR_PTR_USR(r); 10785 } 10786 10787 if (!size) 10788 vm_munmap(hva, old_npages * PAGE_SIZE); 10789 10790 return (void __user *)hva; 10791 } 10792 EXPORT_SYMBOL_GPL(__x86_set_memory_region); 10793 10794 void kvm_arch_pre_destroy_vm(struct kvm *kvm) 10795 { 10796 kvm_mmu_pre_destroy_vm(kvm); 10797 } 10798 10799 void kvm_arch_destroy_vm(struct kvm *kvm) 10800 { 10801 if (current->mm == kvm->mm) { 10802 /* 10803 * Free memory regions allocated on behalf of userspace, 10804 * unless the the memory map has changed due to process exit 10805 * or fd copying. 10806 */ 10807 mutex_lock(&kvm->slots_lock); 10808 __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 10809 0, 0); 10810 __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 10811 0, 0); 10812 __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); 10813 mutex_unlock(&kvm->slots_lock); 10814 } 10815 static_call_cond(kvm_x86_vm_destroy)(kvm); 10816 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); 10817 kvm_pic_destroy(kvm); 10818 kvm_ioapic_destroy(kvm); 10819 kvm_free_vcpus(kvm); 10820 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); 10821 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); 10822 kvm_mmu_uninit_vm(kvm); 10823 kvm_page_track_cleanup(kvm); 10824 kvm_xen_destroy_vm(kvm); 10825 kvm_hv_destroy_vm(kvm); 10826 } 10827 10828 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 10829 { 10830 int i; 10831 10832 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 10833 kvfree(slot->arch.rmap[i]); 10834 slot->arch.rmap[i] = NULL; 10835 10836 if (i == 0) 10837 continue; 10838 10839 kvfree(slot->arch.lpage_info[i - 1]); 10840 slot->arch.lpage_info[i - 1] = NULL; 10841 } 10842 10843 kvm_page_track_free_memslot(slot); 10844 } 10845 10846 static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, 10847 unsigned long npages) 10848 { 10849 int i; 10850 10851 /* 10852 * Clear out the previous array pointers for the KVM_MR_MOVE case. The 10853 * old arrays will be freed by __kvm_set_memory_region() if installing 10854 * the new memslot is successful. 
10855 */ 10856 memset(&slot->arch, 0, sizeof(slot->arch)); 10857 10858 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 10859 struct kvm_lpage_info *linfo; 10860 unsigned long ugfn; 10861 int lpages; 10862 int level = i + 1; 10863 10864 lpages = gfn_to_index(slot->base_gfn + npages - 1, 10865 slot->base_gfn, level) + 1; 10866 10867 slot->arch.rmap[i] = 10868 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), 10869 GFP_KERNEL_ACCOUNT); 10870 if (!slot->arch.rmap[i]) 10871 goto out_free; 10872 if (i == 0) 10873 continue; 10874 10875 linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); 10876 if (!linfo) 10877 goto out_free; 10878 10879 slot->arch.lpage_info[i - 1] = linfo; 10880 10881 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 10882 linfo[0].disallow_lpage = 1; 10883 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 10884 linfo[lpages - 1].disallow_lpage = 1; 10885 ugfn = slot->userspace_addr >> PAGE_SHIFT; 10886 /* 10887 * If the gfn and userspace address are not aligned wrt each 10888 * other, disable large page support for this slot. 10889 */ 10890 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { 10891 unsigned long j; 10892 10893 for (j = 0; j < lpages; ++j) 10894 linfo[j].disallow_lpage = 1; 10895 } 10896 } 10897 10898 if (kvm_page_track_create_memslot(slot, npages)) 10899 goto out_free; 10900 10901 return 0; 10902 10903 out_free: 10904 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { 10905 kvfree(slot->arch.rmap[i]); 10906 slot->arch.rmap[i] = NULL; 10907 if (i == 0) 10908 continue; 10909 10910 kvfree(slot->arch.lpage_info[i - 1]); 10911 slot->arch.lpage_info[i - 1] = NULL; 10912 } 10913 return -ENOMEM; 10914 } 10915 10916 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 10917 { 10918 struct kvm_vcpu *vcpu; 10919 int i; 10920 10921 /* 10922 * memslots->generation has been incremented. 10923 * mmio generation may have reached its maximum value. 10924 */ 10925 kvm_mmu_invalidate_mmio_sptes(kvm, gen); 10926 10927 /* Force re-initialization of steal_time cache */ 10928 kvm_for_each_vcpu(i, vcpu, kvm) 10929 kvm_vcpu_kick(vcpu); 10930 } 10931 10932 int kvm_arch_prepare_memory_region(struct kvm *kvm, 10933 struct kvm_memory_slot *memslot, 10934 const struct kvm_userspace_memory_region *mem, 10935 enum kvm_mr_change change) 10936 { 10937 if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) 10938 return kvm_alloc_memslot_metadata(memslot, 10939 mem->memory_size >> PAGE_SHIFT); 10940 return 0; 10941 } 10942 10943 10944 static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) 10945 { 10946 struct kvm_arch *ka = &kvm->arch; 10947 10948 if (!kvm_x86_ops.cpu_dirty_log_size) 10949 return; 10950 10951 if ((enable && ++ka->cpu_dirty_logging_count == 1) || 10952 (!enable && --ka->cpu_dirty_logging_count == 0)) 10953 kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); 10954 10955 WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); 10956 } 10957 10958 static void kvm_mmu_slot_apply_flags(struct kvm *kvm, 10959 struct kvm_memory_slot *old, 10960 struct kvm_memory_slot *new, 10961 enum kvm_mr_change change) 10962 { 10963 bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; 10964 10965 /* 10966 * Update CPU dirty logging if dirty logging is being toggled. This 10967 * applies to all operations. 
10968 */ 10969 if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) 10970 kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); 10971 10972 /* 10973 * Nothing more to do for RO slots (which can't be dirtied and can't be 10974 * made writable) or CREATE/MOVE/DELETE of a slot. 10975 * 10976 * For a memslot with dirty logging disabled: 10977 * CREATE: No dirty mappings will already exist. 10978 * MOVE/DELETE: The old mappings will already have been cleaned up by 10979 * kvm_arch_flush_shadow_memslot() 10980 * 10981 * For a memslot with dirty logging enabled: 10982 * CREATE: No shadow pages exist, thus nothing to write-protect 10983 * and no dirty bits to clear. 10984 * MOVE/DELETE: The old mappings will already have been cleaned up by 10985 * kvm_arch_flush_shadow_memslot(). 10986 */ 10987 if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) 10988 return; 10989 10990 /* 10991 * READONLY and non-flags changes were filtered out above, and the only 10992 * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty 10993 * logging isn't being toggled on or off. 10994 */ 10995 if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) 10996 return; 10997 10998 if (!log_dirty_pages) { 10999 /* 11000 * Dirty logging tracks sptes in 4k granularity, meaning that 11001 * large sptes have to be split. If live migration succeeds, 11002 * the guest in the source machine will be destroyed and large 11003 * sptes will be created in the destination. However, if the 11004 * guest continues to run in the source machine (for example if 11005 * live migration fails), small sptes will remain around and 11006 * cause bad performance. 11007 * 11008 * Scan sptes if dirty logging has been stopped, dropping those 11009 * which can be collapsed into a single large-page spte. Later 11010 * page faults will create the large-page sptes. 11011 */ 11012 kvm_mmu_zap_collapsible_sptes(kvm, new); 11013 } else { 11014 /* By default, write-protect everything to log writes. */ 11015 int level = PG_LEVEL_4K; 11016 11017 if (kvm_x86_ops.cpu_dirty_log_size) { 11018 /* 11019 * Clear all dirty bits, unless pages are treated as 11020 * dirty from the get-go. 11021 */ 11022 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) 11023 kvm_mmu_slot_leaf_clear_dirty(kvm, new); 11024 11025 /* 11026 * Write-protect large pages on write so that dirty 11027 * logging happens at 4k granularity. No need to 11028 * write-protect small SPTEs since write accesses are 11029 * logged by the CPU via dirty bits. 11030 */ 11031 level = PG_LEVEL_2M; 11032 } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { 11033 /* 11034 * If we're with initial-all-set, we don't need 11035 * to write protect any small page because 11036 * they're reported as dirty already. However 11037 * we still need to write-protect huge pages 11038 * so that the page split can happen lazily on 11039 * the first write to the huge page. 11040 */ 11041 level = PG_LEVEL_2M; 11042 } 11043 kvm_mmu_slot_remove_write_access(kvm, new, level); 11044 } 11045 } 11046 11047 void kvm_arch_commit_memory_region(struct kvm *kvm, 11048 const struct kvm_userspace_memory_region *mem, 11049 struct kvm_memory_slot *old, 11050 const struct kvm_memory_slot *new, 11051 enum kvm_mr_change change) 11052 { 11053 if (!kvm->arch.n_requested_mmu_pages) 11054 kvm_mmu_change_mmu_pages(kvm, 11055 kvm_mmu_calculate_default_mmu_pages(kvm)); 11056 11057 /* 11058 * FIXME: const-ify all uses of struct kvm_memory_slot. 
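 * Until that happens, the cast below drops const from @new for
 * kvm_mmu_slot_apply_flags().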
11059 */ 11060 kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); 11061 11062 /* Free the arrays associated with the old memslot. */ 11063 if (change == KVM_MR_MOVE) 11064 kvm_arch_free_memslot(kvm, old); 11065 } 11066 11067 void kvm_arch_flush_shadow_all(struct kvm *kvm) 11068 { 11069 kvm_mmu_zap_all(kvm); 11070 } 11071 11072 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 11073 struct kvm_memory_slot *slot) 11074 { 11075 kvm_page_track_flush_slot(kvm, slot); 11076 } 11077 11078 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 11079 { 11080 return (is_guest_mode(vcpu) && 11081 kvm_x86_ops.guest_apic_has_interrupt && 11082 static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); 11083 } 11084 11085 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 11086 { 11087 if (!list_empty_careful(&vcpu->async_pf.done)) 11088 return true; 11089 11090 if (kvm_apic_has_events(vcpu)) 11091 return true; 11092 11093 if (vcpu->arch.pv.pv_unhalted) 11094 return true; 11095 11096 if (vcpu->arch.exception.pending) 11097 return true; 11098 11099 if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11100 (vcpu->arch.nmi_pending && 11101 static_call(kvm_x86_nmi_allowed)(vcpu, false))) 11102 return true; 11103 11104 if (kvm_test_request(KVM_REQ_SMI, vcpu) || 11105 (vcpu->arch.smi_pending && 11106 static_call(kvm_x86_smi_allowed)(vcpu, false))) 11107 return true; 11108 11109 if (kvm_arch_interrupt_allowed(vcpu) && 11110 (kvm_cpu_has_interrupt(vcpu) || 11111 kvm_guest_apic_has_interrupt(vcpu))) 11112 return true; 11113 11114 if (kvm_hv_has_stimer_pending(vcpu)) 11115 return true; 11116 11117 if (is_guest_mode(vcpu) && 11118 kvm_x86_ops.nested_ops->hv_timer_pending && 11119 kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) 11120 return true; 11121 11122 return false; 11123 } 11124 11125 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 11126 { 11127 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 11128 } 11129 11130 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 11131 { 11132 if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) 11133 return true; 11134 11135 return false; 11136 } 11137 11138 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 11139 { 11140 if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 11141 return true; 11142 11143 if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11144 kvm_test_request(KVM_REQ_SMI, vcpu) || 11145 kvm_test_request(KVM_REQ_EVENT, vcpu)) 11146 return true; 11147 11148 return kvm_arch_dy_has_pending_interrupt(vcpu); 11149 } 11150 11151 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) 11152 { 11153 if (vcpu->arch.guest_state_protected) 11154 return true; 11155 11156 return vcpu->arch.preempted_in_kernel; 11157 } 11158 11159 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 11160 { 11161 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; 11162 } 11163 11164 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 11165 { 11166 return static_call(kvm_x86_interrupt_allowed)(vcpu, false); 11167 } 11168 11169 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) 11170 { 11171 /* Can't read the RIP when guest state is protected, just return 0 */ 11172 if (vcpu->arch.guest_state_protected) 11173 return 0; 11174 11175 if (is_64_bit_mode(vcpu)) 11176 return kvm_rip_read(vcpu); 11177 return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + 11178 kvm_rip_read(vcpu)); 11179 } 11180 EXPORT_SYMBOL_GPL(kvm_get_linear_rip); 11181 11182 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long 
linear_rip) 11183 { 11184 return kvm_get_linear_rip(vcpu) == linear_rip; 11185 } 11186 EXPORT_SYMBOL_GPL(kvm_is_linear_rip); 11187 11188 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 11189 { 11190 unsigned long rflags; 11191 11192 rflags = static_call(kvm_x86_get_rflags)(vcpu); 11193 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 11194 rflags &= ~X86_EFLAGS_TF; 11195 return rflags; 11196 } 11197 EXPORT_SYMBOL_GPL(kvm_get_rflags); 11198 11199 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 11200 { 11201 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 11202 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 11203 rflags |= X86_EFLAGS_TF; 11204 static_call(kvm_x86_set_rflags)(vcpu, rflags); 11205 } 11206 11207 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 11208 { 11209 __kvm_set_rflags(vcpu, rflags); 11210 kvm_make_request(KVM_REQ_EVENT, vcpu); 11211 } 11212 EXPORT_SYMBOL_GPL(kvm_set_rflags); 11213 11214 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) 11215 { 11216 int r; 11217 11218 if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) || 11219 work->wakeup_all) 11220 return; 11221 11222 r = kvm_mmu_reload(vcpu); 11223 if (unlikely(r)) 11224 return; 11225 11226 if (!vcpu->arch.mmu->direct_map && 11227 work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) 11228 return; 11229 11230 kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); 11231 } 11232 11233 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) 11234 { 11235 BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); 11236 11237 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); 11238 } 11239 11240 static inline u32 kvm_async_pf_next_probe(u32 key) 11241 { 11242 return (key + 1) & (ASYNC_PF_PER_VCPU - 1); 11243 } 11244 11245 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 11246 { 11247 u32 key = kvm_async_pf_hash_fn(gfn); 11248 11249 while (vcpu->arch.apf.gfns[key] != ~0) 11250 key = kvm_async_pf_next_probe(key); 11251 11252 vcpu->arch.apf.gfns[key] = gfn; 11253 } 11254 11255 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) 11256 { 11257 int i; 11258 u32 key = kvm_async_pf_hash_fn(gfn); 11259 11260 for (i = 0; i < ASYNC_PF_PER_VCPU && 11261 (vcpu->arch.apf.gfns[key] != gfn && 11262 vcpu->arch.apf.gfns[key] != ~0); i++) 11263 key = kvm_async_pf_next_probe(key); 11264 11265 return key; 11266 } 11267 11268 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 11269 { 11270 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; 11271 } 11272 11273 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 11274 { 11275 u32 i, j, k; 11276 11277 i = j = kvm_async_pf_gfn_slot(vcpu, gfn); 11278 11279 if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn)) 11280 return; 11281 11282 while (true) { 11283 vcpu->arch.apf.gfns[i] = ~0; 11284 do { 11285 j = kvm_async_pf_next_probe(j); 11286 if (vcpu->arch.apf.gfns[j] == ~0) 11287 return; 11288 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); 11289 /* 11290 * k lies cyclically in ]i,j] 11291 * | i.k.j | 11292 * |....j i.k.| or |.k..j i...| 11293 */ 11294 } while ((i <= j) ? 
(i < k && k <= j) : (i < k || k <= j)); 11295 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; 11296 i = j; 11297 } 11298 } 11299 11300 static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) 11301 { 11302 u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT; 11303 11304 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason, 11305 sizeof(reason)); 11306 } 11307 11308 static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) 11309 { 11310 unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); 11311 11312 return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, 11313 &token, offset, sizeof(token)); 11314 } 11315 11316 static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) 11317 { 11318 unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); 11319 u32 val; 11320 11321 if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, 11322 &val, offset, sizeof(val))) 11323 return false; 11324 11325 return !val; 11326 } 11327 11328 static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) 11329 { 11330 if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) 11331 return false; 11332 11333 if (!kvm_pv_async_pf_enabled(vcpu) || 11334 (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0)) 11335 return false; 11336 11337 return true; 11338 } 11339 11340 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) 11341 { 11342 if (unlikely(!lapic_in_kernel(vcpu) || 11343 kvm_event_needs_reinjection(vcpu) || 11344 vcpu->arch.exception.pending)) 11345 return false; 11346 11347 if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) 11348 return false; 11349 11350 /* 11351 * If interrupts are off we cannot even use an artificial 11352 * halt state. 11353 */ 11354 return kvm_arch_interrupt_allowed(vcpu); 11355 } 11356 11357 bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, 11358 struct kvm_async_pf *work) 11359 { 11360 struct x86_exception fault; 11361 11362 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); 11363 kvm_add_async_pf_gfn(vcpu, work->arch.gfn); 11364 11365 if (kvm_can_deliver_async_pf(vcpu) && 11366 !apf_put_user_notpresent(vcpu)) { 11367 fault.vector = PF_VECTOR; 11368 fault.error_code_valid = true; 11369 fault.error_code = 0; 11370 fault.nested_page_fault = false; 11371 fault.address = work->arch.token; 11372 fault.async_page_fault = true; 11373 kvm_inject_page_fault(vcpu, &fault); 11374 return true; 11375 } else { 11376 /* 11377 * It is not possible to deliver a paravirtualized asynchronous 11378 * page fault, but putting the guest in an artificial halt state 11379 * can be beneficial nevertheless: if an interrupt arrives, we 11380 * can deliver it timely and perhaps the guest will schedule 11381 * another process. When the instruction that triggered a page 11382 * fault is retried, hopefully the page will be ready in the host. 
11383 */ 11384 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 11385 return false; 11386 } 11387 } 11388 11389 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, 11390 struct kvm_async_pf *work) 11391 { 11392 struct kvm_lapic_irq irq = { 11393 .delivery_mode = APIC_DM_FIXED, 11394 .vector = vcpu->arch.apf.vec 11395 }; 11396 11397 if (work->wakeup_all) 11398 work->arch.token = ~0; /* broadcast wakeup */ 11399 else 11400 kvm_del_async_pf_gfn(vcpu, work->arch.gfn); 11401 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); 11402 11403 if ((work->wakeup_all || work->notpresent_injected) && 11404 kvm_pv_async_pf_enabled(vcpu) && 11405 !apf_put_user_ready(vcpu, work->arch.token)) { 11406 vcpu->arch.apf.pageready_pending = true; 11407 kvm_apic_set_irq(vcpu, &irq, NULL); 11408 } 11409 11410 vcpu->arch.apf.halted = false; 11411 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 11412 } 11413 11414 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) 11415 { 11416 kvm_make_request(KVM_REQ_APF_READY, vcpu); 11417 if (!vcpu->arch.apf.pageready_pending) 11418 kvm_vcpu_kick(vcpu); 11419 } 11420 11421 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) 11422 { 11423 if (!kvm_pv_async_pf_enabled(vcpu)) 11424 return true; 11425 else 11426 return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); 11427 } 11428 11429 void kvm_arch_start_assignment(struct kvm *kvm) 11430 { 11431 atomic_inc(&kvm->arch.assigned_device_count); 11432 } 11433 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); 11434 11435 void kvm_arch_end_assignment(struct kvm *kvm) 11436 { 11437 atomic_dec(&kvm->arch.assigned_device_count); 11438 } 11439 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); 11440 11441 bool kvm_arch_has_assigned_device(struct kvm *kvm) 11442 { 11443 return atomic_read(&kvm->arch.assigned_device_count); 11444 } 11445 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); 11446 11447 void kvm_arch_register_noncoherent_dma(struct kvm *kvm) 11448 { 11449 atomic_inc(&kvm->arch.noncoherent_dma_count); 11450 } 11451 EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); 11452 11453 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) 11454 { 11455 atomic_dec(&kvm->arch.noncoherent_dma_count); 11456 } 11457 EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); 11458 11459 bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) 11460 { 11461 return atomic_read(&kvm->arch.noncoherent_dma_count); 11462 } 11463 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); 11464 11465 bool kvm_arch_has_irq_bypass(void) 11466 { 11467 return true; 11468 } 11469 11470 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, 11471 struct irq_bypass_producer *prod) 11472 { 11473 struct kvm_kernel_irqfd *irqfd = 11474 container_of(cons, struct kvm_kernel_irqfd, consumer); 11475 int ret; 11476 11477 irqfd->producer = prod; 11478 kvm_arch_start_assignment(irqfd->kvm); 11479 ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, 11480 prod->irq, irqfd->gsi, 1); 11481 11482 if (ret) 11483 kvm_arch_end_assignment(irqfd->kvm); 11484 11485 return ret; 11486 } 11487 11488 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, 11489 struct irq_bypass_producer *prod) 11490 { 11491 int ret; 11492 struct kvm_kernel_irqfd *irqfd = 11493 container_of(cons, struct kvm_kernel_irqfd, consumer); 11494 11495 WARN_ON(irqfd->producer != prod); 11496 irqfd->producer = NULL; 11497 11498 /* 11499 * When producer of consumer is unregistered, we change back to 11500 * remapped mode, so we can re-use the current implementation 

bool kvm_arch_has_irq_bypass(void)
{
	return true;
}

int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				     struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);
	int ret;

	irqfd->producer = prod;
	kvm_arch_start_assignment(irqfd->kvm);
	ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
						  prod->irq, irqfd->gsi, 1);

	if (ret)
		kvm_arch_end_assignment(irqfd->kvm);

	return ret;
}

void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	int ret;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	WARN_ON(irqfd->producer != prod);
	irqfd->producer = NULL;

	/*
	 * When the producer of a consumer is unregistered, we change back to
	 * remapped mode, so we can re-use the current implementation when
	 * the irq is masked/disabled or the consumer side (KVM in this case)
	 * doesn't want to receive the interrupts.
	 */
	ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
	if (ret)
		printk(KERN_INFO "irq bypass consumer (token %p) unregistration fails: %d\n",
		       irqfd->consumer.token, ret);

	kvm_arch_end_assignment(irqfd->kvm);
}

int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				  uint32_t guest_irq, bool set)
{
	return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}

bool kvm_vector_hashing_enabled(void)
{
	return vector_hashing;
}

bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);

int kvm_spec_ctrl_test_value(u64 value)
{
	/*
	 * Test whether setting IA32_SPEC_CTRL to the given value
	 * is allowed by the host processor.
	 */

	u64 saved_value;
	unsigned long flags;
	int ret = 0;

	local_irq_save(flags);

	if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
		ret = 1;
	else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
		ret = 1;
	else
		wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);

	local_irq_restore(flags);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
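
/*
 * Illustrative sketch (an assumption about the call site, which lives in
 * vendor code rather than in this file): an MSR-write handler can use the
 * helper above to refuse IA32_SPEC_CTRL values that the host CPU itself
 * rejects, roughly:
 *
 *	case MSR_IA32_SPEC_CTRL:
 *		if (kvm_spec_ctrl_test_value(data))
 *			return 1;
 *		...
 *
 * A zero return means the host accepted the trial write (the previous value
 * is restored before returning); a non-zero return means writing the value
 * would fault, so the guest write should be refused as well.
 */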

void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
	struct x86_exception fault;
	u32 access = error_code &
		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);

	if (!(error_code & PFERR_PRESENT_MASK) ||
	    vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
		/*
		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
		 * tables probably do not match the TLB.  Just proceed
		 * with the error code that the processor gave.
		 */
		fault.vector = PF_VECTOR;
		fault.error_code_valid = true;
		fault.error_code = error_code;
		fault.nested_page_fault = false;
		fault.address = gva;
	}
	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
}
EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);

/*
 * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
 * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. The return
 * value indicates whether an exit to userspace is needed.
 */
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
			      struct x86_exception *e)
{
	if (r == X86EMUL_PROPAGATE_FAULT) {
		kvm_inject_emulated_page_fault(vcpu, e);
		return 1;
	}

	/*
	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
	 * while handling a VMX instruction, KVM could have handled the request
	 * correctly by exiting to userspace and performing I/O, but there
	 * doesn't seem to be a real use-case behind such requests, so just
	 * return KVM_EXIT_INTERNAL_ERROR for now.
	 */
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
	vcpu->run->internal.ndata = 0;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);

int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
	bool pcid_enabled;
	struct x86_exception e;
	unsigned i;
	unsigned long roots_to_free = 0;
	struct {
		u64 pcid;
		u64 gla;
	} operand;
	int r;

	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	if (operand.pcid >> 12 != 0) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);

	switch (type) {
	case INVPCID_TYPE_INDIV_ADDR:
		if ((!pcid_enabled && (operand.pcid != 0)) ||
		    is_noncanonical_address(operand.gla, vcpu)) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
		return kvm_skip_emulated_instruction(vcpu);

	case INVPCID_TYPE_SINGLE_CTXT:
		if (!pcid_enabled && (operand.pcid != 0)) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}

		if (kvm_get_active_pcid(vcpu) == operand.pcid) {
			kvm_mmu_sync_roots(vcpu);
			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
		}

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
			    == operand.pcid)
				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);

		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
		/*
		 * If neither the current cr3 nor any of the prev_roots use the
		 * given PCID, then nothing needs to be done here because a
		 * resync will happen anyway before switching to any other CR3.
		 */

		return kvm_skip_emulated_instruction(vcpu);

	case INVPCID_TYPE_ALL_NON_GLOBAL:
		/*
		 * Currently, KVM doesn't mark global entries in the shadow
		 * page tables, so a non-global flush just degenerates to a
		 * global flush. If needed, we could optimize this later by
		 * keeping track of global entries in shadow page tables.
		 */

		fallthrough;
	case INVPCID_TYPE_ALL_INCL_GLOBAL:
		kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
		return kvm_skip_emulated_instruction(vcpu);

	default:
		BUG(); /* We have already checked above that type <= 3 */
	}
}
EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
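
/*
 * Illustrative note on the operand read above (restating the architectural
 * 16-byte INVPCID descriptor layout, not anything specific to this file):
 *
 *	bytes  0..7  - PCID in bits 11:0, bits 63:12 reserved (must be zero)
 *	bytes  8..15 - linear address, used only by the individual-address type
 *
 * which is why "operand.pcid >> 12 != 0" injects #GP before the type switch,
 * and why only INVPCID_TYPE_INDIV_ADDR looks at operand.gla.
 */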

static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	struct kvm_mmio_fragment *frag;
	unsigned int len;

	BUG_ON(!vcpu->mmio_needed);

	/* Complete previous fragment */
	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
	len = min(8u, frag->len);
	if (!vcpu->mmio_is_write)
		memcpy(frag->data, run->mmio.data, len);

	if (frag->len <= 8) {
		/* Switch to the next fragment. */
		frag++;
		vcpu->mmio_cur_fragment++;
	} else {
		/* Go forward to the next mmio piece. */
		frag->data += len;
		frag->gpa += len;
		frag->len -= len;
	}

	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
		vcpu->mmio_needed = 0;

		/*
		 * VMG change: at this point we're always done,
		 * RIP has already been advanced.
		 */
		return 1;
	}

	/* More MMIO is needed. */
	run->mmio.phys_addr = frag->gpa;
	run->mmio.len = min(8u, frag->len);
	run->mmio.is_write = vcpu->mmio_is_write;
	if (run->mmio.is_write)
		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
	run->exit_reason = KVM_EXIT_MMIO;

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;

	return 0;
}

int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
			  void *data)
{
	int handled;
	struct kvm_mmio_fragment *frag;

	if (!data)
		return -EINVAL;

	handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
	if (handled == bytes)
		return 1;

	bytes -= handled;
	gpa += handled;
	data += handled;

	/* TODO: Check if need to increment number of frags */
	frag = vcpu->mmio_fragments;
	vcpu->mmio_nr_fragments = 1;
	frag->len = bytes;
	frag->gpa = gpa;
	frag->data = data;

	vcpu->mmio_needed = 1;
	vcpu->mmio_cur_fragment = 0;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = min(8u, frag->len);
	vcpu->run->mmio.is_write = 1;
	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);

int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
			 void *data)
{
	int handled;
	struct kvm_mmio_fragment *frag;

	if (!data)
		return -EINVAL;

	handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
	if (handled == bytes)
		return 1;

	bytes -= handled;
	gpa += handled;
	data += handled;

	/* TODO: Check if need to increment number of frags */
	frag = vcpu->mmio_fragments;
	vcpu->mmio_nr_fragments = 1;
	frag->len = bytes;
	frag->gpa = gpa;
	frag->data = data;

	vcpu->mmio_needed = 1;
	vcpu->mmio_cur_fragment = 0;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = min(8u, frag->len);
	vcpu->run->mmio.is_write = 0;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
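
/*
 * Illustrative sketch of the SEV-ES round trip implemented above and by the
 * string-I/O helpers below (the GHCB-handling caller lives in vendor code and
 * is an assumption here, as is the scratch-buffer name):
 *
 *	r = kvm_sev_es_mmio_read(vcpu, gpa, bytes, ghcb_scratch);
 *	if (r == 1)
 *		the access was fully handled in the kernel;
 *	else if (r == 0)
 *		vcpu->run describes a KVM_EXIT_MMIO of at most 8 bytes, and
 *		complete_sev_es_emulated_mmio() resumes with the next fragment
 *		once userspace has filled in run->mmio.data.
 */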

static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
	memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
	       vcpu->arch.pio.count * vcpu->arch.pio.size);
	vcpu->arch.pio.count = 0;

	return 1;
}

static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
			   unsigned int port, void *data, unsigned int count)
{
	int ret;

	ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
					data, count);
	if (ret)
		return ret;

	vcpu->arch.pio.count = 0;

	return 0;
}

static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
			  unsigned int port, void *data, unsigned int count)
{
	int ret;

	ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
				       data, count);
	if (ret) {
		vcpu->arch.pio.count = 0;
	} else {
		vcpu->arch.guest_ins_data = data;
		vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
	}

	return 0;
}

int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
			 unsigned int port, void *data, unsigned int count,
			 int in)
{
	return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
		  : kvm_sev_es_outs(vcpu, size, port, data, count);
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);

EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);