1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * derived from drivers/kvm/kvm_main.c 5 * 6 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc. 8 * Copyright IBM Corporation, 2008 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 * Amit Shah <amit.shah@qumranet.com> 15 * Ben-Ami Yassour <benami@il.ibm.com> 16 * 17 * This work is licensed under the terms of the GNU GPL, version 2. See 18 * the COPYING file in the top-level directory. 19 * 20 */ 21 22 #include <linux/kvm_host.h> 23 #include "irq.h" 24 #include "mmu.h" 25 #include "i8254.h" 26 #include "tss.h" 27 #include "kvm_cache_regs.h" 28 #include "x86.h" 29 30 #include <linux/clocksource.h> 31 #include <linux/interrupt.h> 32 #include <linux/kvm.h> 33 #include <linux/fs.h> 34 #include <linux/vmalloc.h> 35 #include <linux/module.h> 36 #include <linux/mman.h> 37 #include <linux/highmem.h> 38 #include <linux/iommu.h> 39 #include <linux/intel-iommu.h> 40 #include <linux/cpufreq.h> 41 #include <linux/user-return-notifier.h> 42 #include <linux/srcu.h> 43 #include <linux/slab.h> 44 #include <linux/perf_event.h> 45 #include <linux/uaccess.h> 46 #include <linux/hash.h> 47 #include <trace/events/kvm.h> 48 49 #define CREATE_TRACE_POINTS 50 #include "trace.h" 51 52 #include <asm/debugreg.h> 53 #include <asm/msr.h> 54 #include <asm/desc.h> 55 #include <asm/mtrr.h> 56 #include <asm/mce.h> 57 #include <asm/i387.h> 58 #include <asm/xcr.h> 59 #include <asm/pvclock.h> 60 #include <asm/div64.h> 61 62 #define MAX_IO_MSRS 256 63 #define CR0_RESERVED_BITS \ 64 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 65 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 66 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 67 #define CR4_RESERVED_BITS \ 68 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 69 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 70 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 71 | X86_CR4_OSXSAVE \ 72 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 73 74 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 75 76 #define KVM_MAX_MCE_BANKS 32 77 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) 78 79 /* EFER defaults: 80 * - enable syscall per default because its emulated by KVM 81 * - enable LME and LMA per default on 64 bit KVM 82 */ 83 #ifdef CONFIG_X86_64 84 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 85 #else 86 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 87 #endif 88 89 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 90 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 91 92 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 93 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 94 struct kvm_cpuid_entry2 __user *entries); 95 96 struct kvm_x86_ops *kvm_x86_ops; 97 EXPORT_SYMBOL_GPL(kvm_x86_ops); 98 99 int ignore_msrs = 0; 100 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 101 102 #define KVM_NR_SHARED_MSRS 16 103 104 struct kvm_shared_msrs_global { 105 int nr; 106 u32 msrs[KVM_NR_SHARED_MSRS]; 107 }; 108 109 struct kvm_shared_msrs { 110 struct user_return_notifier urn; 111 bool registered; 112 struct kvm_shared_msr_values { 113 u64 host; 114 u64 curr; 115 } values[KVM_NR_SHARED_MSRS]; 116 }; 117 118 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 119 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); 120 121 
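/*
 * How the shared MSR machinery above is used (illustration only; the
 * exact slots and masks are chosen by the vendor modules): vendor code
 * declares a slot once with kvm_define_shared_msr() at init time, and
 * kvm_set_shared_msr() then writes the guest value lazily, registering a
 * user-return notifier so that kvm_on_user_return() can restore the host
 * value before returning to userspace, e.g.:
 *
 *	kvm_define_shared_msr(0, MSR_STAR);
 *	...
 *	kvm_set_shared_msr(0, guest_star, -1ull);
 */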
struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

u64 __read_mostly host_xcr0;

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	struct kvm_shared_msr_values *values;

	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		values = &locals->values[slot];
		if (values->host != values->curr) {
			wrmsrl(shared_msrs_global.msrs[slot], values->host);
			values->curr = values->host;
		}
	}
	locals->registered = false;
	user_return_notifier_unregister(urn);
}

static void shared_msr_update(unsigned slot, u32 msr)
{
	struct kvm_shared_msrs *smsr;
	u64 value;

	smsr = &__get_cpu_var(shared_msrs);
	/* only read here, and nobody should modify it at this time,
	 * so no locking is needed */
	if (slot >= shared_msrs_global.nr) {
		printk(KERN_ERR "kvm: invalid MSR slot!");
		return;
	}
	rdmsrl_safe(msr, &value);
	smsr->values[slot].host = value;
	smsr->values[slot].curr = value;
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
	shared_msrs_global.msrs[slot] = msr;
	/* make sure shared_msrs_global has been updated before it is read */
	smp_wmb();
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;

	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);

	if (((value ^ smsr->values[slot].curr) & mask) == 0)
return; 225 smsr->values[slot].curr = value; 226 wrmsrl(shared_msrs_global.msrs[slot], value); 227 if (!smsr->registered) { 228 smsr->urn.on_user_return = kvm_on_user_return; 229 user_return_notifier_register(&smsr->urn); 230 smsr->registered = true; 231 } 232 } 233 EXPORT_SYMBOL_GPL(kvm_set_shared_msr); 234 235 static void drop_user_return_notifiers(void *ignore) 236 { 237 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 238 239 if (smsr->registered) 240 kvm_on_user_return(&smsr->urn); 241 } 242 243 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 244 { 245 if (irqchip_in_kernel(vcpu->kvm)) 246 return vcpu->arch.apic_base; 247 else 248 return vcpu->arch.apic_base; 249 } 250 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 251 252 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 253 { 254 /* TODO: reserve bits check */ 255 if (irqchip_in_kernel(vcpu->kvm)) 256 kvm_lapic_set_base(vcpu, data); 257 else 258 vcpu->arch.apic_base = data; 259 } 260 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 261 262 #define EXCPT_BENIGN 0 263 #define EXCPT_CONTRIBUTORY 1 264 #define EXCPT_PF 2 265 266 static int exception_class(int vector) 267 { 268 switch (vector) { 269 case PF_VECTOR: 270 return EXCPT_PF; 271 case DE_VECTOR: 272 case TS_VECTOR: 273 case NP_VECTOR: 274 case SS_VECTOR: 275 case GP_VECTOR: 276 return EXCPT_CONTRIBUTORY; 277 default: 278 break; 279 } 280 return EXCPT_BENIGN; 281 } 282 283 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 284 unsigned nr, bool has_error, u32 error_code, 285 bool reinject) 286 { 287 u32 prev_nr; 288 int class1, class2; 289 290 kvm_make_request(KVM_REQ_EVENT, vcpu); 291 292 if (!vcpu->arch.exception.pending) { 293 queue: 294 vcpu->arch.exception.pending = true; 295 vcpu->arch.exception.has_error_code = has_error; 296 vcpu->arch.exception.nr = nr; 297 vcpu->arch.exception.error_code = error_code; 298 vcpu->arch.exception.reinject = reinject; 299 return; 300 } 301 302 /* to check exception */ 303 prev_nr = vcpu->arch.exception.nr; 304 if (prev_nr == DF_VECTOR) { 305 /* triple fault -> shutdown */ 306 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 307 return; 308 } 309 class1 = exception_class(prev_nr); 310 class2 = exception_class(nr); 311 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 312 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 313 /* generate double fault per SDM Table 5-5 */ 314 vcpu->arch.exception.pending = true; 315 vcpu->arch.exception.has_error_code = true; 316 vcpu->arch.exception.nr = DF_VECTOR; 317 vcpu->arch.exception.error_code = 0; 318 } else 319 /* replace previous exception with a new one in a hope 320 that instruction re-execution will regenerate lost 321 exception */ 322 goto queue; 323 } 324 325 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 326 { 327 kvm_multiple_exception(vcpu, nr, false, 0, false); 328 } 329 EXPORT_SYMBOL_GPL(kvm_queue_exception); 330 331 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) 332 { 333 kvm_multiple_exception(vcpu, nr, false, 0, true); 334 } 335 EXPORT_SYMBOL_GPL(kvm_requeue_exception); 336 337 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) 338 { 339 if (err) 340 kvm_inject_gp(vcpu, 0); 341 else 342 kvm_x86_ops->skip_emulated_instruction(vcpu); 343 } 344 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); 345 346 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 347 { 348 ++vcpu->stat.pf_guest; 349 vcpu->arch.cr2 = fault->address; 350 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 351 } 352 353 void 
kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
	else
		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

/*
 * This function is used to read from the physical memory of the currently
 * running guest.  The difference from kvm_read_guest_page is that this
 * function can also read from the guest's nested (guest physical) memory,
 * using the given MMU for the translation.
 */
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gfn_t ngfn, void *data, int offset, int len,
			    u32 access)
{
	gfn_t real_gfn;
	gpa_t ngpa;

	ngpa = gfn_to_gpa(ngfn);
	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
	if (real_gfn == UNMAPPED_GVA)
		return -EFAULT;

	real_gfn = gpa_to_gfn(real_gfn);

	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);

int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			       void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}

/*
 * Load the PAE pdptrs.  Return true if they are all valid.
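 * (The PDPT is a 32-byte-aligned table of four 64-bit entries inside the
 * page addressed by CR3; the offset computation below recovers that
 * 32-byte slot from bits 5..11 of CR3, and the four entries are read in
 * a single guest-memory access.)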
425 */ 426 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) 427 { 428 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 429 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 430 int i; 431 int ret; 432 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; 433 434 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, 435 offset * sizeof(u64), sizeof(pdpte), 436 PFERR_USER_MASK|PFERR_WRITE_MASK); 437 if (ret < 0) { 438 ret = 0; 439 goto out; 440 } 441 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 442 if (is_present_gpte(pdpte[i]) && 443 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 444 ret = 0; 445 goto out; 446 } 447 } 448 ret = 1; 449 450 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); 451 __set_bit(VCPU_EXREG_PDPTR, 452 (unsigned long *)&vcpu->arch.regs_avail); 453 __set_bit(VCPU_EXREG_PDPTR, 454 (unsigned long *)&vcpu->arch.regs_dirty); 455 out: 456 457 return ret; 458 } 459 EXPORT_SYMBOL_GPL(load_pdptrs); 460 461 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 462 { 463 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; 464 bool changed = true; 465 int offset; 466 gfn_t gfn; 467 int r; 468 469 if (is_long_mode(vcpu) || !is_pae(vcpu)) 470 return false; 471 472 if (!test_bit(VCPU_EXREG_PDPTR, 473 (unsigned long *)&vcpu->arch.regs_avail)) 474 return true; 475 476 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; 477 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); 478 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 479 PFERR_USER_MASK | PFERR_WRITE_MASK); 480 if (r < 0) 481 goto out; 482 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; 483 out: 484 485 return changed; 486 } 487 488 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 489 { 490 unsigned long old_cr0 = kvm_read_cr0(vcpu); 491 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | 492 X86_CR0_CD | X86_CR0_NW; 493 494 cr0 |= X86_CR0_ET; 495 496 #ifdef CONFIG_X86_64 497 if (cr0 & 0xffffffff00000000UL) 498 return 1; 499 #endif 500 501 cr0 &= ~CR0_RESERVED_BITS; 502 503 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 504 return 1; 505 506 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 507 return 1; 508 509 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 510 #ifdef CONFIG_X86_64 511 if ((vcpu->arch.efer & EFER_LME)) { 512 int cs_db, cs_l; 513 514 if (!is_pae(vcpu)) 515 return 1; 516 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 517 if (cs_l) 518 return 1; 519 } else 520 #endif 521 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 522 kvm_read_cr3(vcpu))) 523 return 1; 524 } 525 526 kvm_x86_ops->set_cr0(vcpu, cr0); 527 528 if ((cr0 ^ old_cr0) & X86_CR0_PG) 529 kvm_clear_async_pf_completion_queue(vcpu); 530 531 if ((cr0 ^ old_cr0) & update_bits) 532 kvm_mmu_reset_context(vcpu); 533 return 0; 534 } 535 EXPORT_SYMBOL_GPL(kvm_set_cr0); 536 537 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 538 { 539 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 540 } 541 EXPORT_SYMBOL_GPL(kvm_lmsw); 542 543 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 544 { 545 u64 xcr0; 546 547 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 548 if (index != XCR_XFEATURE_ENABLED_MASK) 549 return 1; 550 xcr0 = xcr; 551 if (kvm_x86_ops->get_cpl(vcpu) != 0) 552 return 1; 553 if (!(xcr0 & XSTATE_FP)) 554 return 1; 555 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 556 return 1; 557 if (xcr0 & ~host_xcr0) 558 return 1; 559 vcpu->arch.xcr0 = xcr0; 560 vcpu->guest_xcr0_loaded = 0; 561 return 0; 562 } 563 564 int kvm_set_xcr(struct 
kvm_vcpu *vcpu, u32 index, u64 xcr) 565 { 566 if (__kvm_set_xcr(vcpu, index, xcr)) { 567 kvm_inject_gp(vcpu, 0); 568 return 1; 569 } 570 return 0; 571 } 572 EXPORT_SYMBOL_GPL(kvm_set_xcr); 573 574 static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 575 { 576 struct kvm_cpuid_entry2 *best; 577 578 best = kvm_find_cpuid_entry(vcpu, 1, 0); 579 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 580 } 581 582 static void update_cpuid(struct kvm_vcpu *vcpu) 583 { 584 struct kvm_cpuid_entry2 *best; 585 586 best = kvm_find_cpuid_entry(vcpu, 1, 0); 587 if (!best) 588 return; 589 590 /* Update OSXSAVE bit */ 591 if (cpu_has_xsave && best->function == 0x1) { 592 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); 593 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) 594 best->ecx |= bit(X86_FEATURE_OSXSAVE); 595 } 596 } 597 598 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 599 { 600 unsigned long old_cr4 = kvm_read_cr4(vcpu); 601 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 602 603 if (cr4 & CR4_RESERVED_BITS) 604 return 1; 605 606 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 607 return 1; 608 609 if (is_long_mode(vcpu)) { 610 if (!(cr4 & X86_CR4_PAE)) 611 return 1; 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 613 && ((cr4 ^ old_cr4) & pdptr_bits) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 615 kvm_read_cr3(vcpu))) 616 return 1; 617 618 if (cr4 & X86_CR4_VMXE) 619 return 1; 620 621 kvm_x86_ops->set_cr4(vcpu, cr4); 622 623 if ((cr4 ^ old_cr4) & pdptr_bits) 624 kvm_mmu_reset_context(vcpu); 625 626 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 627 update_cpuid(vcpu); 628 629 return 0; 630 } 631 EXPORT_SYMBOL_GPL(kvm_set_cr4); 632 633 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 634 { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { 636 kvm_mmu_sync_roots(vcpu); 637 kvm_mmu_flush_tlb(vcpu); 638 return 0; 639 } 640 641 if (is_long_mode(vcpu)) { 642 if (cr3 & CR3_L_MODE_RESERVED_BITS) 643 return 1; 644 } else { 645 if (is_pae(vcpu)) { 646 if (cr3 & CR3_PAE_RESERVED_BITS) 647 return 1; 648 if (is_paging(vcpu) && 649 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 650 return 1; 651 } 652 /* 653 * We don't check reserved bits in nonpae mode, because 654 * this isn't enforced, and VMware depends on this. 655 */ 656 } 657 658 /* 659 * Does the new cr3 value map to physical memory? (Note, we 660 * catch an invalid cr3 even in real-mode, because it would 661 * cause trouble later on when we turn on paging anyway.) 662 * 663 * A real CPU would silently accept an invalid cr3 and would 664 * attempt to use it - with largely undefined (and often hard 665 * to debug) behavior on the guest side. 
666 */ 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 668 return 1; 669 vcpu->arch.cr3 = cr3; 670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 671 vcpu->arch.mmu.new_cr3(vcpu); 672 return 0; 673 } 674 EXPORT_SYMBOL_GPL(kvm_set_cr3); 675 676 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 677 { 678 if (cr8 & CR8_RESERVED_BITS) 679 return 1; 680 if (irqchip_in_kernel(vcpu->kvm)) 681 kvm_lapic_set_tpr(vcpu, cr8); 682 else 683 vcpu->arch.cr8 = cr8; 684 return 0; 685 } 686 EXPORT_SYMBOL_GPL(kvm_set_cr8); 687 688 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 689 { 690 if (irqchip_in_kernel(vcpu->kvm)) 691 return kvm_lapic_get_cr8(vcpu); 692 else 693 return vcpu->arch.cr8; 694 } 695 EXPORT_SYMBOL_GPL(kvm_get_cr8); 696 697 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 698 { 699 switch (dr) { 700 case 0 ... 3: 701 vcpu->arch.db[dr] = val; 702 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 703 vcpu->arch.eff_db[dr] = val; 704 break; 705 case 4: 706 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 707 return 1; /* #UD */ 708 /* fall through */ 709 case 6: 710 if (val & 0xffffffff00000000ULL) 711 return -1; /* #GP */ 712 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 713 break; 714 case 5: 715 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 716 return 1; /* #UD */ 717 /* fall through */ 718 default: /* 7 */ 719 if (val & 0xffffffff00000000ULL) 720 return -1; /* #GP */ 721 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 722 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 723 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); 724 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); 725 } 726 break; 727 } 728 729 return 0; 730 } 731 732 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 733 { 734 int res; 735 736 res = __kvm_set_dr(vcpu, dr, val); 737 if (res > 0) 738 kvm_queue_exception(vcpu, UD_VECTOR); 739 else if (res < 0) 740 kvm_inject_gp(vcpu, 0); 741 742 return res; 743 } 744 EXPORT_SYMBOL_GPL(kvm_set_dr); 745 746 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 747 { 748 switch (dr) { 749 case 0 ... 3: 750 *val = vcpu->arch.db[dr]; 751 break; 752 case 4: 753 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 754 return 1; 755 /* fall through */ 756 case 6: 757 *val = vcpu->arch.dr6; 758 break; 759 case 5: 760 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 761 return 1; 762 /* fall through */ 763 default: /* 7 */ 764 *val = vcpu->arch.dr7; 765 break; 766 } 767 768 return 0; 769 } 770 771 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 772 { 773 if (_kvm_get_dr(vcpu, dr, val)) { 774 kvm_queue_exception(vcpu, UD_VECTOR); 775 return 1; 776 } 777 return 0; 778 } 779 EXPORT_SYMBOL_GPL(kvm_get_dr); 780 781 /* 782 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 783 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 784 * 785 * This list is modified at module load time to reflect the 786 * capabilities of the host cpu. This capabilities test skips MSRs that are 787 * kvm-specific. Those are put in the beginning of the list. 
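 *
 * The first KVM_SAVE_MSRS_BEGIN entries of msrs_to_save below are those
 * kvm-specific (and Hyper-V) MSRs, so the capability test can start
 * probing after them.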
788 */ 789 790 #define KVM_SAVE_MSRS_BEGIN 8 791 static u32 msrs_to_save[] = { 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 797 MSR_STAR, 798 #ifdef CONFIG_X86_64 799 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 800 #endif 801 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 802 }; 803 804 static unsigned num_msrs_to_save; 805 806 static u32 emulated_msrs[] = { 807 MSR_IA32_MISC_ENABLE, 808 MSR_IA32_MCG_STATUS, 809 MSR_IA32_MCG_CTL, 810 }; 811 812 static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 813 { 814 u64 old_efer = vcpu->arch.efer; 815 816 if (efer & efer_reserved_bits) 817 return 1; 818 819 if (is_paging(vcpu) 820 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 821 return 1; 822 823 if (efer & EFER_FFXSR) { 824 struct kvm_cpuid_entry2 *feat; 825 826 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 827 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 828 return 1; 829 } 830 831 if (efer & EFER_SVME) { 832 struct kvm_cpuid_entry2 *feat; 833 834 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 835 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) 836 return 1; 837 } 838 839 efer &= ~EFER_LMA; 840 efer |= vcpu->arch.efer & EFER_LMA; 841 842 kvm_x86_ops->set_efer(vcpu, efer); 843 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 845 846 /* Update reserved bits */ 847 if ((efer ^ old_efer) & EFER_NX) 848 kvm_mmu_reset_context(vcpu); 849 850 return 0; 851 } 852 853 void kvm_enable_efer_bits(u64 mask) 854 { 855 efer_reserved_bits &= ~mask; 856 } 857 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 858 859 860 /* 861 * Writes msr value into into the appropriate "register". 862 * Returns 0 on success, non-0 otherwise. 863 * Assumes vcpu_load() was already called. 864 */ 865 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 866 { 867 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 868 } 869 870 /* 871 * Adapt set_msr() to msr_io()'s calling convention 872 */ 873 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 874 { 875 return kvm_set_msr(vcpu, index, *data); 876 } 877 878 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 879 { 880 int version; 881 int r; 882 struct pvclock_wall_clock wc; 883 struct timespec boot; 884 885 if (!wall_clock) 886 return; 887 888 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); 889 if (r) 890 return; 891 892 if (version & 1) 893 ++version; /* first time write, random junk */ 894 895 ++version; 896 897 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 898 899 /* 900 * The guest calculates current wall clock time by adding 901 * system time (updated by kvm_guest_time_update below) to the 902 * wall clock specified here. guest system time equals host 903 * system time for us, thus we must fill in host boot time here. 
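	 *
	 * In other words, the guest computes
	 *	wall time = wc (host boot time) + kvmclock system_time,
	 * so storing the host boot time here makes that sum come out as
	 * the current wall clock time.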
904 */ 905 getboottime(&boot); 906 907 wc.sec = boot.tv_sec; 908 wc.nsec = boot.tv_nsec; 909 wc.version = version; 910 911 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 912 913 version++; 914 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 915 } 916 917 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 918 { 919 uint32_t quotient, remainder; 920 921 /* Don't try to replace with do_div(), this one calculates 922 * "(dividend << 32) / divisor" */ 923 __asm__ ( "divl %4" 924 : "=a" (quotient), "=d" (remainder) 925 : "0" (0), "1" (dividend), "r" (divisor) ); 926 return quotient; 927 } 928 929 static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, 930 s8 *pshift, u32 *pmultiplier) 931 { 932 uint64_t scaled64; 933 int32_t shift = 0; 934 uint64_t tps64; 935 uint32_t tps32; 936 937 tps64 = base_khz * 1000LL; 938 scaled64 = scaled_khz * 1000LL; 939 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { 940 tps64 >>= 1; 941 shift--; 942 } 943 944 tps32 = (uint32_t)tps64; 945 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { 946 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) 947 scaled64 >>= 1; 948 else 949 tps32 <<= 1; 950 shift++; 951 } 952 953 *pshift = shift; 954 *pmultiplier = div_frac(scaled64, tps32); 955 956 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", 957 __func__, base_khz, scaled_khz, shift, *pmultiplier); 958 } 959 960 static inline u64 get_kernel_ns(void) 961 { 962 struct timespec ts; 963 964 WARN_ON(preemptible()); 965 ktime_get_ts(&ts); 966 monotonic_to_bootbased(&ts); 967 return timespec_to_ns(&ts); 968 } 969 970 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 971 unsigned long max_tsc_khz; 972 973 static inline int kvm_tsc_changes_freq(void) 974 { 975 int cpu = get_cpu(); 976 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && 977 cpufreq_quick_get(cpu) != 0; 978 put_cpu(); 979 return ret; 980 } 981 982 static inline u64 nsec_to_cycles(u64 nsec) 983 { 984 u64 ret; 985 986 WARN_ON(preemptible()); 987 if (kvm_tsc_changes_freq()) 988 printk_once(KERN_WARNING 989 "kvm: unreliable cycle conversion on adjustable rate TSC\n"); 990 ret = nsec * __this_cpu_read(cpu_tsc_khz); 991 do_div(ret, USEC_PER_SEC); 992 return ret; 993 } 994 995 static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) 996 { 997 /* Compute a scale to convert nanoseconds in TSC cycles */ 998 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 999 &kvm->arch.virtual_tsc_shift, 1000 &kvm->arch.virtual_tsc_mult); 1001 kvm->arch.virtual_tsc_khz = this_tsc_khz; 1002 } 1003 1004 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1005 { 1006 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, 1007 vcpu->kvm->arch.virtual_tsc_mult, 1008 vcpu->kvm->arch.virtual_tsc_shift); 1009 tsc += vcpu->arch.last_tsc_write; 1010 return tsc; 1011 } 1012 1013 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) 1014 { 1015 struct kvm *kvm = vcpu->kvm; 1016 u64 offset, ns, elapsed; 1017 unsigned long flags; 1018 s64 sdiff; 1019 1020 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1021 offset = data - native_read_tsc(); 1022 ns = get_kernel_ns(); 1023 elapsed = ns - kvm->arch.last_tsc_nsec; 1024 sdiff = data - kvm->arch.last_tsc_write; 1025 if (sdiff < 0) 1026 sdiff = -sdiff; 1027 1028 /* 1029 * Special case: close write to TSC within 5 seconds of 1030 * another CPU is interpreted as an attempt to synchronize 1031 * The 5 seconds is to accomodate host load / swapping as 1032 * well as any reset of TSC during 
the boot process.
	 *
	 * In that case, for a reliable TSC, we can match TSC offsets,
	 * or make a best guess using the elapsed value.
	 */
	if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
	    elapsed < 5ULL * NSEC_PER_SEC) {
		if (!check_tsc_unstable()) {
			offset = kvm->arch.last_tsc_offset;
			pr_debug("kvm: matched tsc offset for %llu\n", data);
		} else {
			u64 delta = nsec_to_cycles(elapsed);
			offset += delta;
			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
		}
		ns = kvm->arch.last_tsc_nsec;
	}
	kvm->arch.last_tsc_nsec = ns;
	kvm->arch.last_tsc_write = data;
	kvm->arch.last_tsc_offset = offset;
	kvm_x86_ops->write_tsc_offset(vcpu, offset);
	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);

	/* Reset of TSC must disable overshoot protection below */
	vcpu->arch.hv_clock.tsc_timestamp = 0;
	vcpu->arch.last_tsc_write = data;
	vcpu->arch.last_tsc_nsec = ns;
}
EXPORT_SYMBOL_GPL(kvm_write_tsc);

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;
	unsigned long this_tsc_khz;
	s64 kernel_ns, max_kernel_ns;
	u64 tsc_timestamp;

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
	kernel_ns = get_kernel_ns();
	this_tsc_khz = __this_cpu_read(cpu_tsc_khz);

	if (unlikely(this_tsc_khz == 0)) {
		local_irq_restore(flags);
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 *   1) CPU could have been running below the maximum TSC rate
	 *   2) Broken TSC compensation resets the base at each VCPU
	 *      entry to avoid unknown leaps of TSC even when running
	 *      again on the same CPU.  This may cause apparent elapsed
	 *      time to disappear, and the guest to stand still or run
	 *      very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns);
		if (tsc > tsc_timestamp) {
			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
			tsc_timestamp = tsc;
		}
	}

	local_irq_restore(flags);

	if (!vcpu->time_page)
		return 0;

	/*
	 * Time as measured by the TSC may go backwards when resetting the base
	 * tsc_timestamp.  The reason for this is that the TSC resolution is
	 * higher than the resolution of the other clock scales.  Thus, many
	 * possible measurements of the TSC correspond to one measurement of any
	 * other clock, and so a spread of values is possible.  This is not a
	 * problem for the computation of the nanosecond clock; with TSC rates
	 * around 1GHZ, there can only be a few cycles which correspond to one
	 * nanosecond value, and any path through this code will inevitably
	 * take longer than that.  However, with the kernel_ns value itself,
	 * the precision may be much lower, down to HZ granularity.
If the 1117 * first sampling of TSC against kernel_ns ends in the low part of the 1118 * range, and the second in the high end of the range, we can get: 1119 * 1120 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new 1121 * 1122 * As the sampling errors potentially range in the thousands of cycles, 1123 * it is possible such a time value has already been observed by the 1124 * guest. To protect against this, we must compute the system time as 1125 * observed by the guest and ensure the new system time is greater. 1126 */ 1127 max_kernel_ns = 0; 1128 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { 1129 max_kernel_ns = vcpu->last_guest_tsc - 1130 vcpu->hv_clock.tsc_timestamp; 1131 max_kernel_ns = pvclock_scale_delta(max_kernel_ns, 1132 vcpu->hv_clock.tsc_to_system_mul, 1133 vcpu->hv_clock.tsc_shift); 1134 max_kernel_ns += vcpu->last_kernel_ns; 1135 } 1136 1137 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { 1138 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, 1139 &vcpu->hv_clock.tsc_shift, 1140 &vcpu->hv_clock.tsc_to_system_mul); 1141 vcpu->hw_tsc_khz = this_tsc_khz; 1142 } 1143 1144 if (max_kernel_ns > kernel_ns) 1145 kernel_ns = max_kernel_ns; 1146 1147 /* With all the info we got, fill in the values */ 1148 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1149 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1150 vcpu->last_kernel_ns = kernel_ns; 1151 vcpu->last_guest_tsc = tsc_timestamp; 1152 vcpu->hv_clock.flags = 0; 1153 1154 /* 1155 * The interface expects us to write an even number signaling that the 1156 * update is finished. Since the guest won't see the intermediate 1157 * state, we just increase by 2 at the end. 1158 */ 1159 vcpu->hv_clock.version += 2; 1160 1161 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 1162 1163 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1164 sizeof(vcpu->hv_clock)); 1165 1166 kunmap_atomic(shared_kaddr, KM_USER0); 1167 1168 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1169 return 0; 1170 } 1171 1172 static bool msr_mtrr_valid(unsigned msr) 1173 { 1174 switch (msr) { 1175 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 1176 case MSR_MTRRfix64K_00000: 1177 case MSR_MTRRfix16K_80000: 1178 case MSR_MTRRfix16K_A0000: 1179 case MSR_MTRRfix4K_C0000: 1180 case MSR_MTRRfix4K_C8000: 1181 case MSR_MTRRfix4K_D0000: 1182 case MSR_MTRRfix4K_D8000: 1183 case MSR_MTRRfix4K_E0000: 1184 case MSR_MTRRfix4K_E8000: 1185 case MSR_MTRRfix4K_F0000: 1186 case MSR_MTRRfix4K_F8000: 1187 case MSR_MTRRdefType: 1188 case MSR_IA32_CR_PAT: 1189 return true; 1190 case 0x2f8: 1191 return true; 1192 } 1193 return false; 1194 } 1195 1196 static bool valid_pat_type(unsigned t) 1197 { 1198 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 1199 } 1200 1201 static bool valid_mtrr_type(unsigned t) 1202 { 1203 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 1204 } 1205 1206 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1207 { 1208 int i; 1209 1210 if (!msr_mtrr_valid(msr)) 1211 return false; 1212 1213 if (msr == MSR_IA32_CR_PAT) { 1214 for (i = 0; i < 8; i++) 1215 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 1216 return false; 1217 return true; 1218 } else if (msr == MSR_MTRRdefType) { 1219 if (data & ~0xcff) 1220 return false; 1221 return valid_mtrr_type(data & 0xff); 1222 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 1223 for (i = 0; i < 8 ; i++) 1224 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 1225 return false; 1226 return true; 1227 } 1228 1229 /* variable MTRRs */ 1230 return valid_mtrr_type(data & 0xff); 1231 } 1232 1233 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1234 { 1235 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1236 1237 if (!mtrr_valid(vcpu, msr, data)) 1238 return 1; 1239 1240 if (msr == MSR_MTRRdefType) { 1241 vcpu->arch.mtrr_state.def_type = data; 1242 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 1243 } else if (msr == MSR_MTRRfix64K_00000) 1244 p[0] = data; 1245 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1246 p[1 + msr - MSR_MTRRfix16K_80000] = data; 1247 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1248 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 1249 else if (msr == MSR_IA32_CR_PAT) 1250 vcpu->arch.pat = data; 1251 else { /* Variable MTRRs */ 1252 int idx, is_mtrr_mask; 1253 u64 *pt; 1254 1255 idx = (msr - 0x200) / 2; 1256 is_mtrr_mask = msr - 0x200 - 2 * idx; 1257 if (!is_mtrr_mask) 1258 pt = 1259 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1260 else 1261 pt = 1262 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1263 *pt = data; 1264 } 1265 1266 kvm_mmu_reset_context(vcpu); 1267 return 0; 1268 } 1269 1270 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1271 { 1272 u64 mcg_cap = vcpu->arch.mcg_cap; 1273 unsigned bank_num = mcg_cap & 0xff; 1274 1275 switch (msr) { 1276 case MSR_IA32_MCG_STATUS: 1277 vcpu->arch.mcg_status = data; 1278 break; 1279 case MSR_IA32_MCG_CTL: 1280 if (!(mcg_cap & MCG_CTL_P)) 1281 return 1; 1282 if (data != 0 && data != ~(u64)0) 1283 return -1; 1284 vcpu->arch.mcg_ctl = data; 1285 break; 1286 default: 1287 if (msr >= MSR_IA32_MC0_CTL && 1288 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1289 u32 offset = msr - MSR_IA32_MC0_CTL; 1290 /* only 0 or all 1s can be written to IA32_MCi_CTL 1291 * some Linux kernels though clear bit 10 in bank 4 to 1292 * workaround a BIOS/GART TBL issue on AMD K8s, ignore 1293 * this to avoid an uncatched #GP in the guest 1294 */ 1295 if ((offset & 0x3) == 0 && 1296 data != 0 && (data | (1 << 10)) != ~(u64)0) 1297 return -1; 1298 vcpu->arch.mce_banks[offset] = data; 
1299 break; 1300 } 1301 return 1; 1302 } 1303 return 0; 1304 } 1305 1306 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 1307 { 1308 struct kvm *kvm = vcpu->kvm; 1309 int lm = is_long_mode(vcpu); 1310 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 1311 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 1312 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 1313 : kvm->arch.xen_hvm_config.blob_size_32; 1314 u32 page_num = data & ~PAGE_MASK; 1315 u64 page_addr = data & PAGE_MASK; 1316 u8 *page; 1317 int r; 1318 1319 r = -E2BIG; 1320 if (page_num >= blob_size) 1321 goto out; 1322 r = -ENOMEM; 1323 page = kzalloc(PAGE_SIZE, GFP_KERNEL); 1324 if (!page) 1325 goto out; 1326 r = -EFAULT; 1327 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) 1328 goto out_free; 1329 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 1330 goto out_free; 1331 r = 0; 1332 out_free: 1333 kfree(page); 1334 out: 1335 return r; 1336 } 1337 1338 static bool kvm_hv_hypercall_enabled(struct kvm *kvm) 1339 { 1340 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; 1341 } 1342 1343 static bool kvm_hv_msr_partition_wide(u32 msr) 1344 { 1345 bool r = false; 1346 switch (msr) { 1347 case HV_X64_MSR_GUEST_OS_ID: 1348 case HV_X64_MSR_HYPERCALL: 1349 r = true; 1350 break; 1351 } 1352 1353 return r; 1354 } 1355 1356 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1357 { 1358 struct kvm *kvm = vcpu->kvm; 1359 1360 switch (msr) { 1361 case HV_X64_MSR_GUEST_OS_ID: 1362 kvm->arch.hv_guest_os_id = data; 1363 /* setting guest os id to zero disables hypercall page */ 1364 if (!kvm->arch.hv_guest_os_id) 1365 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; 1366 break; 1367 case HV_X64_MSR_HYPERCALL: { 1368 u64 gfn; 1369 unsigned long addr; 1370 u8 instructions[4]; 1371 1372 /* if guest os id is not set hypercall should remain disabled */ 1373 if (!kvm->arch.hv_guest_os_id) 1374 break; 1375 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { 1376 kvm->arch.hv_hypercall = data; 1377 break; 1378 } 1379 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; 1380 addr = gfn_to_hva(kvm, gfn); 1381 if (kvm_is_error_hva(addr)) 1382 return 1; 1383 kvm_x86_ops->patch_hypercall(vcpu, instructions); 1384 ((unsigned char *)instructions)[3] = 0xc3; /* ret */ 1385 if (copy_to_user((void __user *)addr, instructions, 4)) 1386 return 1; 1387 kvm->arch.hv_hypercall = data; 1388 break; 1389 } 1390 default: 1391 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1392 "data 0x%llx\n", msr, data); 1393 return 1; 1394 } 1395 return 0; 1396 } 1397 1398 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1399 { 1400 switch (msr) { 1401 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1402 unsigned long addr; 1403 1404 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1405 vcpu->arch.hv_vapic = data; 1406 break; 1407 } 1408 addr = gfn_to_hva(vcpu->kvm, data >> 1409 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1410 if (kvm_is_error_hva(addr)) 1411 return 1; 1412 if (clear_user((void __user *)addr, PAGE_SIZE)) 1413 return 1; 1414 vcpu->arch.hv_vapic = data; 1415 break; 1416 } 1417 case HV_X64_MSR_EOI: 1418 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); 1419 case HV_X64_MSR_ICR: 1420 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); 1421 case HV_X64_MSR_TPR: 1422 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1423 default: 1424 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1425 "data 0x%llx\n", msr, data); 1426 return 1; 1427 } 1428 
	return 0;
}

static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
	gpa_t gpa = data & ~0x3f;

	/* Bits 2:5 are reserved and should be zero */
	if (data & 0x3c)
		return 1;

	vcpu->arch.apf.msr_val = data;

	if (!(data & KVM_ASYNC_PF_ENABLED)) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);
		return 0;
	}

	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
		return 1;

	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
	kvm_async_pf_wakeup_all(vcpu);
	return 0;
}

int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
	case MSR_EFER:
		return set_efer(vcpu, data);
	case MSR_K7_HWCR:
		data &= ~(u64)0x40;	/* ignore flush filter disable */
		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
		if (data != 0) {
			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				data);
			return 1;
		}
		break;
	case MSR_FAM10H_MMIO_CONF_BASE:
		if (data != 0) {
			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
				"0x%llx\n", data);
			return 1;
		}
		break;
	case MSR_AMD64_NB_CFG:
		break;
	case MSR_IA32_DEBUGCTLMSR:
		if (!data) {
			/* We support the non-activated case already */
			break;
		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
			/* Values other than LBR and BTF are vendor-specific,
			   thus reserved and should throw a #GP */
			return 1;
		}
		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
			__func__, data);
		break;
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case MSR_VM_HSAVE_PA:
	case MSR_AMD64_PATCH_LOADER:
		break;
	case 0x200 ... 0x2ff:
		return set_msr_mtrr(vcpu, msr, data);
	case MSR_IA32_APICBASE:
		kvm_set_apic_base(vcpu, data);
		break;
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
		return kvm_x2apic_msr_write(vcpu, msr, data);
	case MSR_IA32_MISC_ENABLE:
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
	case MSR_KVM_WALL_CLOCK_NEW:
	case MSR_KVM_WALL_CLOCK:
		vcpu->kvm->arch.wall_clock = data;
		kvm_write_wall_clock(vcpu->kvm, data);
		break;
	case MSR_KVM_SYSTEM_TIME_NEW:
	case MSR_KVM_SYSTEM_TIME: {
		if (vcpu->arch.time_page) {
			kvm_release_page_dirty(vcpu->arch.time_page);
			vcpu->arch.time_page = NULL;
		}

		vcpu->arch.time = data;
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

		/* we verify if the enable bit is set... */
		if (!(data & 1))
			break;

		/* ...but clean it before doing the actual write */
		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);

		vcpu->arch.time_page =
				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);

		if (is_error_page(vcpu->arch.time_page)) {
			kvm_release_page_clean(vcpu->arch.time_page);
			vcpu->arch.time_page = NULL;
		}
		break;
	}
	case MSR_KVM_ASYNC_PF_EN:
		if (kvm_pv_enable_async_pf(vcpu, data))
			return 1;
		break;
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
		return set_msr_mce(vcpu, msr, data);

	/* Performance counters are not protected by a CPUID bit,
	 * so we should check all of them in the generic path for the sake of
	 * cross vendor migration.
	 * Writing a zero into the event select MSRs disables them,
	 * which we perfectly emulate ;-).  Any other value should be at least
	 * reported, some guests depend on them.
	 */
	case MSR_P6_EVNTSEL0:
	case MSR_P6_EVNTSEL1:
	case MSR_K7_EVNTSEL0:
	case MSR_K7_EVNTSEL1:
	case MSR_K7_EVNTSEL2:
	case MSR_K7_EVNTSEL3:
		if (data != 0)
			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
				"0x%x data 0x%llx\n", msr, data);
		break;
	/* at least RHEL 4 unconditionally writes to the perfctr registers,
	 * so we ignore writes to make it happy.
	 */
	case MSR_P6_PERFCTR0:
	case MSR_P6_PERFCTR1:
	case MSR_K7_PERFCTR0:
	case MSR_K7_PERFCTR1:
	case MSR_K7_PERFCTR2:
	case MSR_K7_PERFCTR3:
		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
			"0x%x data 0x%llx\n", msr, data);
		break;
	case MSR_K7_CLK_CTL:
		/*
		 * Ignore all writes to this no longer documented MSR.
		 * Writes are only relevant for old K7 processors,
		 * all pre-dating SVM, but a recommended workaround from
		 * AMD for these chips.  It is possible to specify the
		 * affected processor models on the command line, hence
		 * the need to ignore the workaround.
		 */
		break;
	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
		if (kvm_hv_msr_partition_wide(msr)) {
			int r;
			mutex_lock(&vcpu->kvm->lock);
			r = set_msr_hyperv_pw(vcpu, msr, data);
			mutex_unlock(&vcpu->kvm->lock);
			return r;
		} else
			return set_msr_hyperv(vcpu, msr, data);
		break;
	default:
		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
			return xen_hvm_config(vcpu, data);
		if (!ignore_msrs) {
			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
				msr, data);
			return 1;
		} else {
			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
				msr, data);
			break;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);


/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
1617 */ 1618 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1619 { 1620 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 1621 } 1622 1623 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1624 { 1625 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1626 1627 if (!msr_mtrr_valid(msr)) 1628 return 1; 1629 1630 if (msr == MSR_MTRRdefType) 1631 *pdata = vcpu->arch.mtrr_state.def_type + 1632 (vcpu->arch.mtrr_state.enabled << 10); 1633 else if (msr == MSR_MTRRfix64K_00000) 1634 *pdata = p[0]; 1635 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1636 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 1637 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1638 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 1639 else if (msr == MSR_IA32_CR_PAT) 1640 *pdata = vcpu->arch.pat; 1641 else { /* Variable MTRRs */ 1642 int idx, is_mtrr_mask; 1643 u64 *pt; 1644 1645 idx = (msr - 0x200) / 2; 1646 is_mtrr_mask = msr - 0x200 - 2 * idx; 1647 if (!is_mtrr_mask) 1648 pt = 1649 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1650 else 1651 pt = 1652 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1653 *pdata = *pt; 1654 } 1655 1656 return 0; 1657 } 1658 1659 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1660 { 1661 u64 data; 1662 u64 mcg_cap = vcpu->arch.mcg_cap; 1663 unsigned bank_num = mcg_cap & 0xff; 1664 1665 switch (msr) { 1666 case MSR_IA32_P5_MC_ADDR: 1667 case MSR_IA32_P5_MC_TYPE: 1668 data = 0; 1669 break; 1670 case MSR_IA32_MCG_CAP: 1671 data = vcpu->arch.mcg_cap; 1672 break; 1673 case MSR_IA32_MCG_CTL: 1674 if (!(mcg_cap & MCG_CTL_P)) 1675 return 1; 1676 data = vcpu->arch.mcg_ctl; 1677 break; 1678 case MSR_IA32_MCG_STATUS: 1679 data = vcpu->arch.mcg_status; 1680 break; 1681 default: 1682 if (msr >= MSR_IA32_MC0_CTL && 1683 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1684 u32 offset = msr - MSR_IA32_MC0_CTL; 1685 data = vcpu->arch.mce_banks[offset]; 1686 break; 1687 } 1688 return 1; 1689 } 1690 *pdata = data; 1691 return 0; 1692 } 1693 1694 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1695 { 1696 u64 data = 0; 1697 struct kvm *kvm = vcpu->kvm; 1698 1699 switch (msr) { 1700 case HV_X64_MSR_GUEST_OS_ID: 1701 data = kvm->arch.hv_guest_os_id; 1702 break; 1703 case HV_X64_MSR_HYPERCALL: 1704 data = kvm->arch.hv_hypercall; 1705 break; 1706 default: 1707 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1708 return 1; 1709 } 1710 1711 *pdata = data; 1712 return 0; 1713 } 1714 1715 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1716 { 1717 u64 data = 0; 1718 1719 switch (msr) { 1720 case HV_X64_MSR_VP_INDEX: { 1721 int r; 1722 struct kvm_vcpu *v; 1723 kvm_for_each_vcpu(r, v, vcpu->kvm) 1724 if (v == vcpu) 1725 data = r; 1726 break; 1727 } 1728 case HV_X64_MSR_EOI: 1729 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); 1730 case HV_X64_MSR_ICR: 1731 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); 1732 case HV_X64_MSR_TPR: 1733 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); 1734 default: 1735 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1736 return 1; 1737 } 1738 *pdata = data; 1739 return 0; 1740 } 1741 1742 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1743 { 1744 u64 data; 1745 1746 switch (msr) { 1747 case MSR_IA32_PLATFORM_ID: 1748 case MSR_IA32_UCODE_REV: 1749 case MSR_IA32_EBL_CR_POWERON: 1750 case MSR_IA32_DEBUGCTLMSR: 1751 case MSR_IA32_LASTBRANCHFROMIP: 1752 case MSR_IA32_LASTBRANCHTOIP: 1753 
case MSR_IA32_LASTINTFROMIP: 1754 case MSR_IA32_LASTINTTOIP: 1755 case MSR_K8_SYSCFG: 1756 case MSR_K7_HWCR: 1757 case MSR_VM_HSAVE_PA: 1758 case MSR_P6_PERFCTR0: 1759 case MSR_P6_PERFCTR1: 1760 case MSR_P6_EVNTSEL0: 1761 case MSR_P6_EVNTSEL1: 1762 case MSR_K7_EVNTSEL0: 1763 case MSR_K7_PERFCTR0: 1764 case MSR_K8_INT_PENDING_MSG: 1765 case MSR_AMD64_NB_CFG: 1766 case MSR_FAM10H_MMIO_CONF_BASE: 1767 data = 0; 1768 break; 1769 case MSR_MTRRcap: 1770 data = 0x500 | KVM_NR_VAR_MTRR; 1771 break; 1772 case 0x200 ... 0x2ff: 1773 return get_msr_mtrr(vcpu, msr, pdata); 1774 case 0xcd: /* fsb frequency */ 1775 data = 3; 1776 break; 1777 /* 1778 * MSR_EBC_FREQUENCY_ID 1779 * Conservative value valid for even the basic CPU models. 1780 * Models 0,1: 000 in bits 23:21 indicating a bus speed of 1781 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, 1782 * and 266MHz for model 3, or 4. Set Core Clock 1783 * Frequency to System Bus Frequency Ratio to 1 (bits 1784 * 31:24) even though these are only valid for CPU 1785 * models > 2, however guests may end up dividing or 1786 * multiplying by zero otherwise. 1787 */ 1788 case MSR_EBC_FREQUENCY_ID: 1789 data = 1 << 24; 1790 break; 1791 case MSR_IA32_APICBASE: 1792 data = kvm_get_apic_base(vcpu); 1793 break; 1794 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1795 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1796 break; 1797 case MSR_IA32_MISC_ENABLE: 1798 data = vcpu->arch.ia32_misc_enable_msr; 1799 break; 1800 case MSR_IA32_PERF_STATUS: 1801 /* TSC increment by tick */ 1802 data = 1000ULL; 1803 /* CPU multiplier */ 1804 data |= (((uint64_t)4ULL) << 40); 1805 break; 1806 case MSR_EFER: 1807 data = vcpu->arch.efer; 1808 break; 1809 case MSR_KVM_WALL_CLOCK: 1810 case MSR_KVM_WALL_CLOCK_NEW: 1811 data = vcpu->kvm->arch.wall_clock; 1812 break; 1813 case MSR_KVM_SYSTEM_TIME: 1814 case MSR_KVM_SYSTEM_TIME_NEW: 1815 data = vcpu->arch.time; 1816 break; 1817 case MSR_KVM_ASYNC_PF_EN: 1818 data = vcpu->arch.apf.msr_val; 1819 break; 1820 case MSR_IA32_P5_MC_ADDR: 1821 case MSR_IA32_P5_MC_TYPE: 1822 case MSR_IA32_MCG_CAP: 1823 case MSR_IA32_MCG_CTL: 1824 case MSR_IA32_MCG_STATUS: 1825 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1826 return get_msr_mce(vcpu, msr, pdata); 1827 case MSR_K7_CLK_CTL: 1828 /* 1829 * Provide expected ramp-up count for K7. All other 1830 * are set to zero, indicating minimum divisors for 1831 * every field. 1832 * 1833 * This prevents guest kernels on AMD host with CPU 1834 * type 6, model 8 and higher from exploding due to 1835 * the rdmsr failing. 1836 */ 1837 data = 0x20000000; 1838 break; 1839 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1840 if (kvm_hv_msr_partition_wide(msr)) { 1841 int r; 1842 mutex_lock(&vcpu->kvm->lock); 1843 r = get_msr_hyperv_pw(vcpu, msr, pdata); 1844 mutex_unlock(&vcpu->kvm->lock); 1845 return r; 1846 } else 1847 return get_msr_hyperv(vcpu, msr, pdata); 1848 break; 1849 default: 1850 if (!ignore_msrs) { 1851 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1852 return 1; 1853 } else { 1854 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 1855 data = 0; 1856 } 1857 break; 1858 } 1859 *pdata = data; 1860 return 0; 1861 } 1862 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1863 1864 /* 1865 * Read or write a bunch of msrs. All parameters are kernel addresses. 1866 * 1867 * @return number of msrs set successfully. 
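 *
 * Processing stops at the first MSR the callback rejects, so the return
 * value tells the caller how many of the requested MSRs were actually
 * transferred.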
1868 */ 1869 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 1870 struct kvm_msr_entry *entries, 1871 int (*do_msr)(struct kvm_vcpu *vcpu, 1872 unsigned index, u64 *data)) 1873 { 1874 int i, idx; 1875 1876 idx = srcu_read_lock(&vcpu->kvm->srcu); 1877 for (i = 0; i < msrs->nmsrs; ++i) 1878 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1879 break; 1880 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1881 1882 return i; 1883 } 1884 1885 /* 1886 * Read or write a bunch of msrs. Parameters are user addresses. 1887 * 1888 * @return number of msrs set successfully. 1889 */ 1890 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 1891 int (*do_msr)(struct kvm_vcpu *vcpu, 1892 unsigned index, u64 *data), 1893 int writeback) 1894 { 1895 struct kvm_msrs msrs; 1896 struct kvm_msr_entry *entries; 1897 int r, n; 1898 unsigned size; 1899 1900 r = -EFAULT; 1901 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 1902 goto out; 1903 1904 r = -E2BIG; 1905 if (msrs.nmsrs >= MAX_IO_MSRS) 1906 goto out; 1907 1908 r = -ENOMEM; 1909 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1910 entries = kmalloc(size, GFP_KERNEL); 1911 if (!entries) 1912 goto out; 1913 1914 r = -EFAULT; 1915 if (copy_from_user(entries, user_msrs->entries, size)) 1916 goto out_free; 1917 1918 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1919 if (r < 0) 1920 goto out_free; 1921 1922 r = -EFAULT; 1923 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1924 goto out_free; 1925 1926 r = n; 1927 1928 out_free: 1929 kfree(entries); 1930 out: 1931 return r; 1932 } 1933 1934 int kvm_dev_ioctl_check_extension(long ext) 1935 { 1936 int r; 1937 1938 switch (ext) { 1939 case KVM_CAP_IRQCHIP: 1940 case KVM_CAP_HLT: 1941 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1942 case KVM_CAP_SET_TSS_ADDR: 1943 case KVM_CAP_EXT_CPUID: 1944 case KVM_CAP_CLOCKSOURCE: 1945 case KVM_CAP_PIT: 1946 case KVM_CAP_NOP_IO_DELAY: 1947 case KVM_CAP_MP_STATE: 1948 case KVM_CAP_SYNC_MMU: 1949 case KVM_CAP_USER_NMI: 1950 case KVM_CAP_REINJECT_CONTROL: 1951 case KVM_CAP_IRQ_INJECT_STATUS: 1952 case KVM_CAP_ASSIGN_DEV_IRQ: 1953 case KVM_CAP_IRQFD: 1954 case KVM_CAP_IOEVENTFD: 1955 case KVM_CAP_PIT2: 1956 case KVM_CAP_PIT_STATE2: 1957 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1958 case KVM_CAP_XEN_HVM: 1959 case KVM_CAP_ADJUST_CLOCK: 1960 case KVM_CAP_VCPU_EVENTS: 1961 case KVM_CAP_HYPERV: 1962 case KVM_CAP_HYPERV_VAPIC: 1963 case KVM_CAP_HYPERV_SPIN: 1964 case KVM_CAP_PCI_SEGMENT: 1965 case KVM_CAP_DEBUGREGS: 1966 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1967 case KVM_CAP_XSAVE: 1968 case KVM_CAP_ASYNC_PF: 1969 r = 1; 1970 break; 1971 case KVM_CAP_COALESCED_MMIO: 1972 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1973 break; 1974 case KVM_CAP_VAPIC: 1975 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1976 break; 1977 case KVM_CAP_NR_VCPUS: 1978 r = KVM_MAX_VCPUS; 1979 break; 1980 case KVM_CAP_NR_MEMSLOTS: 1981 r = KVM_MEMORY_SLOTS; 1982 break; 1983 case KVM_CAP_PV_MMU: /* obsolete */ 1984 r = 0; 1985 break; 1986 case KVM_CAP_IOMMU: 1987 r = iommu_found(); 1988 break; 1989 case KVM_CAP_MCE: 1990 r = KVM_MAX_MCE_BANKS; 1991 break; 1992 case KVM_CAP_XCRS: 1993 r = cpu_has_xsave; 1994 break; 1995 default: 1996 r = 0; 1997 break; 1998 } 1999 return r; 2000 2001 } 2002 2003 long kvm_arch_dev_ioctl(struct file *filp, 2004 unsigned int ioctl, unsigned long arg) 2005 { 2006 void __user *argp = (void __user *)arg; 2007 long r; 2008 2009 switch (ioctl) { 2010 case KVM_GET_MSR_INDEX_LIST: { 2011 struct kvm_msr_list __user *user_msr_list = argp; 2012 struct 
kvm_msr_list msr_list; 2013 unsigned n; 2014 2015 r = -EFAULT; 2016 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 2017 goto out; 2018 n = msr_list.nmsrs; 2019 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 2020 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 2021 goto out; 2022 r = -E2BIG; 2023 if (n < msr_list.nmsrs) 2024 goto out; 2025 r = -EFAULT; 2026 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 2027 num_msrs_to_save * sizeof(u32))) 2028 goto out; 2029 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 2030 &emulated_msrs, 2031 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 2032 goto out; 2033 r = 0; 2034 break; 2035 } 2036 case KVM_GET_SUPPORTED_CPUID: { 2037 struct kvm_cpuid2 __user *cpuid_arg = argp; 2038 struct kvm_cpuid2 cpuid; 2039 2040 r = -EFAULT; 2041 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2042 goto out; 2043 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 2044 cpuid_arg->entries); 2045 if (r) 2046 goto out; 2047 2048 r = -EFAULT; 2049 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2050 goto out; 2051 r = 0; 2052 break; 2053 } 2054 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 2055 u64 mce_cap; 2056 2057 mce_cap = KVM_MCE_CAP_SUPPORTED; 2058 r = -EFAULT; 2059 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 2060 goto out; 2061 r = 0; 2062 break; 2063 } 2064 default: 2065 r = -EINVAL; 2066 } 2067 out: 2068 return r; 2069 } 2070 2071 static void wbinvd_ipi(void *garbage) 2072 { 2073 wbinvd(); 2074 } 2075 2076 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 2077 { 2078 return vcpu->kvm->arch.iommu_domain && 2079 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); 2080 } 2081 2082 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2083 { 2084 /* Address WBINVD may be executed by guest */ 2085 if (need_emulate_wbinvd(vcpu)) { 2086 if (kvm_x86_ops->has_wbinvd_exit()) 2087 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 2088 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 2089 smp_call_function_single(vcpu->cpu, 2090 wbinvd_ipi, NULL, 1); 2091 } 2092 2093 kvm_x86_ops->vcpu_load(vcpu, cpu); 2094 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2095 /* Make sure TSC doesn't go backwards */ 2096 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
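				/* first load of this vcpu: no previous host TSC to compare against */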
0 : 2097 native_read_tsc() - vcpu->arch.last_host_tsc; 2098 if (tsc_delta < 0) 2099 mark_tsc_unstable("KVM discovered backwards TSC"); 2100 if (check_tsc_unstable()) { 2101 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2102 vcpu->arch.tsc_catchup = 1; 2103 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2104 } 2105 if (vcpu->cpu != cpu) 2106 kvm_migrate_timers(vcpu); 2107 vcpu->cpu = cpu; 2108 } 2109 } 2110 2111 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2112 { 2113 kvm_x86_ops->vcpu_put(vcpu); 2114 kvm_put_guest_fpu(vcpu); 2115 vcpu->arch.last_host_tsc = native_read_tsc(); 2116 } 2117 2118 static int is_efer_nx(void) 2119 { 2120 unsigned long long efer = 0; 2121 2122 rdmsrl_safe(MSR_EFER, &efer); 2123 return efer & EFER_NX; 2124 } 2125 2126 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 2127 { 2128 int i; 2129 struct kvm_cpuid_entry2 *e, *entry; 2130 2131 entry = NULL; 2132 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2133 e = &vcpu->arch.cpuid_entries[i]; 2134 if (e->function == 0x80000001) { 2135 entry = e; 2136 break; 2137 } 2138 } 2139 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 2140 entry->edx &= ~(1 << 20); 2141 printk(KERN_INFO "kvm: guest NX capability removed\n"); 2142 } 2143 } 2144 2145 /* when an old userspace process fills a new kernel module */ 2146 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 2147 struct kvm_cpuid *cpuid, 2148 struct kvm_cpuid_entry __user *entries) 2149 { 2150 int r, i; 2151 struct kvm_cpuid_entry *cpuid_entries; 2152 2153 r = -E2BIG; 2154 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 2155 goto out; 2156 r = -ENOMEM; 2157 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 2158 if (!cpuid_entries) 2159 goto out; 2160 r = -EFAULT; 2161 if (copy_from_user(cpuid_entries, entries, 2162 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 2163 goto out_free; 2164 for (i = 0; i < cpuid->nent; i++) { 2165 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 2166 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 2167 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 2168 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 2169 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 2170 vcpu->arch.cpuid_entries[i].index = 0; 2171 vcpu->arch.cpuid_entries[i].flags = 0; 2172 vcpu->arch.cpuid_entries[i].padding[0] = 0; 2173 vcpu->arch.cpuid_entries[i].padding[1] = 0; 2174 vcpu->arch.cpuid_entries[i].padding[2] = 0; 2175 } 2176 vcpu->arch.cpuid_nent = cpuid->nent; 2177 cpuid_fix_nx_cap(vcpu); 2178 r = 0; 2179 kvm_apic_set_version(vcpu); 2180 kvm_x86_ops->cpuid_update(vcpu); 2181 update_cpuid(vcpu); 2182 2183 out_free: 2184 vfree(cpuid_entries); 2185 out: 2186 return r; 2187 } 2188 2189 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 2190 struct kvm_cpuid2 *cpuid, 2191 struct kvm_cpuid_entry2 __user *entries) 2192 { 2193 int r; 2194 2195 r = -E2BIG; 2196 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 2197 goto out; 2198 r = -EFAULT; 2199 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 2200 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 2201 goto out; 2202 vcpu->arch.cpuid_nent = cpuid->nent; 2203 kvm_apic_set_version(vcpu); 2204 kvm_x86_ops->cpuid_update(vcpu); 2205 update_cpuid(vcpu); 2206 return 0; 2207 2208 out: 2209 return r; 2210 } 2211 2212 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 2213 struct kvm_cpuid2 *cpuid, 2214 struct kvm_cpuid_entry2 __user *entries) 2215 { 2216 int r; 2217 2218 r = -E2BIG; 2219 if (cpuid->nent < vcpu->arch.cpuid_nent) 2220 goto out; 
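	/* the caller's buffer is large enough: copy the cached entries out */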
2221 r = -EFAULT; 2222 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 2223 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 2224 goto out; 2225 return 0; 2226 2227 out: 2228 cpuid->nent = vcpu->arch.cpuid_nent; 2229 return r; 2230 } 2231 2232 static void cpuid_mask(u32 *word, int wordnum) 2233 { 2234 *word &= boot_cpu_data.x86_capability[wordnum]; 2235 } 2236 2237 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2238 u32 index) 2239 { 2240 entry->function = function; 2241 entry->index = index; 2242 cpuid_count(entry->function, entry->index, 2243 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 2244 entry->flags = 0; 2245 } 2246 2247 #define F(x) bit(X86_FEATURE_##x) 2248 2249 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2250 u32 index, int *nent, int maxnent) 2251 { 2252 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 2253 #ifdef CONFIG_X86_64 2254 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) 2255 ? F(GBPAGES) : 0; 2256 unsigned f_lm = F(LM); 2257 #else 2258 unsigned f_gbpages = 0; 2259 unsigned f_lm = 0; 2260 #endif 2261 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; 2262 2263 /* cpuid 1.edx */ 2264 const u32 kvm_supported_word0_x86_features = 2265 F(FPU) | F(VME) | F(DE) | F(PSE) | 2266 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 2267 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 2268 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 2269 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 2270 0 /* Reserved, DS, ACPI */ | F(MMX) | 2271 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 2272 0 /* HTT, TM, Reserved, PBE */; 2273 /* cpuid 0x80000001.edx */ 2274 const u32 kvm_supported_word1_x86_features = 2275 F(FPU) | F(VME) | F(DE) | F(PSE) | 2276 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 2277 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 2278 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 2279 F(PAT) | F(PSE36) | 0 /* Reserved */ | 2280 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 2281 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 2282 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 2283 /* cpuid 1.ecx */ 2284 const u32 kvm_supported_word4_x86_features = 2285 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 2286 0 /* DS-CPL, VMX, SMX, EST */ | 2287 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 2288 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 2289 0 /* Reserved, DCA */ | F(XMM4_1) | 2290 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 2291 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 2292 F(F16C); 2293 /* cpuid 0x80000001.ecx */ 2294 const u32 kvm_supported_word6_x86_features = 2295 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | 2296 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 2297 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 2298 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); 2299 2300 /* all calls to cpuid_count() should be made on the same cpu */ 2301 get_cpu(); 2302 do_cpuid_1_ent(entry, function, index); 2303 ++*nent; 2304 2305 switch (function) { 2306 case 0: 2307 entry->eax = min(entry->eax, (u32)0xd); 2308 break; 2309 case 1: 2310 entry->edx &= kvm_supported_word0_x86_features; 2311 cpuid_mask(&entry->edx, 0); 2312 entry->ecx &= kvm_supported_word4_x86_features; 2313 cpuid_mask(&entry->ecx, 4); 2314 /* we support x2apic emulation even if host does not support 2315 * it since we emulate x2apic in software */ 2316 entry->ecx |= F(X2APIC); 2317 break; 2318 /* function 2 entries are STATEFUL. 
That is, repeated cpuid commands 2319 * may return different values. This forces us to get_cpu() before 2320 * issuing the first command, and also to emulate this annoying behavior 2321 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 2322 case 2: { 2323 int t, times = entry->eax & 0xff; 2324 2325 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 2326 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2327 for (t = 1; t < times && *nent < maxnent; ++t) { 2328 do_cpuid_1_ent(&entry[t], function, 0); 2329 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 2330 ++*nent; 2331 } 2332 break; 2333 } 2334 /* function 4 and 0xb have additional index. */ 2335 case 4: { 2336 int i, cache_type; 2337 2338 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2339 /* read more entries until cache_type is zero */ 2340 for (i = 1; *nent < maxnent; ++i) { 2341 cache_type = entry[i - 1].eax & 0x1f; 2342 if (!cache_type) 2343 break; 2344 do_cpuid_1_ent(&entry[i], function, i); 2345 entry[i].flags |= 2346 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2347 ++*nent; 2348 } 2349 break; 2350 } 2351 case 0xb: { 2352 int i, level_type; 2353 2354 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2355 /* read more entries until level_type is zero */ 2356 for (i = 1; *nent < maxnent; ++i) { 2357 level_type = entry[i - 1].ecx & 0xff00; 2358 if (!level_type) 2359 break; 2360 do_cpuid_1_ent(&entry[i], function, i); 2361 entry[i].flags |= 2362 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2363 ++*nent; 2364 } 2365 break; 2366 } 2367 case 0xd: { 2368 int i; 2369 2370 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2371 for (i = 1; *nent < maxnent; ++i) { 2372 if (entry[i - 1].eax == 0 && i != 2) 2373 break; 2374 do_cpuid_1_ent(&entry[i], function, i); 2375 entry[i].flags |= 2376 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2377 ++*nent; 2378 } 2379 break; 2380 } 2381 case KVM_CPUID_SIGNATURE: { 2382 char signature[12] = "KVMKVMKVM\0\0"; 2383 u32 *sigptr = (u32 *)signature; 2384 entry->eax = 0; 2385 entry->ebx = sigptr[0]; 2386 entry->ecx = sigptr[1]; 2387 entry->edx = sigptr[2]; 2388 break; 2389 } 2390 case KVM_CPUID_FEATURES: 2391 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 2392 (1 << KVM_FEATURE_NOP_IO_DELAY) | 2393 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2394 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2395 entry->ebx = 0; 2396 entry->ecx = 0; 2397 entry->edx = 0; 2398 break; 2399 case 0x80000000: 2400 entry->eax = min(entry->eax, 0x8000001a); 2401 break; 2402 case 0x80000001: 2403 entry->edx &= kvm_supported_word1_x86_features; 2404 cpuid_mask(&entry->edx, 1); 2405 entry->ecx &= kvm_supported_word6_x86_features; 2406 cpuid_mask(&entry->ecx, 6); 2407 break; 2408 } 2409 2410 kvm_x86_ops->set_supported_cpuid(function, entry); 2411 2412 put_cpu(); 2413 } 2414 2415 #undef F 2416 2417 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 2418 struct kvm_cpuid_entry2 __user *entries) 2419 { 2420 struct kvm_cpuid_entry2 *cpuid_entries; 2421 int limit, nent = 0, r = -E2BIG; 2422 u32 func; 2423 2424 if (cpuid->nent < 1) 2425 goto out; 2426 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 2427 cpuid->nent = KVM_MAX_CPUID_ENTRIES; 2428 r = -ENOMEM; 2429 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 2430 if (!cpuid_entries) 2431 goto out; 2432 2433 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 2434 limit = cpuid_entries[0].eax; 2435 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 2436 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2437 &nent, cpuid->nent); 2438 r = -E2BIG; 2439 if (nent >= cpuid->nent) 2440 goto 
out_free; 2441 2442 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 2443 limit = cpuid_entries[nent - 1].eax; 2444 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 2445 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2446 &nent, cpuid->nent); 2447 2448 2449 2450 r = -E2BIG; 2451 if (nent >= cpuid->nent) 2452 goto out_free; 2453 2454 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, 2455 cpuid->nent); 2456 2457 r = -E2BIG; 2458 if (nent >= cpuid->nent) 2459 goto out_free; 2460 2461 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, 2462 cpuid->nent); 2463 2464 r = -E2BIG; 2465 if (nent >= cpuid->nent) 2466 goto out_free; 2467 2468 r = -EFAULT; 2469 if (copy_to_user(entries, cpuid_entries, 2470 nent * sizeof(struct kvm_cpuid_entry2))) 2471 goto out_free; 2472 cpuid->nent = nent; 2473 r = 0; 2474 2475 out_free: 2476 vfree(cpuid_entries); 2477 out: 2478 return r; 2479 } 2480 2481 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2482 struct kvm_lapic_state *s) 2483 { 2484 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2485 2486 return 0; 2487 } 2488 2489 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2490 struct kvm_lapic_state *s) 2491 { 2492 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2493 kvm_apic_post_state_restore(vcpu); 2494 update_cr8_intercept(vcpu); 2495 2496 return 0; 2497 } 2498 2499 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2500 struct kvm_interrupt *irq) 2501 { 2502 if (irq->irq < 0 || irq->irq >= 256) 2503 return -EINVAL; 2504 if (irqchip_in_kernel(vcpu->kvm)) 2505 return -ENXIO; 2506 2507 kvm_queue_interrupt(vcpu, irq->irq, false); 2508 kvm_make_request(KVM_REQ_EVENT, vcpu); 2509 2510 return 0; 2511 } 2512 2513 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2514 { 2515 kvm_inject_nmi(vcpu); 2516 2517 return 0; 2518 } 2519 2520 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 2521 struct kvm_tpr_access_ctl *tac) 2522 { 2523 if (tac->flags) 2524 return -EINVAL; 2525 vcpu->arch.tpr_access_reporting = !!tac->enabled; 2526 return 0; 2527 } 2528 2529 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 2530 u64 mcg_cap) 2531 { 2532 int r; 2533 unsigned bank_num = mcg_cap & 0xff, bank; 2534 2535 r = -EINVAL; 2536 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2537 goto out; 2538 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 2539 goto out; 2540 r = 0; 2541 vcpu->arch.mcg_cap = mcg_cap; 2542 /* Init IA32_MCG_CTL to all 1s */ 2543 if (mcg_cap & MCG_CTL_P) 2544 vcpu->arch.mcg_ctl = ~(u64)0; 2545 /* Init IA32_MCi_CTL to all 1s */ 2546 for (bank = 0; bank < bank_num; bank++) 2547 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2548 out: 2549 return r; 2550 } 2551 2552 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 2553 struct kvm_x86_mce *mce) 2554 { 2555 u64 mcg_cap = vcpu->arch.mcg_cap; 2556 unsigned bank_num = mcg_cap & 0xff; 2557 u64 *banks = vcpu->arch.mce_banks; 2558 2559 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 2560 return -EINVAL; 2561 /* 2562 * if IA32_MCG_CTL is not all 1s, the uncorrected error 2563 * reporting is disabled 2564 */ 2565 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 2566 vcpu->arch.mcg_ctl != ~(u64)0) 2567 return 0; 2568 banks += 4 * mce->bank; 2569 /* 2570 * if IA32_MCi_CTL is not all 1s, the uncorrected error 2571 * reporting is disabled for the bank 2572 */ 2573 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 2574 return 0; 2575 if (mce->status & 
MCI_STATUS_UC) { 2576 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2577 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2578 printk(KERN_DEBUG "kvm: set_mce: " 2579 "injects mce exception while " 2580 "previous one is in progress!\n"); 2581 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2582 return 0; 2583 } 2584 if (banks[1] & MCI_STATUS_VAL) 2585 mce->status |= MCI_STATUS_OVER; 2586 banks[2] = mce->addr; 2587 banks[3] = mce->misc; 2588 vcpu->arch.mcg_status = mce->mcg_status; 2589 banks[1] = mce->status; 2590 kvm_queue_exception(vcpu, MC_VECTOR); 2591 } else if (!(banks[1] & MCI_STATUS_VAL) 2592 || !(banks[1] & MCI_STATUS_UC)) { 2593 if (banks[1] & MCI_STATUS_VAL) 2594 mce->status |= MCI_STATUS_OVER; 2595 banks[2] = mce->addr; 2596 banks[3] = mce->misc; 2597 banks[1] = mce->status; 2598 } else 2599 banks[1] |= MCI_STATUS_OVER; 2600 return 0; 2601 } 2602 2603 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2604 struct kvm_vcpu_events *events) 2605 { 2606 events->exception.injected = 2607 vcpu->arch.exception.pending && 2608 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2609 events->exception.nr = vcpu->arch.exception.nr; 2610 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2611 events->exception.pad = 0; 2612 events->exception.error_code = vcpu->arch.exception.error_code; 2613 2614 events->interrupt.injected = 2615 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; 2616 events->interrupt.nr = vcpu->arch.interrupt.nr; 2617 events->interrupt.soft = 0; 2618 events->interrupt.shadow = 2619 kvm_x86_ops->get_interrupt_shadow(vcpu, 2620 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); 2621 2622 events->nmi.injected = vcpu->arch.nmi_injected; 2623 events->nmi.pending = vcpu->arch.nmi_pending; 2624 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2625 events->nmi.pad = 0; 2626 2627 events->sipi_vector = vcpu->arch.sipi_vector; 2628 2629 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2630 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2631 | KVM_VCPUEVENT_VALID_SHADOW); 2632 memset(&events->reserved, 0, sizeof(events->reserved)); 2633 } 2634 2635 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2636 struct kvm_vcpu_events *events) 2637 { 2638 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2639 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2640 | KVM_VCPUEVENT_VALID_SHADOW)) 2641 return -EINVAL; 2642 2643 vcpu->arch.exception.pending = events->exception.injected; 2644 vcpu->arch.exception.nr = events->exception.nr; 2645 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2646 vcpu->arch.exception.error_code = events->exception.error_code; 2647 2648 vcpu->arch.interrupt.pending = events->interrupt.injected; 2649 vcpu->arch.interrupt.nr = events->interrupt.nr; 2650 vcpu->arch.interrupt.soft = events->interrupt.soft; 2651 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2652 kvm_pic_clear_isr_ack(vcpu->kvm); 2653 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2654 kvm_x86_ops->set_interrupt_shadow(vcpu, 2655 events->interrupt.shadow); 2656 2657 vcpu->arch.nmi_injected = events->nmi.injected; 2658 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2659 vcpu->arch.nmi_pending = events->nmi.pending; 2660 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 2661 2662 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2663 vcpu->arch.sipi_vector = events->sipi_vector; 2664 2665 kvm_make_request(KVM_REQ_EVENT, vcpu); 2666 2667 return 0; 2668 } 2669 2670 static void 
kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2671 struct kvm_debugregs *dbgregs) 2672 { 2673 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2674 dbgregs->dr6 = vcpu->arch.dr6; 2675 dbgregs->dr7 = vcpu->arch.dr7; 2676 dbgregs->flags = 0; 2677 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 2678 } 2679 2680 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2681 struct kvm_debugregs *dbgregs) 2682 { 2683 if (dbgregs->flags) 2684 return -EINVAL; 2685 2686 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2687 vcpu->arch.dr6 = dbgregs->dr6; 2688 vcpu->arch.dr7 = dbgregs->dr7; 2689 2690 return 0; 2691 } 2692 2693 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 2694 struct kvm_xsave *guest_xsave) 2695 { 2696 if (cpu_has_xsave) 2697 memcpy(guest_xsave->region, 2698 &vcpu->arch.guest_fpu.state->xsave, 2699 xstate_size); 2700 else { 2701 memcpy(guest_xsave->region, 2702 &vcpu->arch.guest_fpu.state->fxsave, 2703 sizeof(struct i387_fxsave_struct)); 2704 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 2705 XSTATE_FPSSE; 2706 } 2707 } 2708 2709 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 2710 struct kvm_xsave *guest_xsave) 2711 { 2712 u64 xstate_bv = 2713 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 2714 2715 if (cpu_has_xsave) 2716 memcpy(&vcpu->arch.guest_fpu.state->xsave, 2717 guest_xsave->region, xstate_size); 2718 else { 2719 if (xstate_bv & ~XSTATE_FPSSE) 2720 return -EINVAL; 2721 memcpy(&vcpu->arch.guest_fpu.state->fxsave, 2722 guest_xsave->region, sizeof(struct i387_fxsave_struct)); 2723 } 2724 return 0; 2725 } 2726 2727 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 2728 struct kvm_xcrs *guest_xcrs) 2729 { 2730 if (!cpu_has_xsave) { 2731 guest_xcrs->nr_xcrs = 0; 2732 return; 2733 } 2734 2735 guest_xcrs->nr_xcrs = 1; 2736 guest_xcrs->flags = 0; 2737 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 2738 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 2739 } 2740 2741 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 2742 struct kvm_xcrs *guest_xcrs) 2743 { 2744 int i, r = 0; 2745 2746 if (!cpu_has_xsave) 2747 return -EINVAL; 2748 2749 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 2750 return -EINVAL; 2751 2752 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 2753 /* Only support XCR0 currently */ 2754 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { 2755 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 2756 guest_xcrs->xcrs[0].value); 2757 break; 2758 } 2759 if (r) 2760 r = -EINVAL; 2761 return r; 2762 } 2763 2764 long kvm_arch_vcpu_ioctl(struct file *filp, 2765 unsigned int ioctl, unsigned long arg) 2766 { 2767 struct kvm_vcpu *vcpu = filp->private_data; 2768 void __user *argp = (void __user *)arg; 2769 int r; 2770 union { 2771 struct kvm_lapic_state *lapic; 2772 struct kvm_xsave *xsave; 2773 struct kvm_xcrs *xcrs; 2774 void *buffer; 2775 } u; 2776 2777 u.buffer = NULL; 2778 switch (ioctl) { 2779 case KVM_GET_LAPIC: { 2780 r = -EINVAL; 2781 if (!vcpu->arch.apic) 2782 goto out; 2783 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2784 2785 r = -ENOMEM; 2786 if (!u.lapic) 2787 goto out; 2788 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 2789 if (r) 2790 goto out; 2791 r = -EFAULT; 2792 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 2793 goto out; 2794 r = 0; 2795 break; 2796 } 2797 case KVM_SET_LAPIC: { 2798 r = -EINVAL; 2799 if (!vcpu->arch.apic) 2800 goto out; 2801 u.lapic = 
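			/* no kzalloc needed: the buffer is fully overwritten by copy_from_user() below */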
kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2802 r = -ENOMEM; 2803 if (!u.lapic) 2804 goto out; 2805 r = -EFAULT; 2806 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) 2807 goto out; 2808 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 2809 if (r) 2810 goto out; 2811 r = 0; 2812 break; 2813 } 2814 case KVM_INTERRUPT: { 2815 struct kvm_interrupt irq; 2816 2817 r = -EFAULT; 2818 if (copy_from_user(&irq, argp, sizeof irq)) 2819 goto out; 2820 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 2821 if (r) 2822 goto out; 2823 r = 0; 2824 break; 2825 } 2826 case KVM_NMI: { 2827 r = kvm_vcpu_ioctl_nmi(vcpu); 2828 if (r) 2829 goto out; 2830 r = 0; 2831 break; 2832 } 2833 case KVM_SET_CPUID: { 2834 struct kvm_cpuid __user *cpuid_arg = argp; 2835 struct kvm_cpuid cpuid; 2836 2837 r = -EFAULT; 2838 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2839 goto out; 2840 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 2841 if (r) 2842 goto out; 2843 break; 2844 } 2845 case KVM_SET_CPUID2: { 2846 struct kvm_cpuid2 __user *cpuid_arg = argp; 2847 struct kvm_cpuid2 cpuid; 2848 2849 r = -EFAULT; 2850 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2851 goto out; 2852 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 2853 cpuid_arg->entries); 2854 if (r) 2855 goto out; 2856 break; 2857 } 2858 case KVM_GET_CPUID2: { 2859 struct kvm_cpuid2 __user *cpuid_arg = argp; 2860 struct kvm_cpuid2 cpuid; 2861 2862 r = -EFAULT; 2863 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2864 goto out; 2865 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 2866 cpuid_arg->entries); 2867 if (r) 2868 goto out; 2869 r = -EFAULT; 2870 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2871 goto out; 2872 r = 0; 2873 break; 2874 } 2875 case KVM_GET_MSRS: 2876 r = msr_io(vcpu, argp, kvm_get_msr, 1); 2877 break; 2878 case KVM_SET_MSRS: 2879 r = msr_io(vcpu, argp, do_set_msr, 0); 2880 break; 2881 case KVM_TPR_ACCESS_REPORTING: { 2882 struct kvm_tpr_access_ctl tac; 2883 2884 r = -EFAULT; 2885 if (copy_from_user(&tac, argp, sizeof tac)) 2886 goto out; 2887 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 2888 if (r) 2889 goto out; 2890 r = -EFAULT; 2891 if (copy_to_user(argp, &tac, sizeof tac)) 2892 goto out; 2893 r = 0; 2894 break; 2895 }; 2896 case KVM_SET_VAPIC_ADDR: { 2897 struct kvm_vapic_addr va; 2898 2899 r = -EINVAL; 2900 if (!irqchip_in_kernel(vcpu->kvm)) 2901 goto out; 2902 r = -EFAULT; 2903 if (copy_from_user(&va, argp, sizeof va)) 2904 goto out; 2905 r = 0; 2906 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 2907 break; 2908 } 2909 case KVM_X86_SETUP_MCE: { 2910 u64 mcg_cap; 2911 2912 r = -EFAULT; 2913 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 2914 goto out; 2915 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 2916 break; 2917 } 2918 case KVM_X86_SET_MCE: { 2919 struct kvm_x86_mce mce; 2920 2921 r = -EFAULT; 2922 if (copy_from_user(&mce, argp, sizeof mce)) 2923 goto out; 2924 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2925 break; 2926 } 2927 case KVM_GET_VCPU_EVENTS: { 2928 struct kvm_vcpu_events events; 2929 2930 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 2931 2932 r = -EFAULT; 2933 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 2934 break; 2935 r = 0; 2936 break; 2937 } 2938 case KVM_SET_VCPU_EVENTS: { 2939 struct kvm_vcpu_events events; 2940 2941 r = -EFAULT; 2942 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 2943 break; 2944 2945 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2946 break; 2947 } 2948 case KVM_GET_DEBUGREGS: { 
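		/* snapshot the guest debug registers (db[0-3], dr6, dr7) for userspace */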
2949 struct kvm_debugregs dbgregs; 2950 2951 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); 2952 2953 r = -EFAULT; 2954 if (copy_to_user(argp, &dbgregs, 2955 sizeof(struct kvm_debugregs))) 2956 break; 2957 r = 0; 2958 break; 2959 } 2960 case KVM_SET_DEBUGREGS: { 2961 struct kvm_debugregs dbgregs; 2962 2963 r = -EFAULT; 2964 if (copy_from_user(&dbgregs, argp, 2965 sizeof(struct kvm_debugregs))) 2966 break; 2967 2968 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2969 break; 2970 } 2971 case KVM_GET_XSAVE: { 2972 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2973 r = -ENOMEM; 2974 if (!u.xsave) 2975 break; 2976 2977 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 2978 2979 r = -EFAULT; 2980 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 2981 break; 2982 r = 0; 2983 break; 2984 } 2985 case KVM_SET_XSAVE: { 2986 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2987 r = -ENOMEM; 2988 if (!u.xsave) 2989 break; 2990 2991 r = -EFAULT; 2992 if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) 2993 break; 2994 2995 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 2996 break; 2997 } 2998 case KVM_GET_XCRS: { 2999 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 3000 r = -ENOMEM; 3001 if (!u.xcrs) 3002 break; 3003 3004 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 3005 3006 r = -EFAULT; 3007 if (copy_to_user(argp, u.xcrs, 3008 sizeof(struct kvm_xcrs))) 3009 break; 3010 r = 0; 3011 break; 3012 } 3013 case KVM_SET_XCRS: { 3014 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 3015 r = -ENOMEM; 3016 if (!u.xcrs) 3017 break; 3018 3019 r = -EFAULT; 3020 if (copy_from_user(u.xcrs, argp, 3021 sizeof(struct kvm_xcrs))) 3022 break; 3023 3024 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3025 break; 3026 } 3027 default: 3028 r = -EINVAL; 3029 } 3030 out: 3031 kfree(u.buffer); 3032 return r; 3033 } 3034 3035 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 3036 { 3037 int ret; 3038 3039 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3040 return -1; 3041 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3042 return ret; 3043 } 3044 3045 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 3046 u64 ident_addr) 3047 { 3048 kvm->arch.ept_identity_map_addr = ident_addr; 3049 return 0; 3050 } 3051 3052 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 3053 u32 kvm_nr_mmu_pages) 3054 { 3055 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 3056 return -EINVAL; 3057 3058 mutex_lock(&kvm->slots_lock); 3059 spin_lock(&kvm->mmu_lock); 3060 3061 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 3062 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 3063 3064 spin_unlock(&kvm->mmu_lock); 3065 mutex_unlock(&kvm->slots_lock); 3066 return 0; 3067 } 3068 3069 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3070 { 3071 return kvm->arch.n_max_mmu_pages; 3072 } 3073 3074 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3075 { 3076 int r; 3077 3078 r = 0; 3079 switch (chip->chip_id) { 3080 case KVM_IRQCHIP_PIC_MASTER: 3081 memcpy(&chip->chip.pic, 3082 &pic_irqchip(kvm)->pics[0], 3083 sizeof(struct kvm_pic_state)); 3084 break; 3085 case KVM_IRQCHIP_PIC_SLAVE: 3086 memcpy(&chip->chip.pic, 3087 &pic_irqchip(kvm)->pics[1], 3088 sizeof(struct kvm_pic_state)); 3089 break; 3090 case KVM_IRQCHIP_IOAPIC: 3091 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 3092 break; 3093 default: 3094 r = -EINVAL; 3095 break; 3096 } 3097 return r; 3098 } 3099 3100 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip 
*chip) 3101 { 3102 int r; 3103 3104 r = 0; 3105 switch (chip->chip_id) { 3106 case KVM_IRQCHIP_PIC_MASTER: 3107 spin_lock(&pic_irqchip(kvm)->lock); 3108 memcpy(&pic_irqchip(kvm)->pics[0], 3109 &chip->chip.pic, 3110 sizeof(struct kvm_pic_state)); 3111 spin_unlock(&pic_irqchip(kvm)->lock); 3112 break; 3113 case KVM_IRQCHIP_PIC_SLAVE: 3114 spin_lock(&pic_irqchip(kvm)->lock); 3115 memcpy(&pic_irqchip(kvm)->pics[1], 3116 &chip->chip.pic, 3117 sizeof(struct kvm_pic_state)); 3118 spin_unlock(&pic_irqchip(kvm)->lock); 3119 break; 3120 case KVM_IRQCHIP_IOAPIC: 3121 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3122 break; 3123 default: 3124 r = -EINVAL; 3125 break; 3126 } 3127 kvm_pic_update_irq(pic_irqchip(kvm)); 3128 return r; 3129 } 3130 3131 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3132 { 3133 int r = 0; 3134 3135 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3136 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 3137 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3138 return r; 3139 } 3140 3141 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 3142 { 3143 int r = 0; 3144 3145 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3146 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 3147 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 3148 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3149 return r; 3150 } 3151 3152 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3153 { 3154 int r = 0; 3155 3156 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3157 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 3158 sizeof(ps->channels)); 3159 ps->flags = kvm->arch.vpit->pit_state.flags; 3160 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3161 memset(&ps->reserved, 0, sizeof(ps->reserved)); 3162 return r; 3163 } 3164 3165 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 3166 { 3167 int r = 0, start = 0; 3168 u32 prev_legacy, cur_legacy; 3169 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3170 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 3171 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 3172 if (!prev_legacy && cur_legacy) 3173 start = 1; 3174 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 3175 sizeof(kvm->arch.vpit->pit_state.channels)); 3176 kvm->arch.vpit->pit_state.flags = ps->flags; 3177 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 3178 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3179 return r; 3180 } 3181 3182 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 3183 struct kvm_reinject_control *control) 3184 { 3185 if (!kvm->arch.vpit) 3186 return -ENXIO; 3187 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3188 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 3189 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3190 return 0; 3191 } 3192 3193 /* 3194 * Get (and clear) the dirty memory log for a memory slot. 
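 *
 * The slot's dirty bitmap is double buffered: when dirty bits are found,
 * the slot is switched to the clean half under SRCU, the previously
 * active half is copied out to userspace, and write access to the slot
 * is removed so that subsequent guest writes are logged again. If
 * nothing is dirty, the user-supplied bitmap is simply cleared.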
3195 */ 3196 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 3197 struct kvm_dirty_log *log) 3198 { 3199 int r, i; 3200 struct kvm_memory_slot *memslot; 3201 unsigned long n; 3202 unsigned long is_dirty = 0; 3203 3204 mutex_lock(&kvm->slots_lock); 3205 3206 r = -EINVAL; 3207 if (log->slot >= KVM_MEMORY_SLOTS) 3208 goto out; 3209 3210 memslot = &kvm->memslots->memslots[log->slot]; 3211 r = -ENOENT; 3212 if (!memslot->dirty_bitmap) 3213 goto out; 3214 3215 n = kvm_dirty_bitmap_bytes(memslot); 3216 3217 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 3218 is_dirty = memslot->dirty_bitmap[i]; 3219 3220 /* If nothing is dirty, don't bother messing with page tables. */ 3221 if (is_dirty) { 3222 struct kvm_memslots *slots, *old_slots; 3223 unsigned long *dirty_bitmap; 3224 3225 dirty_bitmap = memslot->dirty_bitmap_head; 3226 if (memslot->dirty_bitmap == dirty_bitmap) 3227 dirty_bitmap += n / sizeof(long); 3228 memset(dirty_bitmap, 0, n); 3229 3230 r = -ENOMEM; 3231 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3232 if (!slots) 3233 goto out; 3234 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3235 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3236 slots->generation++; 3237 3238 old_slots = kvm->memslots; 3239 rcu_assign_pointer(kvm->memslots, slots); 3240 synchronize_srcu_expedited(&kvm->srcu); 3241 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 3242 kfree(old_slots); 3243 3244 spin_lock(&kvm->mmu_lock); 3245 kvm_mmu_slot_remove_write_access(kvm, log->slot); 3246 spin_unlock(&kvm->mmu_lock); 3247 3248 r = -EFAULT; 3249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) 3250 goto out; 3251 } else { 3252 r = -EFAULT; 3253 if (clear_user(log->dirty_bitmap, n)) 3254 goto out; 3255 } 3256 3257 r = 0; 3258 out: 3259 mutex_unlock(&kvm->slots_lock); 3260 return r; 3261 } 3262 3263 long kvm_arch_vm_ioctl(struct file *filp, 3264 unsigned int ioctl, unsigned long arg) 3265 { 3266 struct kvm *kvm = filp->private_data; 3267 void __user *argp = (void __user *)arg; 3268 int r = -ENOTTY; 3269 /* 3270 * This union makes it completely explicit to gcc-3.x 3271 * that these two variables' stack usage should be 3272 * combined, not added together. 
3273 */ 3274 union { 3275 struct kvm_pit_state ps; 3276 struct kvm_pit_state2 ps2; 3277 struct kvm_pit_config pit_config; 3278 } u; 3279 3280 switch (ioctl) { 3281 case KVM_SET_TSS_ADDR: 3282 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3283 if (r < 0) 3284 goto out; 3285 break; 3286 case KVM_SET_IDENTITY_MAP_ADDR: { 3287 u64 ident_addr; 3288 3289 r = -EFAULT; 3290 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3291 goto out; 3292 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3293 if (r < 0) 3294 goto out; 3295 break; 3296 } 3297 case KVM_SET_NR_MMU_PAGES: 3298 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3299 if (r) 3300 goto out; 3301 break; 3302 case KVM_GET_NR_MMU_PAGES: 3303 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 3304 break; 3305 case KVM_CREATE_IRQCHIP: { 3306 struct kvm_pic *vpic; 3307 3308 mutex_lock(&kvm->lock); 3309 r = -EEXIST; 3310 if (kvm->arch.vpic) 3311 goto create_irqchip_unlock; 3312 r = -ENOMEM; 3313 vpic = kvm_create_pic(kvm); 3314 if (vpic) { 3315 r = kvm_ioapic_init(kvm); 3316 if (r) { 3317 mutex_lock(&kvm->slots_lock); 3318 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3319 &vpic->dev); 3320 mutex_unlock(&kvm->slots_lock); 3321 kfree(vpic); 3322 goto create_irqchip_unlock; 3323 } 3324 } else 3325 goto create_irqchip_unlock; 3326 smp_wmb(); 3327 kvm->arch.vpic = vpic; 3328 smp_wmb(); 3329 r = kvm_setup_default_irq_routing(kvm); 3330 if (r) { 3331 mutex_lock(&kvm->slots_lock); 3332 mutex_lock(&kvm->irq_lock); 3333 kvm_ioapic_destroy(kvm); 3334 kvm_destroy_pic(kvm); 3335 mutex_unlock(&kvm->irq_lock); 3336 mutex_unlock(&kvm->slots_lock); 3337 } 3338 create_irqchip_unlock: 3339 mutex_unlock(&kvm->lock); 3340 break; 3341 } 3342 case KVM_CREATE_PIT: 3343 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 3344 goto create_pit; 3345 case KVM_CREATE_PIT2: 3346 r = -EFAULT; 3347 if (copy_from_user(&u.pit_config, argp, 3348 sizeof(struct kvm_pit_config))) 3349 goto out; 3350 create_pit: 3351 mutex_lock(&kvm->slots_lock); 3352 r = -EEXIST; 3353 if (kvm->arch.vpit) 3354 goto create_pit_unlock; 3355 r = -ENOMEM; 3356 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 3357 if (kvm->arch.vpit) 3358 r = 0; 3359 create_pit_unlock: 3360 mutex_unlock(&kvm->slots_lock); 3361 break; 3362 case KVM_IRQ_LINE_STATUS: 3363 case KVM_IRQ_LINE: { 3364 struct kvm_irq_level irq_event; 3365 3366 r = -EFAULT; 3367 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 3368 goto out; 3369 r = -ENXIO; 3370 if (irqchip_in_kernel(kvm)) { 3371 __s32 status; 3372 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 3373 irq_event.irq, irq_event.level); 3374 if (ioctl == KVM_IRQ_LINE_STATUS) { 3375 r = -EFAULT; 3376 irq_event.status = status; 3377 if (copy_to_user(argp, &irq_event, 3378 sizeof irq_event)) 3379 goto out; 3380 } 3381 r = 0; 3382 } 3383 break; 3384 } 3385 case KVM_GET_IRQCHIP: { 3386 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3387 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 3388 3389 r = -ENOMEM; 3390 if (!chip) 3391 goto out; 3392 r = -EFAULT; 3393 if (copy_from_user(chip, argp, sizeof *chip)) 3394 goto get_irqchip_out; 3395 r = -ENXIO; 3396 if (!irqchip_in_kernel(kvm)) 3397 goto get_irqchip_out; 3398 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 3399 if (r) 3400 goto get_irqchip_out; 3401 r = -EFAULT; 3402 if (copy_to_user(argp, chip, sizeof *chip)) 3403 goto get_irqchip_out; 3404 r = 0; 3405 get_irqchip_out: 3406 kfree(chip); 3407 if (r) 3408 goto out; 3409 break; 3410 } 3411 case KVM_SET_IRQCHIP: { 3412 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3413 
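		/* copy the userspace-provided PIC/IOAPIC state into the in-kernel irqchip */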
struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 3414 3415 r = -ENOMEM; 3416 if (!chip) 3417 goto out; 3418 r = -EFAULT; 3419 if (copy_from_user(chip, argp, sizeof *chip)) 3420 goto set_irqchip_out; 3421 r = -ENXIO; 3422 if (!irqchip_in_kernel(kvm)) 3423 goto set_irqchip_out; 3424 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 3425 if (r) 3426 goto set_irqchip_out; 3427 r = 0; 3428 set_irqchip_out: 3429 kfree(chip); 3430 if (r) 3431 goto out; 3432 break; 3433 } 3434 case KVM_GET_PIT: { 3435 r = -EFAULT; 3436 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 3437 goto out; 3438 r = -ENXIO; 3439 if (!kvm->arch.vpit) 3440 goto out; 3441 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 3442 if (r) 3443 goto out; 3444 r = -EFAULT; 3445 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 3446 goto out; 3447 r = 0; 3448 break; 3449 } 3450 case KVM_SET_PIT: { 3451 r = -EFAULT; 3452 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 3453 goto out; 3454 r = -ENXIO; 3455 if (!kvm->arch.vpit) 3456 goto out; 3457 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3458 if (r) 3459 goto out; 3460 r = 0; 3461 break; 3462 } 3463 case KVM_GET_PIT2: { 3464 r = -ENXIO; 3465 if (!kvm->arch.vpit) 3466 goto out; 3467 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 3468 if (r) 3469 goto out; 3470 r = -EFAULT; 3471 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 3472 goto out; 3473 r = 0; 3474 break; 3475 } 3476 case KVM_SET_PIT2: { 3477 r = -EFAULT; 3478 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 3479 goto out; 3480 r = -ENXIO; 3481 if (!kvm->arch.vpit) 3482 goto out; 3483 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3484 if (r) 3485 goto out; 3486 r = 0; 3487 break; 3488 } 3489 case KVM_REINJECT_CONTROL: { 3490 struct kvm_reinject_control control; 3491 r = -EFAULT; 3492 if (copy_from_user(&control, argp, sizeof(control))) 3493 goto out; 3494 r = kvm_vm_ioctl_reinject(kvm, &control); 3495 if (r) 3496 goto out; 3497 r = 0; 3498 break; 3499 } 3500 case KVM_XEN_HVM_CONFIG: { 3501 r = -EFAULT; 3502 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 3503 sizeof(struct kvm_xen_hvm_config))) 3504 goto out; 3505 r = -EINVAL; 3506 if (kvm->arch.xen_hvm_config.flags) 3507 goto out; 3508 r = 0; 3509 break; 3510 } 3511 case KVM_SET_CLOCK: { 3512 struct kvm_clock_data user_ns; 3513 u64 now_ns; 3514 s64 delta; 3515 3516 r = -EFAULT; 3517 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 3518 goto out; 3519 3520 r = -EINVAL; 3521 if (user_ns.flags) 3522 goto out; 3523 3524 r = 0; 3525 local_irq_disable(); 3526 now_ns = get_kernel_ns(); 3527 delta = user_ns.clock - now_ns; 3528 local_irq_enable(); 3529 kvm->arch.kvmclock_offset = delta; 3530 break; 3531 } 3532 case KVM_GET_CLOCK: { 3533 struct kvm_clock_data user_ns; 3534 u64 now_ns; 3535 3536 local_irq_disable(); 3537 now_ns = get_kernel_ns(); 3538 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3539 local_irq_enable(); 3540 user_ns.flags = 0; 3541 memset(&user_ns.pad, 0, sizeof(user_ns.pad)); 3542 3543 r = -EFAULT; 3544 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 3545 goto out; 3546 r = 0; 3547 break; 3548 } 3549 3550 default: 3551 ; 3552 } 3553 out: 3554 return r; 3555 } 3556 3557 static void kvm_init_msr_list(void) 3558 { 3559 u32 dummy[2]; 3560 unsigned i, j; 3561 3562 /* skip the first msrs in the list. 
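 * Each remaining MSR is probed with rdmsr_safe() and dropped if the host
 * does not implement it, compacting the list in place; the entries
 * skipped at the start are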
KVM-specific */ 3563 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 3564 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 3565 continue; 3566 if (j < i) 3567 msrs_to_save[j] = msrs_to_save[i]; 3568 j++; 3569 } 3570 num_msrs_to_save = j; 3571 } 3572 3573 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3574 const void *v) 3575 { 3576 if (vcpu->arch.apic && 3577 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3578 return 0; 3579 3580 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3581 } 3582 3583 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3584 { 3585 if (vcpu->arch.apic && 3586 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3587 return 0; 3588 3589 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3590 } 3591 3592 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3593 struct kvm_segment *var, int seg) 3594 { 3595 kvm_x86_ops->set_segment(vcpu, var, seg); 3596 } 3597 3598 void kvm_get_segment(struct kvm_vcpu *vcpu, 3599 struct kvm_segment *var, int seg) 3600 { 3601 kvm_x86_ops->get_segment(vcpu, var, seg); 3602 } 3603 3604 static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3605 { 3606 return gpa; 3607 } 3608 3609 static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3610 { 3611 gpa_t t_gpa; 3612 struct x86_exception exception; 3613 3614 BUG_ON(!mmu_is_nested(vcpu)); 3615 3616 /* NPT walks are always user-walks */ 3617 access |= PFERR_USER_MASK; 3618 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); 3619 3620 return t_gpa; 3621 } 3622 3623 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, 3624 struct x86_exception *exception) 3625 { 3626 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3627 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3628 } 3629 3630 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, 3631 struct x86_exception *exception) 3632 { 3633 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3634 access |= PFERR_FETCH_MASK; 3635 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3636 } 3637 3638 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, 3639 struct x86_exception *exception) 3640 { 3641 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
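					/* CPL 3: treat the walk as a user-mode access */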
						       PFERR_USER_MASK : 0;
	access |= PFERR_WRITE_MASK;
	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}

/* used to access any mapped guest memory without performing CPL checks */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
				struct x86_exception *exception)
{
	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}

/* read guest-virtual memory, splitting the access at page boundaries */
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
				      struct kvm_vcpu *vcpu, u32 access,
				      struct x86_exception *exception)
{
	void *data = val;
	int r = X86EMUL_CONTINUE;

	while (bytes) {
		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
							    exception);
		unsigned offset = addr & (PAGE_SIZE-1);
		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
		int ret;

		if (gpa == UNMAPPED_GVA)
			return X86EMUL_PROPAGATE_FAULT;
		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
		if (ret < 0) {
			r = X86EMUL_IO_NEEDED;
			goto out;
		}

		bytes -= toread;
		data += toread;
		addr += toread;
	}
out:
	return r;
}

/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
				struct kvm_vcpu *vcpu,
				struct x86_exception *exception)
{
	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
					  access | PFERR_FETCH_MASK,
					  exception);
}

static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
			       struct kvm_vcpu *vcpu,
			       struct x86_exception *exception)
{
	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ?
PFERR_USER_MASK : 0; 3699 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3700 exception); 3701 } 3702 3703 static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3704 struct kvm_vcpu *vcpu, 3705 struct x86_exception *exception) 3706 { 3707 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3708 } 3709 3710 static int kvm_write_guest_virt_system(gva_t addr, void *val, 3711 unsigned int bytes, 3712 struct kvm_vcpu *vcpu, 3713 struct x86_exception *exception) 3714 { 3715 void *data = val; 3716 int r = X86EMUL_CONTINUE; 3717 3718 while (bytes) { 3719 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3720 PFERR_WRITE_MASK, 3721 exception); 3722 unsigned offset = addr & (PAGE_SIZE-1); 3723 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3724 int ret; 3725 3726 if (gpa == UNMAPPED_GVA) 3727 return X86EMUL_PROPAGATE_FAULT; 3728 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3729 if (ret < 0) { 3730 r = X86EMUL_IO_NEEDED; 3731 goto out; 3732 } 3733 3734 bytes -= towrite; 3735 data += towrite; 3736 addr += towrite; 3737 } 3738 out: 3739 return r; 3740 } 3741 3742 static int emulator_read_emulated(unsigned long addr, 3743 void *val, 3744 unsigned int bytes, 3745 struct x86_exception *exception, 3746 struct kvm_vcpu *vcpu) 3747 { 3748 gpa_t gpa; 3749 3750 if (vcpu->mmio_read_completed) { 3751 memcpy(val, vcpu->mmio_data, bytes); 3752 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 3753 vcpu->mmio_phys_addr, *(u64 *)val); 3754 vcpu->mmio_read_completed = 0; 3755 return X86EMUL_CONTINUE; 3756 } 3757 3758 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); 3759 3760 if (gpa == UNMAPPED_GVA) 3761 return X86EMUL_PROPAGATE_FAULT; 3762 3763 /* For APIC access vmexit */ 3764 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3765 goto mmio; 3766 3767 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) 3768 == X86EMUL_CONTINUE) 3769 return X86EMUL_CONTINUE; 3770 3771 mmio: 3772 /* 3773 * Is this MMIO handled locally? 3774 */ 3775 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 3776 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 3777 return X86EMUL_CONTINUE; 3778 } 3779 3780 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3781 3782 vcpu->mmio_needed = 1; 3783 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3784 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3785 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3786 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3787 3788 return X86EMUL_IO_NEEDED; 3789 } 3790 3791 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3792 const void *val, int bytes) 3793 { 3794 int ret; 3795 3796 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 3797 if (ret < 0) 3798 return 0; 3799 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 3800 return 1; 3801 } 3802 3803 static int emulator_write_emulated_onepage(unsigned long addr, 3804 const void *val, 3805 unsigned int bytes, 3806 struct x86_exception *exception, 3807 struct kvm_vcpu *vcpu) 3808 { 3809 gpa_t gpa; 3810 3811 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); 3812 3813 if (gpa == UNMAPPED_GVA) 3814 return X86EMUL_PROPAGATE_FAULT; 3815 3816 /* For APIC access vmexit */ 3817 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3818 goto mmio; 3819 3820 if (emulator_write_phys(vcpu, gpa, val, bytes)) 3821 return X86EMUL_CONTINUE; 3822 3823 mmio: 3824 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 3825 /* 3826 * Is this MMIO handled locally? 
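	 * "Locally" means an in-kernel device on the MMIO bus (such as the
	 * emulated lapic) claims the access; if none does, it is handed to
	 * userspace via a KVM_EXIT_MMIO exit.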
3827 */ 3828 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 3829 return X86EMUL_CONTINUE; 3830 3831 vcpu->mmio_needed = 1; 3832 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3833 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3834 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3835 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3836 memcpy(vcpu->run->mmio.data, val, bytes); 3837 3838 return X86EMUL_CONTINUE; 3839 } 3840 3841 int emulator_write_emulated(unsigned long addr, 3842 const void *val, 3843 unsigned int bytes, 3844 struct x86_exception *exception, 3845 struct kvm_vcpu *vcpu) 3846 { 3847 /* Crossing a page boundary? */ 3848 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3849 int rc, now; 3850 3851 now = -addr & ~PAGE_MASK; 3852 rc = emulator_write_emulated_onepage(addr, val, now, exception, 3853 vcpu); 3854 if (rc != X86EMUL_CONTINUE) 3855 return rc; 3856 addr += now; 3857 val += now; 3858 bytes -= now; 3859 } 3860 return emulator_write_emulated_onepage(addr, val, bytes, exception, 3861 vcpu); 3862 } 3863 3864 #define CMPXCHG_TYPE(t, ptr, old, new) \ 3865 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 3866 3867 #ifdef CONFIG_X86_64 3868 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) 3869 #else 3870 # define CMPXCHG64(ptr, old, new) \ 3871 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 3872 #endif 3873 3874 static int emulator_cmpxchg_emulated(unsigned long addr, 3875 const void *old, 3876 const void *new, 3877 unsigned int bytes, 3878 struct x86_exception *exception, 3879 struct kvm_vcpu *vcpu) 3880 { 3881 gpa_t gpa; 3882 struct page *page; 3883 char *kaddr; 3884 bool exchanged; 3885 3886 /* guests cmpxchg8b have to be emulated atomically */ 3887 if (bytes > 8 || (bytes & (bytes - 1))) 3888 goto emul_write; 3889 3890 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3891 3892 if (gpa == UNMAPPED_GVA || 3893 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3894 goto emul_write; 3895 3896 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3897 goto emul_write; 3898 3899 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3900 if (is_error_page(page)) { 3901 kvm_release_page_clean(page); 3902 goto emul_write; 3903 } 3904 3905 kaddr = kmap_atomic(page, KM_USER0); 3906 kaddr += offset_in_page(gpa); 3907 switch (bytes) { 3908 case 1: 3909 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); 3910 break; 3911 case 2: 3912 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); 3913 break; 3914 case 4: 3915 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); 3916 break; 3917 case 8: 3918 exchanged = CMPXCHG64(kaddr, old, new); 3919 break; 3920 default: 3921 BUG(); 3922 } 3923 kunmap_atomic(kaddr, KM_USER0); 3924 kvm_release_page_dirty(page); 3925 3926 if (!exchanged) 3927 return X86EMUL_CMPXCHG_FAILED; 3928 3929 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); 3930 3931 return X86EMUL_CONTINUE; 3932 3933 emul_write: 3934 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3935 3936 return emulator_write_emulated(addr, new, bytes, exception, vcpu); 3937 } 3938 3939 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3940 { 3941 /* TODO: String I/O for in kernel device */ 3942 int r; 3943 3944 if (vcpu->arch.pio.in) 3945 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, 3946 vcpu->arch.pio.size, pd); 3947 else 3948 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, 3949 vcpu->arch.pio.port, vcpu->arch.pio.size, 3950 pd); 3951 return r; 3952 } 3953 3954 3955 static int emulator_pio_in_emulated(int size, unsigned short port, 
void *val, 3956 unsigned int count, struct kvm_vcpu *vcpu) 3957 { 3958 if (vcpu->arch.pio.count) 3959 goto data_avail; 3960 3961 trace_kvm_pio(0, port, size, count); 3962 3963 vcpu->arch.pio.port = port; 3964 vcpu->arch.pio.in = 1; 3965 vcpu->arch.pio.count = count; 3966 vcpu->arch.pio.size = size; 3967 3968 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3969 data_avail: 3970 memcpy(val, vcpu->arch.pio_data, size * count); 3971 vcpu->arch.pio.count = 0; 3972 return 1; 3973 } 3974 3975 vcpu->run->exit_reason = KVM_EXIT_IO; 3976 vcpu->run->io.direction = KVM_EXIT_IO_IN; 3977 vcpu->run->io.size = size; 3978 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3979 vcpu->run->io.count = count; 3980 vcpu->run->io.port = port; 3981 3982 return 0; 3983 } 3984 3985 static int emulator_pio_out_emulated(int size, unsigned short port, 3986 const void *val, unsigned int count, 3987 struct kvm_vcpu *vcpu) 3988 { 3989 trace_kvm_pio(1, port, size, count); 3990 3991 vcpu->arch.pio.port = port; 3992 vcpu->arch.pio.in = 0; 3993 vcpu->arch.pio.count = count; 3994 vcpu->arch.pio.size = size; 3995 3996 memcpy(vcpu->arch.pio_data, val, size * count); 3997 3998 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3999 vcpu->arch.pio.count = 0; 4000 return 1; 4001 } 4002 4003 vcpu->run->exit_reason = KVM_EXIT_IO; 4004 vcpu->run->io.direction = KVM_EXIT_IO_OUT; 4005 vcpu->run->io.size = size; 4006 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 4007 vcpu->run->io.count = count; 4008 vcpu->run->io.port = port; 4009 4010 return 0; 4011 } 4012 4013 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 4014 { 4015 return kvm_x86_ops->get_segment_base(vcpu, seg); 4016 } 4017 4018 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 4019 { 4020 kvm_mmu_invlpg(vcpu, address); 4021 return X86EMUL_CONTINUE; 4022 } 4023 4024 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4025 { 4026 if (!need_emulate_wbinvd(vcpu)) 4027 return X86EMUL_CONTINUE; 4028 4029 if (kvm_x86_ops->has_wbinvd_exit()) { 4030 int cpu = get_cpu(); 4031 4032 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 4033 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4034 wbinvd_ipi, NULL, 1); 4035 put_cpu(); 4036 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4037 } else 4038 wbinvd(); 4039 return X86EMUL_CONTINUE; 4040 } 4041 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4042 4043 int emulate_clts(struct kvm_vcpu *vcpu) 4044 { 4045 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4046 kvm_x86_ops->fpu_activate(vcpu); 4047 return X86EMUL_CONTINUE; 4048 } 4049 4050 int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 4051 { 4052 return _kvm_get_dr(vcpu, dr, dest); 4053 } 4054 4055 int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 4056 { 4057 4058 return __kvm_set_dr(vcpu, dr, value); 4059 } 4060 4061 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4062 { 4063 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4064 } 4065 4066 static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) 4067 { 4068 unsigned long value; 4069 4070 switch (cr) { 4071 case 0: 4072 value = kvm_read_cr0(vcpu); 4073 break; 4074 case 2: 4075 value = vcpu->arch.cr2; 4076 break; 4077 case 3: 4078 value = kvm_read_cr3(vcpu); 4079 break; 4080 case 4: 4081 value = kvm_read_cr4(vcpu); 4082 break; 4083 case 8: 4084 value = kvm_get_cr8(vcpu); 4085 break; 4086 default: 4087 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4088 return 0; 4089 } 4090 4091 return value; 4092 } 4093 4094 static int 
emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 4095 { 4096 int res = 0; 4097 4098 switch (cr) { 4099 case 0: 4100 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 4101 break; 4102 case 2: 4103 vcpu->arch.cr2 = val; 4104 break; 4105 case 3: 4106 res = kvm_set_cr3(vcpu, val); 4107 break; 4108 case 4: 4109 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4110 break; 4111 case 8: 4112 res = kvm_set_cr8(vcpu, val); 4113 break; 4114 default: 4115 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4116 res = -1; 4117 } 4118 4119 return res; 4120 } 4121 4122 static int emulator_get_cpl(struct kvm_vcpu *vcpu) 4123 { 4124 return kvm_x86_ops->get_cpl(vcpu); 4125 } 4126 4127 static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4128 { 4129 kvm_x86_ops->get_gdt(vcpu, dt); 4130 } 4131 4132 static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4133 { 4134 kvm_x86_ops->get_idt(vcpu, dt); 4135 } 4136 4137 static unsigned long emulator_get_cached_segment_base(int seg, 4138 struct kvm_vcpu *vcpu) 4139 { 4140 return get_segment_base(vcpu, seg); 4141 } 4142 4143 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4144 struct kvm_vcpu *vcpu) 4145 { 4146 struct kvm_segment var; 4147 4148 kvm_get_segment(vcpu, &var, seg); 4149 4150 if (var.unusable) 4151 return false; 4152 4153 if (var.g) 4154 var.limit >>= 12; 4155 set_desc_limit(desc, var.limit); 4156 set_desc_base(desc, (unsigned long)var.base); 4157 desc->type = var.type; 4158 desc->s = var.s; 4159 desc->dpl = var.dpl; 4160 desc->p = var.present; 4161 desc->avl = var.avl; 4162 desc->l = var.l; 4163 desc->d = var.db; 4164 desc->g = var.g; 4165 4166 return true; 4167 } 4168 4169 static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4170 struct kvm_vcpu *vcpu) 4171 { 4172 struct kvm_segment var; 4173 4174 /* needed to preserve selector */ 4175 kvm_get_segment(vcpu, &var, seg); 4176 4177 var.base = get_desc_base(desc); 4178 var.limit = get_desc_limit(desc); 4179 if (desc->g) 4180 var.limit = (var.limit << 12) | 0xfff; 4181 var.type = desc->type; 4182 var.present = desc->p; 4183 var.dpl = desc->dpl; 4184 var.db = desc->d; 4185 var.s = desc->s; 4186 var.l = desc->l; 4187 var.g = desc->g; 4188 var.avl = desc->avl; 4189 var.present = desc->p; 4190 var.unusable = !var.present; 4191 var.padding = 0; 4192 4193 kvm_set_segment(vcpu, &var, seg); 4194 return; 4195 } 4196 4197 static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) 4198 { 4199 struct kvm_segment kvm_seg; 4200 4201 kvm_get_segment(vcpu, &kvm_seg, seg); 4202 return kvm_seg.selector; 4203 } 4204 4205 static void emulator_set_segment_selector(u16 sel, int seg, 4206 struct kvm_vcpu *vcpu) 4207 { 4208 struct kvm_segment kvm_seg; 4209 4210 kvm_get_segment(vcpu, &kvm_seg, seg); 4211 kvm_seg.selector = sel; 4212 kvm_set_segment(vcpu, &kvm_seg, seg); 4213 } 4214 4215 static struct x86_emulate_ops emulate_ops = { 4216 .read_std = kvm_read_guest_virt_system, 4217 .write_std = kvm_write_guest_virt_system, 4218 .fetch = kvm_fetch_guest_virt, 4219 .read_emulated = emulator_read_emulated, 4220 .write_emulated = emulator_write_emulated, 4221 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4222 .pio_in_emulated = emulator_pio_in_emulated, 4223 .pio_out_emulated = emulator_pio_out_emulated, 4224 .get_cached_descriptor = emulator_get_cached_descriptor, 4225 .set_cached_descriptor = emulator_set_cached_descriptor, 4226 .get_segment_selector = emulator_get_segment_selector, 4227 
.set_segment_selector = emulator_set_segment_selector, 4228 .get_cached_segment_base = emulator_get_cached_segment_base, 4229 .get_gdt = emulator_get_gdt, 4230 .get_idt = emulator_get_idt, 4231 .get_cr = emulator_get_cr, 4232 .set_cr = emulator_set_cr, 4233 .cpl = emulator_get_cpl, 4234 .get_dr = emulator_get_dr, 4235 .set_dr = emulator_set_dr, 4236 .set_msr = kvm_set_msr, 4237 .get_msr = kvm_get_msr, 4238 }; 4239 4240 static void cache_all_regs(struct kvm_vcpu *vcpu) 4241 { 4242 kvm_register_read(vcpu, VCPU_REGS_RAX); 4243 kvm_register_read(vcpu, VCPU_REGS_RSP); 4244 kvm_register_read(vcpu, VCPU_REGS_RIP); 4245 vcpu->arch.regs_dirty = ~0; 4246 } 4247 4248 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 4249 { 4250 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 4251 /* 4252 * an sti; sti; sequence only disable interrupts for the first 4253 * instruction. So, if the last instruction, be it emulated or 4254 * not, left the system with the INT_STI flag enabled, it 4255 * means that the last instruction is an sti. We should not 4256 * leave the flag on in this case. The same goes for mov ss 4257 */ 4258 if (!(int_shadow & mask)) 4259 kvm_x86_ops->set_interrupt_shadow(vcpu, mask); 4260 } 4261 4262 static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4263 { 4264 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4265 if (ctxt->exception.vector == PF_VECTOR) 4266 kvm_propagate_fault(vcpu, &ctxt->exception); 4267 else if (ctxt->exception.error_code_valid) 4268 kvm_queue_exception_e(vcpu, ctxt->exception.vector, 4269 ctxt->exception.error_code); 4270 else 4271 kvm_queue_exception(vcpu, ctxt->exception.vector); 4272 } 4273 4274 static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4275 { 4276 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4277 int cs_db, cs_l; 4278 4279 cache_all_regs(vcpu); 4280 4281 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4282 4283 vcpu->arch.emulate_ctxt.vcpu = vcpu; 4284 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 4285 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 4286 vcpu->arch.emulate_ctxt.mode = 4287 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4288 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 4289 ? X86EMUL_MODE_VM86 : cs_l 4290 ? X86EMUL_MODE_PROT64 : cs_db 4291 ? 
			  X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
	memset(c, 0, sizeof(struct decode_cache));
	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
}

int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
{
	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
	int ret;

	init_emulate_ctxt(vcpu);

	vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);

	if (ret != X86EMUL_CONTINUE)
		return EMULATE_FAIL;

	vcpu->arch.emulate_ctxt.eip = c->eip;
	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);

	if (irq == NMI_VECTOR)
		vcpu->arch.nmi_pending = false;
	else
		vcpu->arch.interrupt.pending = false;

	return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);

static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
	int r = EMULATE_DONE;

	++vcpu->stat.insn_emulation_fail;
	trace_kvm_emulate_insn_failed(vcpu);
	if (!is_guest_mode(vcpu)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;
		r = EMULATE_FAIL;
	}
	kvm_queue_exception(vcpu, UD_VECTOR);

	return r;
}

static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;

	if (tdp_enabled)
		return false;

	/*
	 * If emulation was due to access to a shadowed page table
	 * and it failed, try to unshadow the page and re-enter the
	 * guest to let the CPU execute the instruction.
	 */
	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
		return true;

	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);

	if (gpa == UNMAPPED_GVA)
		return true; /* let cpu generate fault */

	if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
		return true;

	return false;
}

int x86_emulate_instruction(struct kvm_vcpu *vcpu,
			    unsigned long cr2,
			    int emulation_type,
			    void *insn,
			    int insn_len)
{
	int r;
	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;

	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
	 * TODO: fix emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses; it can save a hundred cycles
	 * on Intel for instructions that don't read/change RSP,
	 * for example.
	 */
	cache_all_regs(vcpu);

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		init_emulate_ctxt(vcpu);
		vcpu->arch.emulate_ctxt.interruptibility = 0;
		vcpu->arch.emulate_ctxt.have_exception = false;
		vcpu->arch.emulate_ctxt.perm_ok = false;

		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
		if (r == X86EMUL_PROPAGATE_FAULT)
			goto done;

		trace_kvm_emulate_insn_start(vcpu);

		/* Only allow emulation of specific instructions on #UD
		 * (namely VMMCALL, sysenter, sysexit, syscall) */
		if (emulation_type & EMULTYPE_TRAP_UD) {
			if (!c->twobyte)
				return EMULATE_FAIL;
			switch (c->b) {
			case 0x01: /* VMMCALL */
				if (c->modrm_mod != 3 || c->modrm_rm != 1)
					return EMULATE_FAIL;
				break;
			case 0x34: /* sysenter */
			case 0x35: /* sysexit */
				if (c->modrm_mod != 0 || c->modrm_rm != 0)
					return EMULATE_FAIL;
				break;
			case 0x05: /* syscall */
				if (c->modrm_mod != 0 || c->modrm_rm != 0)
					return EMULATE_FAIL;
				break;
			default:
				return EMULATE_FAIL;
			}

			if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
				return EMULATE_FAIL;
		}

		++vcpu->stat.insn_emulation;
		if (r) {
			if (reexecute_instruction(vcpu, cr2))
				return EMULATE_DONE;
			if (emulation_type & EMULTYPE_SKIP)
				return EMULATE_FAIL;
			return handle_emulation_failure(vcpu);
		}
	}

	if (emulation_type & EMULTYPE_SKIP) {
		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
		return EMULATE_DONE;
	}

	/* This is needed for the vmware backdoor interface to work since it
	   changes register values during an IO operation. */
	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);

restart:
	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);

	if (r == EMULATION_FAILED) {
		if (reexecute_instruction(vcpu, cr2))
			return EMULATE_DONE;

		return handle_emulation_failure(vcpu);
	}

done:
	if (vcpu->arch.emulate_ctxt.have_exception) {
		inject_emulated_exception(vcpu);
		r = EMULATE_DONE;
	} else if (vcpu->arch.pio.count) {
		if (!vcpu->arch.pio.in)
			vcpu->arch.pio.count = 0;
		r = EMULATE_DO_MMIO;
	} else if (vcpu->mmio_needed) {
		if (vcpu->mmio_is_write)
			vcpu->mmio_needed = 0;
		r = EMULATE_DO_MMIO;
	} else if (r == EMULATION_RESTART)
		goto restart;
	else
		r = EMULATE_DONE;

	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);

	return r;
}
EXPORT_SYMBOL_GPL(x86_emulate_instruction);

int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
	int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
	/* do not return to the emulator after the return from userspace */
	vcpu->arch.pio.count = 0;
	return ret;
}
EXPORT_SYMBOL_GPL(kvm_fast_pio_out);

static void tsc_bad(void *info)
{
	__this_cpu_write(cpu_tsc_khz, 0);
}

static void tsc_khz_changed(void *data)
{
	struct cpufreq_freqs *freq = data;
	unsigned long khz = 0;

	if (data)
		khz = freq->new;
	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		khz = cpufreq_quick_get(raw_smp_processor_id());
	if (!khz)
		khz = tsc_khz;
	__this_cpu_write(cpu_tsc_khz, khz);
}

static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				     void *data)
{
	struct cpufreq_freqs *freq = data;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i, send_ipi = 0;

	/*
	 * We allow guests to temporarily run on slowing clocks,
	 * provided we notify them after, or to run on accelerating
	 * clocks, provided we notify them before. Thus time never
	 * goes backwards.
	 *
	 * However, we have a problem. We can't atomically update
	 * the frequency of a given CPU from this function; it is
	 * merely a notifier, which can be called from any CPU.
	 * Changing the TSC frequency at arbitrary points in time
	 * requires a recomputation of local variables related to
	 * the TSC for each VCPU. We must flag these local variables
	 * to be updated and be sure the update takes place with the
	 * new frequency before any guests proceed.
	 *
	 * Unfortunately, the combination of hotplug CPU and frequency
	 * change creates an intractable locking scenario; the order
	 * of when these callouts happen is undefined with respect to
	 * CPU hotplug, and they can race with each other. As such,
	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
	 * undefined; you can actually have a CPU frequency change take
	 * place in between the computation of X and the setting of the
	 * variable. To protect against this problem, all updates of
	 * the per_cpu tsc_khz variable are done in an interrupt
	 * protected IPI, and all callers wishing to update the value
	 * must wait for a synchronous IPI to complete (which is trivial
	 * if the caller is on the CPU already). This establishes the
	 * necessary total order on variable updates.
	 *
	 * Note that because a guest time update may take place
	 * anytime after the setting of the VCPU's request bit, the
	 * correct TSC value must be set before the request. However,
	 * to ensure the update actually makes it to any guest which
	 * starts running in hardware virtualization between the set
	 * and the acquisition of the spinlock, we must also ping the
	 * CPU after setting the request bit.
	 */

	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
		return 0;
	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
		return 0;

	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->cpu != freq->cpu)
				continue;
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
			if (vcpu->cpu != smp_processor_id())
				send_ipi = 1;
		}
	}
	spin_unlock(&kvm_lock);

	if (freq->old < freq->new && send_ipi) {
		/*
		 * We upscale the frequency. Make sure the guest
		 * doesn't see old kvmclock values while running with
		 * the new frequency, otherwise we risk the guest seeing
		 * time go backwards.
		 *
		 * In case we update the frequency for another cpu
		 * (which might be in guest context) send an interrupt
		 * to kick the cpu out of guest context. Next time
		 * guest context is entered kvmclock will be updated,
		 * so the guest will not see stale values.
4589 */ 4590 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 4591 } 4592 return 0; 4593 } 4594 4595 static struct notifier_block kvmclock_cpufreq_notifier_block = { 4596 .notifier_call = kvmclock_cpufreq_notifier 4597 }; 4598 4599 static int kvmclock_cpu_notifier(struct notifier_block *nfb, 4600 unsigned long action, void *hcpu) 4601 { 4602 unsigned int cpu = (unsigned long)hcpu; 4603 4604 switch (action) { 4605 case CPU_ONLINE: 4606 case CPU_DOWN_FAILED: 4607 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 4608 break; 4609 case CPU_DOWN_PREPARE: 4610 smp_call_function_single(cpu, tsc_bad, NULL, 1); 4611 break; 4612 } 4613 return NOTIFY_OK; 4614 } 4615 4616 static struct notifier_block kvmclock_cpu_notifier_block = { 4617 .notifier_call = kvmclock_cpu_notifier, 4618 .priority = -INT_MAX 4619 }; 4620 4621 static void kvm_timer_init(void) 4622 { 4623 int cpu; 4624 4625 max_tsc_khz = tsc_khz; 4626 register_hotcpu_notifier(&kvmclock_cpu_notifier_block); 4627 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4628 #ifdef CONFIG_CPU_FREQ 4629 struct cpufreq_policy policy; 4630 memset(&policy, 0, sizeof(policy)); 4631 cpu = get_cpu(); 4632 cpufreq_get_policy(&policy, cpu); 4633 if (policy.cpuinfo.max_freq) 4634 max_tsc_khz = policy.cpuinfo.max_freq; 4635 put_cpu(); 4636 #endif 4637 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4638 CPUFREQ_TRANSITION_NOTIFIER); 4639 } 4640 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); 4641 for_each_online_cpu(cpu) 4642 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 4643 } 4644 4645 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4646 4647 static int kvm_is_in_guest(void) 4648 { 4649 return percpu_read(current_vcpu) != NULL; 4650 } 4651 4652 static int kvm_is_user_mode(void) 4653 { 4654 int user_mode = 3; 4655 4656 if (percpu_read(current_vcpu)) 4657 user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); 4658 4659 return user_mode != 0; 4660 } 4661 4662 static unsigned long kvm_get_guest_ip(void) 4663 { 4664 unsigned long ip = 0; 4665 4666 if (percpu_read(current_vcpu)) 4667 ip = kvm_rip_read(percpu_read(current_vcpu)); 4668 4669 return ip; 4670 } 4671 4672 static struct perf_guest_info_callbacks kvm_guest_cbs = { 4673 .is_in_guest = kvm_is_in_guest, 4674 .is_user_mode = kvm_is_user_mode, 4675 .get_guest_ip = kvm_get_guest_ip, 4676 }; 4677 4678 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) 4679 { 4680 percpu_write(current_vcpu, vcpu); 4681 } 4682 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); 4683 4684 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) 4685 { 4686 percpu_write(current_vcpu, NULL); 4687 } 4688 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 4689 4690 int kvm_arch_init(void *opaque) 4691 { 4692 int r; 4693 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 4694 4695 if (kvm_x86_ops) { 4696 printk(KERN_ERR "kvm: already loaded the other module\n"); 4697 r = -EEXIST; 4698 goto out; 4699 } 4700 4701 if (!ops->cpu_has_kvm_support()) { 4702 printk(KERN_ERR "kvm: no hardware support\n"); 4703 r = -EOPNOTSUPP; 4704 goto out; 4705 } 4706 if (ops->disabled_by_bios()) { 4707 printk(KERN_ERR "kvm: disabled by bios\n"); 4708 r = -EOPNOTSUPP; 4709 goto out; 4710 } 4711 4712 r = kvm_mmu_module_init(); 4713 if (r) 4714 goto out; 4715 4716 kvm_init_msr_list(); 4717 4718 kvm_x86_ops = ops; 4719 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4720 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4721 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4722 4723 kvm_timer_init(); 4724 4725 
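	/*
	 * Register with perf so that NMI-time samples can be attributed to
	 * guest context via the current_vcpu tracking done in
	 * kvm_before_handle_nmi()/kvm_after_handle_nmi().
	 */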
perf_register_guest_info_callbacks(&kvm_guest_cbs); 4726 4727 if (cpu_has_xsave) 4728 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 4729 4730 return 0; 4731 4732 out: 4733 return r; 4734 } 4735 4736 void kvm_arch_exit(void) 4737 { 4738 perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 4739 4740 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4741 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4742 CPUFREQ_TRANSITION_NOTIFIER); 4743 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 4744 kvm_x86_ops = NULL; 4745 kvm_mmu_module_exit(); 4746 } 4747 4748 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 4749 { 4750 ++vcpu->stat.halt_exits; 4751 if (irqchip_in_kernel(vcpu->kvm)) { 4752 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 4753 return 1; 4754 } else { 4755 vcpu->run->exit_reason = KVM_EXIT_HLT; 4756 return 0; 4757 } 4758 } 4759 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 4760 4761 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 4762 unsigned long a1) 4763 { 4764 if (is_long_mode(vcpu)) 4765 return a0; 4766 else 4767 return a0 | ((gpa_t)a1 << 32); 4768 } 4769 4770 int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 4771 { 4772 u64 param, ingpa, outgpa, ret; 4773 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; 4774 bool fast, longmode; 4775 int cs_db, cs_l; 4776 4777 /* 4778 * hypercall generates UD from non zero cpl and real mode 4779 * per HYPER-V spec 4780 */ 4781 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { 4782 kvm_queue_exception(vcpu, UD_VECTOR); 4783 return 0; 4784 } 4785 4786 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4787 longmode = is_long_mode(vcpu) && cs_l == 1; 4788 4789 if (!longmode) { 4790 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | 4791 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); 4792 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | 4793 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); 4794 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | 4795 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); 4796 } 4797 #ifdef CONFIG_X86_64 4798 else { 4799 param = kvm_register_read(vcpu, VCPU_REGS_RCX); 4800 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); 4801 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); 4802 } 4803 #endif 4804 4805 code = param & 0xffff; 4806 fast = (param >> 16) & 0x1; 4807 rep_cnt = (param >> 32) & 0xfff; 4808 rep_idx = (param >> 48) & 0xfff; 4809 4810 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); 4811 4812 switch (code) { 4813 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: 4814 kvm_vcpu_on_spin(vcpu); 4815 break; 4816 default: 4817 res = HV_STATUS_INVALID_HYPERCALL_CODE; 4818 break; 4819 } 4820 4821 ret = res | (((u64)rep_done & 0xfff) << 32); 4822 if (longmode) { 4823 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 4824 } else { 4825 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); 4826 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); 4827 } 4828 4829 return 1; 4830 } 4831 4832 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 4833 { 4834 unsigned long nr, a0, a1, a2, a3, ret; 4835 int r = 1; 4836 4837 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 4838 return kvm_hv_hypercall(vcpu); 4839 4840 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 4841 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 4842 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 4843 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 4844 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 4845 4846 trace_kvm_hypercall(nr, a0, a1, a2, a3); 4847 4848 if 
(!is_long_mode(vcpu)) { 4849 nr &= 0xFFFFFFFF; 4850 a0 &= 0xFFFFFFFF; 4851 a1 &= 0xFFFFFFFF; 4852 a2 &= 0xFFFFFFFF; 4853 a3 &= 0xFFFFFFFF; 4854 } 4855 4856 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 4857 ret = -KVM_EPERM; 4858 goto out; 4859 } 4860 4861 switch (nr) { 4862 case KVM_HC_VAPIC_POLL_IRQ: 4863 ret = 0; 4864 break; 4865 case KVM_HC_MMU_OP: 4866 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 4867 break; 4868 default: 4869 ret = -KVM_ENOSYS; 4870 break; 4871 } 4872 out: 4873 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 4874 ++vcpu->stat.hypercalls; 4875 return r; 4876 } 4877 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 4878 4879 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 4880 { 4881 char instruction[3]; 4882 unsigned long rip = kvm_rip_read(vcpu); 4883 4884 /* 4885 * Blow out the MMU to ensure that no other VCPU has an active mapping 4886 * to ensure that the updated hypercall appears atomically across all 4887 * VCPUs. 4888 */ 4889 kvm_mmu_zap_all(vcpu->kvm); 4890 4891 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4892 4893 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 4894 } 4895 4896 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4897 { 4898 struct desc_ptr dt = { limit, base }; 4899 4900 kvm_x86_ops->set_gdt(vcpu, &dt); 4901 } 4902 4903 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4904 { 4905 struct desc_ptr dt = { limit, base }; 4906 4907 kvm_x86_ops->set_idt(vcpu, &dt); 4908 } 4909 4910 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4911 { 4912 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4913 int j, nent = vcpu->arch.cpuid_nent; 4914 4915 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 4916 /* when no next entry is found, the current entry[i] is reselected */ 4917 for (j = i + 1; ; j = (j + 1) % nent) { 4918 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 4919 if (ej->function == e->function) { 4920 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 4921 return j; 4922 } 4923 } 4924 return 0; /* silence gcc, even though control never reaches here */ 4925 } 4926 4927 /* find an entry with matching function, matching index (if needed), and that 4928 * should be read next (if it's stateful) */ 4929 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 4930 u32 function, u32 index) 4931 { 4932 if (e->function != function) 4933 return 0; 4934 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 4935 return 0; 4936 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 4937 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 4938 return 0; 4939 return 1; 4940 } 4941 4942 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 4943 u32 function, u32 index) 4944 { 4945 int i; 4946 struct kvm_cpuid_entry2 *best = NULL; 4947 4948 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 4949 struct kvm_cpuid_entry2 *e; 4950 4951 e = &vcpu->arch.cpuid_entries[i]; 4952 if (is_matching_cpuid_entry(e, function, index)) { 4953 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 4954 move_to_next_stateful_cpuid_entry(vcpu, i); 4955 best = e; 4956 break; 4957 } 4958 /* 4959 * Both basic or both extended? 
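		 * If there is no exact match, remember the entry with the
		 * highest function number in the same class (basic vs.
		 * extended 0x8xxxxxxx leaves) as the requested one and
		 * fall back to it.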
4960 */ 4961 if (((e->function ^ function) & 0x80000000) == 0) 4962 if (!best || e->function > best->function) 4963 best = e; 4964 } 4965 return best; 4966 } 4967 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); 4968 4969 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4970 { 4971 struct kvm_cpuid_entry2 *best; 4972 4973 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); 4974 if (!best || best->eax < 0x80000008) 4975 goto not_found; 4976 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4977 if (best) 4978 return best->eax & 0xff; 4979 not_found: 4980 return 36; 4981 } 4982 4983 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 4984 { 4985 u32 function, index; 4986 struct kvm_cpuid_entry2 *best; 4987 4988 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 4989 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 4990 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 4991 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 4992 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 4993 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 4994 best = kvm_find_cpuid_entry(vcpu, function, index); 4995 if (best) { 4996 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 4997 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 4998 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 4999 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 5000 } 5001 kvm_x86_ops->skip_emulated_instruction(vcpu); 5002 trace_kvm_cpuid(function, 5003 kvm_register_read(vcpu, VCPU_REGS_RAX), 5004 kvm_register_read(vcpu, VCPU_REGS_RBX), 5005 kvm_register_read(vcpu, VCPU_REGS_RCX), 5006 kvm_register_read(vcpu, VCPU_REGS_RDX)); 5007 } 5008 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 5009 5010 /* 5011 * Check if userspace requested an interrupt window, and that the 5012 * interrupt window is open. 5013 * 5014 * No need to exit to userspace if we already have an interrupt queued. 
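 * This only matters when the interrupt controller is emulated in userspace;
 * with an in-kernel irqchip the request_interrupt_window flag is ignored.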
5015 */ 5016 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 5017 { 5018 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 5019 vcpu->run->request_interrupt_window && 5020 kvm_arch_interrupt_allowed(vcpu)); 5021 } 5022 5023 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 5024 { 5025 struct kvm_run *kvm_run = vcpu->run; 5026 5027 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 5028 kvm_run->cr8 = kvm_get_cr8(vcpu); 5029 kvm_run->apic_base = kvm_get_apic_base(vcpu); 5030 if (irqchip_in_kernel(vcpu->kvm)) 5031 kvm_run->ready_for_interrupt_injection = 1; 5032 else 5033 kvm_run->ready_for_interrupt_injection = 5034 kvm_arch_interrupt_allowed(vcpu) && 5035 !kvm_cpu_has_interrupt(vcpu) && 5036 !kvm_event_needs_reinjection(vcpu); 5037 } 5038 5039 static void vapic_enter(struct kvm_vcpu *vcpu) 5040 { 5041 struct kvm_lapic *apic = vcpu->arch.apic; 5042 struct page *page; 5043 5044 if (!apic || !apic->vapic_addr) 5045 return; 5046 5047 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 5048 5049 vcpu->arch.apic->vapic_page = page; 5050 } 5051 5052 static void vapic_exit(struct kvm_vcpu *vcpu) 5053 { 5054 struct kvm_lapic *apic = vcpu->arch.apic; 5055 int idx; 5056 5057 if (!apic || !apic->vapic_addr) 5058 return; 5059 5060 idx = srcu_read_lock(&vcpu->kvm->srcu); 5061 kvm_release_page_dirty(apic->vapic_page); 5062 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 5063 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5064 } 5065 5066 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 5067 { 5068 int max_irr, tpr; 5069 5070 if (!kvm_x86_ops->update_cr8_intercept) 5071 return; 5072 5073 if (!vcpu->arch.apic) 5074 return; 5075 5076 if (!vcpu->arch.apic->vapic_addr) 5077 max_irr = kvm_lapic_find_highest_irr(vcpu); 5078 else 5079 max_irr = -1; 5080 5081 if (max_irr != -1) 5082 max_irr >>= 4; 5083 5084 tpr = kvm_lapic_get_cr8(vcpu); 5085 5086 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 5087 } 5088 5089 static void inject_pending_event(struct kvm_vcpu *vcpu) 5090 { 5091 /* try to reinject previous events if any */ 5092 if (vcpu->arch.exception.pending) { 5093 trace_kvm_inj_exception(vcpu->arch.exception.nr, 5094 vcpu->arch.exception.has_error_code, 5095 vcpu->arch.exception.error_code); 5096 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 5097 vcpu->arch.exception.has_error_code, 5098 vcpu->arch.exception.error_code, 5099 vcpu->arch.exception.reinject); 5100 return; 5101 } 5102 5103 if (vcpu->arch.nmi_injected) { 5104 kvm_x86_ops->set_nmi(vcpu); 5105 return; 5106 } 5107 5108 if (vcpu->arch.interrupt.pending) { 5109 kvm_x86_ops->set_irq(vcpu); 5110 return; 5111 } 5112 5113 /* try to inject new event if pending */ 5114 if (vcpu->arch.nmi_pending) { 5115 if (kvm_x86_ops->nmi_allowed(vcpu)) { 5116 vcpu->arch.nmi_pending = false; 5117 vcpu->arch.nmi_injected = true; 5118 kvm_x86_ops->set_nmi(vcpu); 5119 } 5120 } else if (kvm_cpu_has_interrupt(vcpu)) { 5121 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5122 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5123 false); 5124 kvm_x86_ops->set_irq(vcpu); 5125 } 5126 } 5127 } 5128 5129 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) 5130 { 5131 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && 5132 !vcpu->guest_xcr0_loaded) { 5133 /* kvm_set_xcr() also depends on this */ 5134 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 5135 vcpu->guest_xcr0_loaded = 1; 5136 } 5137 } 5138 5139 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) 5140 { 5141 if 
(vcpu->guest_xcr0_loaded) { 5142 if (vcpu->arch.xcr0 != host_xcr0) 5143 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 5144 vcpu->guest_xcr0_loaded = 0; 5145 } 5146 } 5147 5148 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5149 { 5150 int r; 5151 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5152 vcpu->run->request_interrupt_window; 5153 5154 if (vcpu->requests) { 5155 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 5156 kvm_mmu_unload(vcpu); 5157 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5158 __kvm_migrate_timers(vcpu); 5159 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5160 r = kvm_guest_time_update(vcpu); 5161 if (unlikely(r)) 5162 goto out; 5163 } 5164 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5165 kvm_mmu_sync_roots(vcpu); 5166 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5167 kvm_x86_ops->tlb_flush(vcpu); 5168 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { 5169 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 5170 r = 0; 5171 goto out; 5172 } 5173 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 5174 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5175 r = 0; 5176 goto out; 5177 } 5178 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { 5179 vcpu->fpu_active = 0; 5180 kvm_x86_ops->fpu_deactivate(vcpu); 5181 } 5182 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { 5183 /* Page is swapped out. Do synthetic halt */ 5184 vcpu->arch.apf.halted = true; 5185 r = 1; 5186 goto out; 5187 } 5188 } 5189 5190 r = kvm_mmu_reload(vcpu); 5191 if (unlikely(r)) 5192 goto out; 5193 5194 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5195 inject_pending_event(vcpu); 5196 5197 /* enable NMI/IRQ window open exits if needed */ 5198 if (vcpu->arch.nmi_pending) 5199 kvm_x86_ops->enable_nmi_window(vcpu); 5200 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5201 kvm_x86_ops->enable_irq_window(vcpu); 5202 5203 if (kvm_lapic_enabled(vcpu)) { 5204 update_cr8_intercept(vcpu); 5205 kvm_lapic_sync_to_vapic(vcpu); 5206 } 5207 } 5208 5209 preempt_disable(); 5210 5211 kvm_x86_ops->prepare_guest_switch(vcpu); 5212 if (vcpu->fpu_active) 5213 kvm_load_guest_fpu(vcpu); 5214 kvm_load_guest_xcr0(vcpu); 5215 5216 atomic_set(&vcpu->guest_mode, 1); 5217 smp_wmb(); 5218 5219 local_irq_disable(); 5220 5221 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5222 || need_resched() || signal_pending(current)) { 5223 atomic_set(&vcpu->guest_mode, 0); 5224 smp_wmb(); 5225 local_irq_enable(); 5226 preempt_enable(); 5227 kvm_x86_ops->cancel_injection(vcpu); 5228 r = 1; 5229 goto out; 5230 } 5231 5232 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5233 5234 kvm_guest_enter(); 5235 5236 if (unlikely(vcpu->arch.switch_db_regs)) { 5237 set_debugreg(0, 7); 5238 set_debugreg(vcpu->arch.eff_db[0], 0); 5239 set_debugreg(vcpu->arch.eff_db[1], 1); 5240 set_debugreg(vcpu->arch.eff_db[2], 2); 5241 set_debugreg(vcpu->arch.eff_db[3], 3); 5242 } 5243 5244 trace_kvm_entry(vcpu->vcpu_id); 5245 kvm_x86_ops->run(vcpu); 5246 5247 /* 5248 * If the guest has used debug registers, at least dr7 5249 * will be disabled while returning to the host. 5250 * If we don't have active breakpoints in the host, we don't 5251 * care about the messed up debug address registers. But if 5252 * we have some of them active, restore the old state. 
5253 */ 5254 if (hw_breakpoint_active()) 5255 hw_breakpoint_restore(); 5256 5257 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5258 5259 atomic_set(&vcpu->guest_mode, 0); 5260 smp_wmb(); 5261 local_irq_enable(); 5262 5263 ++vcpu->stat.exits; 5264 5265 /* 5266 * We must have an instruction between local_irq_enable() and 5267 * kvm_guest_exit(), so the timer interrupt isn't delayed by 5268 * the interrupt shadow. The stat.exits increment will do nicely. 5269 * But we need to prevent reordering, hence this barrier(): 5270 */ 5271 barrier(); 5272 5273 kvm_guest_exit(); 5274 5275 preempt_enable(); 5276 5277 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5278 5279 /* 5280 * Profile KVM exit RIPs: 5281 */ 5282 if (unlikely(prof_on == KVM_PROFILING)) { 5283 unsigned long rip = kvm_rip_read(vcpu); 5284 profile_hit(KVM_PROFILING, (void *)rip); 5285 } 5286 5287 5288 kvm_lapic_sync_from_vapic(vcpu); 5289 5290 r = kvm_x86_ops->handle_exit(vcpu); 5291 out: 5292 return r; 5293 } 5294 5295 5296 static int __vcpu_run(struct kvm_vcpu *vcpu) 5297 { 5298 int r; 5299 struct kvm *kvm = vcpu->kvm; 5300 5301 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 5302 pr_debug("vcpu %d received sipi with vector # %x\n", 5303 vcpu->vcpu_id, vcpu->arch.sipi_vector); 5304 kvm_lapic_reset(vcpu); 5305 r = kvm_arch_vcpu_reset(vcpu); 5306 if (r) 5307 return r; 5308 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5309 } 5310 5311 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5312 vapic_enter(vcpu); 5313 5314 r = 1; 5315 while (r > 0) { 5316 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 5317 !vcpu->arch.apf.halted) 5318 r = vcpu_enter_guest(vcpu); 5319 else { 5320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5321 kvm_vcpu_block(vcpu); 5322 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5323 if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) 5324 { 5325 switch(vcpu->arch.mp_state) { 5326 case KVM_MP_STATE_HALTED: 5327 vcpu->arch.mp_state = 5328 KVM_MP_STATE_RUNNABLE; 5329 case KVM_MP_STATE_RUNNABLE: 5330 vcpu->arch.apf.halted = false; 5331 break; 5332 case KVM_MP_STATE_SIPI_RECEIVED: 5333 default: 5334 r = -EINTR; 5335 break; 5336 } 5337 } 5338 } 5339 5340 if (r <= 0) 5341 break; 5342 5343 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 5344 if (kvm_cpu_has_pending_timer(vcpu)) 5345 kvm_inject_pending_timer_irqs(vcpu); 5346 5347 if (dm_request_for_irq_injection(vcpu)) { 5348 r = -EINTR; 5349 vcpu->run->exit_reason = KVM_EXIT_INTR; 5350 ++vcpu->stat.request_irq_exits; 5351 } 5352 5353 kvm_check_async_pf_completion(vcpu); 5354 5355 if (signal_pending(current)) { 5356 r = -EINTR; 5357 vcpu->run->exit_reason = KVM_EXIT_INTR; 5358 ++vcpu->stat.signal_exits; 5359 } 5360 if (need_resched()) { 5361 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5362 kvm_resched(vcpu); 5363 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5364 } 5365 } 5366 5367 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5368 5369 vapic_exit(vcpu); 5370 5371 return r; 5372 } 5373 5374 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5375 { 5376 int r; 5377 sigset_t sigsaved; 5378 5379 if (!tsk_used_math(current) && init_fpu(current)) 5380 return -ENOMEM; 5381 5382 if (vcpu->sigset_active) 5383 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5384 5385 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 5386 kvm_vcpu_block(vcpu); 5387 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 5388 r = -EAGAIN; 5389 goto out; 5390 } 5391 5392 /* re-sync apic's tpr */ 5393 if (!irqchip_in_kernel(vcpu->kvm)) { 
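		/*
		 * With a userspace irqchip the TPR is maintained by
		 * userspace; fold the value it reported back into the
		 * vcpu before entering the guest.
		 */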
5394 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 5395 r = -EINVAL; 5396 goto out; 5397 } 5398 } 5399 5400 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5401 if (vcpu->mmio_needed) { 5402 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 5403 vcpu->mmio_read_completed = 1; 5404 vcpu->mmio_needed = 0; 5405 } 5406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5407 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5409 if (r != EMULATE_DONE) { 5410 r = 0; 5411 goto out; 5412 } 5413 } 5414 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 5415 kvm_register_write(vcpu, VCPU_REGS_RAX, 5416 kvm_run->hypercall.ret); 5417 5418 r = __vcpu_run(vcpu); 5419 5420 out: 5421 post_kvm_run_save(vcpu); 5422 if (vcpu->sigset_active) 5423 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 5424 5425 return r; 5426 } 5427 5428 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5429 { 5430 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5431 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5432 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5433 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 5434 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 5435 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 5436 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 5437 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 5438 #ifdef CONFIG_X86_64 5439 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 5440 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 5441 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 5442 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 5443 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 5444 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 5445 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 5446 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 5447 #endif 5448 5449 regs->rip = kvm_rip_read(vcpu); 5450 regs->rflags = kvm_get_rflags(vcpu); 5451 5452 return 0; 5453 } 5454 5455 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5456 { 5457 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 5458 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 5459 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 5460 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 5461 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 5462 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 5463 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 5464 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 5465 #ifdef CONFIG_X86_64 5466 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 5467 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 5468 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 5469 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 5470 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 5471 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 5472 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 5473 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 5474 #endif 5475 5476 kvm_rip_write(vcpu, regs->rip); 5477 kvm_set_rflags(vcpu, regs->rflags); 5478 5479 vcpu->arch.exception.pending = false; 5480 5481 kvm_make_request(KVM_REQ_EVENT, vcpu); 5482 5483 return 0; 5484 } 5485 5486 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 5487 { 5488 struct kvm_segment cs; 5489 5490 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 5491 *db = cs.db; 5492 *l = cs.l; 5493 } 5494 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 5495 5496 int 
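/*
 * Fill @sregs with the vcpu's segment registers, descriptor tables,
 * control registers and pending-interrupt bitmap (used by the
 * KVM_GET_SREGS ioctl).
 */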
kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 5497 struct kvm_sregs *sregs) 5498 { 5499 struct desc_ptr dt; 5500 5501 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5502 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5503 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5504 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 5505 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 5506 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 5507 5508 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 5509 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 5510 5511 kvm_x86_ops->get_idt(vcpu, &dt); 5512 sregs->idt.limit = dt.size; 5513 sregs->idt.base = dt.address; 5514 kvm_x86_ops->get_gdt(vcpu, &dt); 5515 sregs->gdt.limit = dt.size; 5516 sregs->gdt.base = dt.address; 5517 5518 sregs->cr0 = kvm_read_cr0(vcpu); 5519 sregs->cr2 = vcpu->arch.cr2; 5520 sregs->cr3 = kvm_read_cr3(vcpu); 5521 sregs->cr4 = kvm_read_cr4(vcpu); 5522 sregs->cr8 = kvm_get_cr8(vcpu); 5523 sregs->efer = vcpu->arch.efer; 5524 sregs->apic_base = kvm_get_apic_base(vcpu); 5525 5526 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 5527 5528 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 5529 set_bit(vcpu->arch.interrupt.nr, 5530 (unsigned long *)sregs->interrupt_bitmap); 5531 5532 return 0; 5533 } 5534 5535 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5536 struct kvm_mp_state *mp_state) 5537 { 5538 mp_state->mp_state = vcpu->arch.mp_state; 5539 return 0; 5540 } 5541 5542 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5543 struct kvm_mp_state *mp_state) 5544 { 5545 vcpu->arch.mp_state = mp_state->mp_state; 5546 kvm_make_request(KVM_REQ_EVENT, vcpu); 5547 return 0; 5548 } 5549 5550 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5551 bool has_error_code, u32 error_code) 5552 { 5553 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5554 int ret; 5555 5556 init_emulate_ctxt(vcpu); 5557 5558 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, 5559 tss_selector, reason, has_error_code, 5560 error_code); 5561 5562 if (ret) 5563 return EMULATE_FAIL; 5564 5565 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5566 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5567 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5568 kvm_make_request(KVM_REQ_EVENT, vcpu); 5569 return EMULATE_DONE; 5570 } 5571 EXPORT_SYMBOL_GPL(kvm_task_switch); 5572 5573 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 5574 struct kvm_sregs *sregs) 5575 { 5576 int mmu_reset_needed = 0; 5577 int pending_vec, max_bits; 5578 struct desc_ptr dt; 5579 5580 dt.size = sregs->idt.limit; 5581 dt.address = sregs->idt.base; 5582 kvm_x86_ops->set_idt(vcpu, &dt); 5583 dt.size = sregs->gdt.limit; 5584 dt.address = sregs->gdt.base; 5585 kvm_x86_ops->set_gdt(vcpu, &dt); 5586 5587 vcpu->arch.cr2 = sregs->cr2; 5588 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; 5589 vcpu->arch.cr3 = sregs->cr3; 5590 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 5591 5592 kvm_set_cr8(vcpu, sregs->cr8); 5593 5594 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 5595 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5596 kvm_set_apic_base(vcpu, sregs->apic_base); 5597 5598 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 5599 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5600 vcpu->arch.cr0 = sregs->cr0; 5601 5602 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5604 if (sregs->cr4 & X86_CR4_OSXSAVE) 5605 
update_cpuid(vcpu); 5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 5608 mmu_reset_needed = 1; 5609 } 5610 5611 if (mmu_reset_needed) 5612 kvm_mmu_reset_context(vcpu); 5613 5614 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 5615 pending_vec = find_first_bit( 5616 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 5617 if (pending_vec < max_bits) { 5618 kvm_queue_interrupt(vcpu, pending_vec, false); 5619 pr_debug("Set back pending irq %d\n", pending_vec); 5620 if (irqchip_in_kernel(vcpu->kvm)) 5621 kvm_pic_clear_isr_ack(vcpu->kvm); 5622 } 5623 5624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5625 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5626 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5627 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 5628 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 5629 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 5630 5631 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 5632 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 5633 5634 update_cr8_intercept(vcpu); 5635 5636 /* Older userspace won't unhalt the vcpu on reset. */ 5637 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5638 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5639 !is_protmode(vcpu)) 5640 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5641 5642 kvm_make_request(KVM_REQ_EVENT, vcpu); 5643 5644 return 0; 5645 } 5646 5647 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 5648 struct kvm_guest_debug *dbg) 5649 { 5650 unsigned long rflags; 5651 int i, r; 5652 5653 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5654 r = -EBUSY; 5655 if (vcpu->arch.exception.pending) 5656 goto out; 5657 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5658 kvm_queue_exception(vcpu, DB_VECTOR); 5659 else 5660 kvm_queue_exception(vcpu, BP_VECTOR); 5661 } 5662 5663 /* 5664 * Read rflags as long as potentially injected trace flags are still 5665 * filtered out. 5666 */ 5667 rflags = kvm_get_rflags(vcpu); 5668 5669 vcpu->guest_debug = dbg->control; 5670 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 5671 vcpu->guest_debug = 0; 5672 5673 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5674 for (i = 0; i < KVM_NR_DB_REGS; ++i) 5675 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 5676 vcpu->arch.switch_db_regs = 5677 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 5678 } else { 5679 for (i = 0; i < KVM_NR_DB_REGS; i++) 5680 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 5681 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5682 } 5683 5684 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5685 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 5686 get_segment_base(vcpu, VCPU_SREG_CS); 5687 5688 /* 5689 * Trigger an rflags update that will inject or remove the trace 5690 * flags. 5691 */ 5692 kvm_set_rflags(vcpu, rflags); 5693 5694 kvm_x86_ops->set_guest_debug(vcpu, dbg); 5695 5696 r = 0; 5697 5698 out: 5699 5700 return r; 5701 } 5702 5703 /* 5704 * Translate a guest virtual address to a guest physical address. 
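 * The lookup uses the vcpu's current paging mode; tr->valid reports
 * whether the address was mapped at all.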
5705 */ 5706 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 5707 struct kvm_translation *tr) 5708 { 5709 unsigned long vaddr = tr->linear_address; 5710 gpa_t gpa; 5711 int idx; 5712 5713 idx = srcu_read_lock(&vcpu->kvm->srcu); 5714 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5715 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5716 tr->physical_address = gpa; 5717 tr->valid = gpa != UNMAPPED_GVA; 5718 tr->writeable = 1; 5719 tr->usermode = 0; 5720 5721 return 0; 5722 } 5723 5724 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5725 { 5726 struct i387_fxsave_struct *fxsave = 5727 &vcpu->arch.guest_fpu.state->fxsave; 5728 5729 memcpy(fpu->fpr, fxsave->st_space, 128); 5730 fpu->fcw = fxsave->cwd; 5731 fpu->fsw = fxsave->swd; 5732 fpu->ftwx = fxsave->twd; 5733 fpu->last_opcode = fxsave->fop; 5734 fpu->last_ip = fxsave->rip; 5735 fpu->last_dp = fxsave->rdp; 5736 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5737 5738 return 0; 5739 } 5740 5741 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5742 { 5743 struct i387_fxsave_struct *fxsave = 5744 &vcpu->arch.guest_fpu.state->fxsave; 5745 5746 memcpy(fxsave->st_space, fpu->fpr, 128); 5747 fxsave->cwd = fpu->fcw; 5748 fxsave->swd = fpu->fsw; 5749 fxsave->twd = fpu->ftwx; 5750 fxsave->fop = fpu->last_opcode; 5751 fxsave->rip = fpu->last_ip; 5752 fxsave->rdp = fpu->last_dp; 5753 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5754 5755 return 0; 5756 } 5757 5758 int fx_init(struct kvm_vcpu *vcpu) 5759 { 5760 int err; 5761 5762 err = fpu_alloc(&vcpu->arch.guest_fpu); 5763 if (err) 5764 return err; 5765 5766 fpu_finit(&vcpu->arch.guest_fpu); 5767 5768 /* 5769 * Ensure guest xcr0 is valid for loading 5770 */ 5771 vcpu->arch.xcr0 = XSTATE_FP; 5772 5773 vcpu->arch.cr0 |= X86_CR0_ET; 5774 5775 return 0; 5776 } 5777 EXPORT_SYMBOL_GPL(fx_init); 5778 5779 static void fx_free(struct kvm_vcpu *vcpu) 5780 { 5781 fpu_free(&vcpu->arch.guest_fpu); 5782 } 5783 5784 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5785 { 5786 if (vcpu->guest_fpu_loaded) 5787 return; 5788 5789 /* 5790 * Restore all possible states in the guest, 5791 * and assume host would use all available bits. 5792 * Guest xcr0 would be loaded later. 
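	 * (kvm_load_guest_xcr0() reloads the guest value on the next
	 *  guest entry.)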
5793 */ 5794 kvm_put_guest_xcr0(vcpu); 5795 vcpu->guest_fpu_loaded = 1; 5796 unlazy_fpu(current); 5797 fpu_restore_checking(&vcpu->arch.guest_fpu); 5798 trace_kvm_fpu(1); 5799 } 5800 5801 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5802 { 5803 kvm_put_guest_xcr0(vcpu); 5804 5805 if (!vcpu->guest_fpu_loaded) 5806 return; 5807 5808 vcpu->guest_fpu_loaded = 0; 5809 fpu_save_init(&vcpu->arch.guest_fpu); 5810 ++vcpu->stat.fpu_reload; 5811 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); 5812 trace_kvm_fpu(0); 5813 } 5814 5815 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5816 { 5817 if (vcpu->arch.time_page) { 5818 kvm_release_page_dirty(vcpu->arch.time_page); 5819 vcpu->arch.time_page = NULL; 5820 } 5821 5822 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5823 fx_free(vcpu); 5824 kvm_x86_ops->vcpu_free(vcpu); 5825 } 5826 5827 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 5828 unsigned int id) 5829 { 5830 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) 5831 printk_once(KERN_WARNING 5832 "kvm: SMP vm created on host with unstable TSC; " 5833 "guest TSC will not be reliable\n"); 5834 return kvm_x86_ops->vcpu_create(kvm, id); 5835 } 5836 5837 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 5838 { 5839 int r; 5840 5841 vcpu->arch.mtrr_state.have_fixed = 1; 5842 vcpu_load(vcpu); 5843 r = kvm_arch_vcpu_reset(vcpu); 5844 if (r == 0) 5845 r = kvm_mmu_setup(vcpu); 5846 vcpu_put(vcpu); 5847 if (r < 0) 5848 goto free_vcpu; 5849 5850 return 0; 5851 free_vcpu: 5852 kvm_x86_ops->vcpu_free(vcpu); 5853 return r; 5854 } 5855 5856 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5857 { 5858 vcpu->arch.apf.msr_val = 0; 5859 5860 vcpu_load(vcpu); 5861 kvm_mmu_unload(vcpu); 5862 vcpu_put(vcpu); 5863 5864 fx_free(vcpu); 5865 kvm_x86_ops->vcpu_free(vcpu); 5866 } 5867 5868 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 5869 { 5870 vcpu->arch.nmi_pending = false; 5871 vcpu->arch.nmi_injected = false; 5872 5873 vcpu->arch.switch_db_regs = 0; 5874 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 5875 vcpu->arch.dr6 = DR6_FIXED_1; 5876 vcpu->arch.dr7 = DR7_FIXED_1; 5877 5878 kvm_make_request(KVM_REQ_EVENT, vcpu); 5879 vcpu->arch.apf.msr_val = 0; 5880 5881 kvm_clear_async_pf_completion_queue(vcpu); 5882 kvm_async_pf_hash_reset(vcpu); 5883 vcpu->arch.apf.halted = false; 5884 5885 return kvm_x86_ops->vcpu_reset(vcpu); 5886 } 5887 5888 int kvm_arch_hardware_enable(void *garbage) 5889 { 5890 struct kvm *kvm; 5891 struct kvm_vcpu *vcpu; 5892 int i; 5893 5894 kvm_shared_msr_cpu_online(); 5895 list_for_each_entry(kvm, &vm_list, vm_list) 5896 kvm_for_each_vcpu(i, vcpu, kvm) 5897 if (vcpu->cpu == smp_processor_id()) 5898 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5899 return kvm_x86_ops->hardware_enable(garbage); 5900 } 5901 5902 void kvm_arch_hardware_disable(void *garbage) 5903 { 5904 kvm_x86_ops->hardware_disable(garbage); 5905 drop_user_return_notifiers(garbage); 5906 } 5907 5908 int kvm_arch_hardware_setup(void) 5909 { 5910 return kvm_x86_ops->hardware_setup(); 5911 } 5912 5913 void kvm_arch_hardware_unsetup(void) 5914 { 5915 kvm_x86_ops->hardware_unsetup(); 5916 } 5917 5918 void kvm_arch_check_processor_compat(void *rtn) 5919 { 5920 kvm_x86_ops->check_processor_compatibility(rtn); 5921 } 5922 5923 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 5924 { 5925 struct page *page; 5926 struct kvm *kvm; 5927 int r; 5928 5929 BUG_ON(vcpu->kvm == NULL); 5930 kvm = vcpu->kvm; 5931 5932 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 5933 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 5934 vcpu->arch.mmu.root_hpa = 
INVALID_PAGE; 5935 vcpu->arch.mmu.translate_gpa = translate_gpa; 5936 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; 5937 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5938 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5939 else 5940 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 5941 5942 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 5943 if (!page) { 5944 r = -ENOMEM; 5945 goto fail; 5946 } 5947 vcpu->arch.pio_data = page_address(page); 5948 5949 if (!kvm->arch.virtual_tsc_khz) 5950 kvm_arch_set_tsc_khz(kvm, max_tsc_khz); 5951 5952 r = kvm_mmu_create(vcpu); 5953 if (r < 0) 5954 goto fail_free_pio_data; 5955 5956 if (irqchip_in_kernel(kvm)) { 5957 r = kvm_create_lapic(vcpu); 5958 if (r < 0) 5959 goto fail_mmu_destroy; 5960 } 5961 5962 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 5963 GFP_KERNEL); 5964 if (!vcpu->arch.mce_banks) { 5965 r = -ENOMEM; 5966 goto fail_free_lapic; 5967 } 5968 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5969 5970 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5971 goto fail_free_mce_banks; 5972 5973 kvm_async_pf_hash_reset(vcpu); 5974 5975 return 0; 5976 fail_free_mce_banks: 5977 kfree(vcpu->arch.mce_banks); 5978 fail_free_lapic: 5979 kvm_free_lapic(vcpu); 5980 fail_mmu_destroy: 5981 kvm_mmu_destroy(vcpu); 5982 fail_free_pio_data: 5983 free_page((unsigned long)vcpu->arch.pio_data); 5984 fail: 5985 return r; 5986 } 5987 5988 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5989 { 5990 int idx; 5991 5992 kfree(vcpu->arch.mce_banks); 5993 kvm_free_lapic(vcpu); 5994 idx = srcu_read_lock(&vcpu->kvm->srcu); 5995 kvm_mmu_destroy(vcpu); 5996 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5997 free_page((unsigned long)vcpu->arch.pio_data); 5998 } 5999 6000 int kvm_arch_init_vm(struct kvm *kvm) 6001 { 6002 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6003 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6004 6005 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6006 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6007 6008 spin_lock_init(&kvm->arch.tsc_write_lock); 6009 6010 return 0; 6011 } 6012 6013 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6014 { 6015 vcpu_load(vcpu); 6016 kvm_mmu_unload(vcpu); 6017 vcpu_put(vcpu); 6018 } 6019 6020 static void kvm_free_vcpus(struct kvm *kvm) 6021 { 6022 unsigned int i; 6023 struct kvm_vcpu *vcpu; 6024 6025 /* 6026 * Unpin any mmu pages first. 
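	 * (kvm_mmu_unload() drops the root page references each vcpu
	 *  still holds.)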
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_unload_vcpu_mmu(vcpu);
	}
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_arch_vcpu_free(vcpu);

	mutex_lock(&kvm->lock);
	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
		kvm->vcpus[i] = NULL;

	atomic_set(&kvm->online_vcpus, 0);
	mutex_unlock(&kvm->lock);
}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
	kvm_free_pit(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				struct kvm_memory_slot *memslot,
				struct kvm_memory_slot old,
				struct kvm_userspace_memory_region *mem,
				int user_alloc)
{
	int npages = memslot->npages;
	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;

	/* Prevent internal slot pages from being moved by fork()/COW. */
	if (memslot->id >= KVM_MEMORY_SLOTS)
		map_flags = MAP_SHARED | MAP_ANONYMOUS;

	/*
	 * To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 map_flags,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			memslot->userspace_addr = userspace_addr;
		}
	}

	return 0;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem,
				struct kvm_memory_slot old,
				int user_alloc)
{

	int npages = mem->memory_size >> PAGE_SHIFT;

	if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
		int ret;

		down_write(&current->mm->mmap_sem);
		ret = do_munmap(current->mm, old.userspace_addr,
				old.npages * PAGE_SIZE);
		up_write(&current->mm->mmap_sem);
		if (ret < 0)
			printk(KERN_WARNING
			       "kvm_vm_ioctl_set_memory_region: "
			       "failed to munmap memory\n");
	}

	spin_lock(&kvm->mmu_lock);
	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
		!vcpu->arch.apf.halted)
		|| !list_empty_careful(&vcpu->async_pf.done)
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending ||
		(kvm_arch_interrupt_allowed(vcpu) &&
		 kvm_cpu_has_interrupt(vcpu));
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
++vcpu->stat.halt_wakeup; 6156 } 6157 6158 me = get_cpu(); 6159 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6160 if (atomic_xchg(&vcpu->guest_mode, 0)) 6161 smp_send_reschedule(cpu); 6162 put_cpu(); 6163 } 6164 6165 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 6166 { 6167 return kvm_x86_ops->interrupt_allowed(vcpu); 6168 } 6169 6170 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) 6171 { 6172 unsigned long current_rip = kvm_rip_read(vcpu) + 6173 get_segment_base(vcpu, VCPU_SREG_CS); 6174 6175 return current_rip == linear_rip; 6176 } 6177 EXPORT_SYMBOL_GPL(kvm_is_linear_rip); 6178 6179 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 6180 { 6181 unsigned long rflags; 6182 6183 rflags = kvm_x86_ops->get_rflags(vcpu); 6184 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6185 rflags &= ~X86_EFLAGS_TF; 6186 return rflags; 6187 } 6188 EXPORT_SYMBOL_GPL(kvm_get_rflags); 6189 6190 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 6191 { 6192 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 6193 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6194 rflags |= X86_EFLAGS_TF; 6195 kvm_x86_ops->set_rflags(vcpu, rflags); 6196 kvm_make_request(KVM_REQ_EVENT, vcpu); 6197 } 6198 EXPORT_SYMBOL_GPL(kvm_set_rflags); 6199 6200 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) 6201 { 6202 int r; 6203 6204 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || 6205 is_error_page(work->page)) 6206 return; 6207 6208 r = kvm_mmu_reload(vcpu); 6209 if (unlikely(r)) 6210 return; 6211 6212 if (!vcpu->arch.mmu.direct_map && 6213 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) 6214 return; 6215 6216 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); 6217 } 6218 6219 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) 6220 { 6221 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); 6222 } 6223 6224 static inline u32 kvm_async_pf_next_probe(u32 key) 6225 { 6226 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); 6227 } 6228 6229 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 6230 { 6231 u32 key = kvm_async_pf_hash_fn(gfn); 6232 6233 while (vcpu->arch.apf.gfns[key] != ~0) 6234 key = kvm_async_pf_next_probe(key); 6235 6236 vcpu->arch.apf.gfns[key] = gfn; 6237 } 6238 6239 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) 6240 { 6241 int i; 6242 u32 key = kvm_async_pf_hash_fn(gfn); 6243 6244 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && 6245 (vcpu->arch.apf.gfns[key] != gfn && 6246 vcpu->arch.apf.gfns[key] != ~0); i++) 6247 key = kvm_async_pf_next_probe(key); 6248 6249 return key; 6250 } 6251 6252 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 6253 { 6254 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; 6255 } 6256 6257 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 6258 { 6259 u32 i, j, k; 6260 6261 i = j = kvm_async_pf_gfn_slot(vcpu, gfn); 6262 while (true) { 6263 vcpu->arch.apf.gfns[i] = ~0; 6264 do { 6265 j = kvm_async_pf_next_probe(j); 6266 if (vcpu->arch.apf.gfns[j] == ~0) 6267 return; 6268 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); 6269 /* 6270 * k lies cyclically in ]i,j] 6271 * | i.k.j | 6272 * |....j i.k.| or |.k..j i...| 6273 */ 6274 } while ((i <= j) ? 
(i < k && k <= j) : (i < k || k <= j)); 6275 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; 6276 i = j; 6277 } 6278 } 6279 6280 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) 6281 { 6282 6283 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, 6284 sizeof(val)); 6285 } 6286 6287 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, 6288 struct kvm_async_pf *work) 6289 { 6290 struct x86_exception fault; 6291 6292 trace_kvm_async_pf_not_present(work->arch.token, work->gva); 6293 kvm_add_async_pf_gfn(vcpu, work->arch.gfn); 6294 6295 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || 6296 (vcpu->arch.apf.send_user_only && 6297 kvm_x86_ops->get_cpl(vcpu) == 0)) 6298 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 6299 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { 6300 fault.vector = PF_VECTOR; 6301 fault.error_code_valid = true; 6302 fault.error_code = 0; 6303 fault.nested_page_fault = false; 6304 fault.address = work->arch.token; 6305 kvm_inject_page_fault(vcpu, &fault); 6306 } 6307 } 6308 6309 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, 6310 struct kvm_async_pf *work) 6311 { 6312 struct x86_exception fault; 6313 6314 trace_kvm_async_pf_ready(work->arch.token, work->gva); 6315 if (is_error_page(work->page)) 6316 work->arch.token = ~0; /* broadcast wakeup */ 6317 else 6318 kvm_del_async_pf_gfn(vcpu, work->arch.gfn); 6319 6320 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && 6321 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { 6322 fault.vector = PF_VECTOR; 6323 fault.error_code_valid = true; 6324 fault.error_code = 0; 6325 fault.nested_page_fault = false; 6326 fault.address = work->arch.token; 6327 kvm_inject_page_fault(vcpu, &fault); 6328 } 6329 vcpu->arch.apf.halted = false; 6330 } 6331 6332 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) 6333 { 6334 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) 6335 return true; 6336 else 6337 return !kvm_event_needs_reinjection(vcpu) && 6338 kvm_x86_ops->interrupt_allowed(vcpu); 6339 } 6340 6341 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6342 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6343 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6344 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 6345 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 6346 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 6347 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 6348 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 6349 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 6350 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 6351 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 6352 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 6353
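/*
 * Worked example (illustration only): the wrap-around test used by the
 * async-PF gfn hash deletion in kvm_del_async_pf_gfn() above, pulled out
 * into a hypothetical helper that is not part of the driver, and assuming
 * for the sake of concrete numbers a table of eight slots.
 *
 *	static bool apf_gfn_in_cyclic_range(u32 i, u32 j, u32 k)
 *	{
 *		/­* same test as the do-while condition above *­/
 *		return (i <= j) ? (i < k && k <= j) : (i < k || k <= j);
 *	}
 *
 * With the probe walk having wrapped past the end of the table (i = 6,
 * j = 1), an entry whose home hash k is 7, 0 or 1 lies in the cyclic
 * interval ]i, j], so the scan leaves it alone and keeps probing; an
 * entry with home hash 5 does not, so it is copied back into the freed
 * slot i and the scan continues from j:
 *
 *	apf_gfn_in_cyclic_range(6, 1, 7) -> true   (skip, keep probing)
 *	apf_gfn_in_cyclic_range(6, 1, 0) -> true   (skip, keep probing)
 *	apf_gfn_in_cyclic_range(6, 1, 5) -> false  (move gfns[j] into slot i)
 *
 * Keeping every entry reachable from its home slot without leaving gaps
 * is what allows kvm_async_pf_gfn_slot() to stop at the first empty slot.
 */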