/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/debugreg.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS \
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE \
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)

#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P

/* EFER defaults:
 * - enable syscall per default because it is emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
	int nr;
	struct kvm_shared_msr {
		u32 msr;
		u64 value;
	} msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	u64 current_value[KVM_NR_SHARED_MSRS];
};

static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msr *global;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);

	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		global = &shared_msrs_global.msrs[slot];
		if (global->value != locals->current_value[slot]) {
			wrmsrl(global->msr, global->value);
			locals->current_value[slot] = global->value;
		}
	}
	locals->registered = false;
	user_return_notifier_unregister(urn);
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	int cpu;
	u64 value;

	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
	shared_msrs_global.msrs[slot].msr = msr;
	rdmsrl_safe(msr, &value);
	shared_msrs_global.msrs[slot].value = value;
	for_each_online_cpu(cpu)
		per_cpu(shared_msrs, cpu).current_value[slot] = value;
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;
	struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);

	for (i = 0; i < shared_msrs_global.nr; ++i)
		locals->current_value[i] = shared_msrs_global.msrs[i].value;
}

void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);

	if (((value ^ smsr->current_value[slot]) & mask) == 0)
		return;
	smsr->current_value[slot] = value;
	wrmsrl(shared_msrs_global.msrs[slot].msr, value);
	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);

static void drop_user_return_notifiers(void *ignore)
{
	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);

	if (smsr->registered)
		kvm_on_user_return(&smsr->urn);
}

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	kvm_get_gdt(&gdt);
	table_base = gdt.base;

	if (selector & 4) {	/* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;

	if (vcpu->arch.exception.pending) {
		switch (vcpu->arch.exception.nr) {
		case DF_VECTOR:
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
			return;
		case PF_VECTOR:
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
			return;
		default:
			/* replace previous exception with a new one in the
			   hope that instruction re-execution will regenerate
			   the lost exception */
			vcpu->arch.exception.pending = false;
			break;
		}
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

/*
 * Load the pae pdptrs.  Return true if they are all valid.
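 * (A PDPTE is treated as valid here when it is either not present or has
 *  no reserved bits set.)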
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
				  offset * sizeof(u64), sizeof(pdpte));
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
	bool changed = true;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;

			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;

	kvm_mmu_reset_context(vcpu);
	return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = vcpu->arch.cr4;
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
		vcpu->arch.cr3 = cr3;
		vcpu->arch.mmu.new_cr3(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
		return;
	}
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static inline u32 bit(int bitno)
{
	return 1 << (bitno & 31);
}

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in the beginning of the list.
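 * (The first KVM_SAVE_MSRS_BEGIN entries below are the kvm-specific ones.)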
 */

#define KVM_SAVE_MSRS_BEGIN	2
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	static int version;
	struct pvclock_wall_clock wc;
	struct timespec now, sys, boot;

	if (!wall_clock)
		return;

	version++;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_write_guest_time below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
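	 * (boot time is computed below as wall clock time minus monotonic
	 *  time since boot)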
	 */
	now = current_kernel_time();
	ktime_get_ts(&sys);
	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));

	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}

static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	uint32_t quotient, remainder;

	/* Don't try to replace with do_div(), this one calculates
	 * "(dividend << 32) / divisor" */
	__asm__ ( "divl %4"
		  : "=a" (quotient), "=d" (remainder)
		  : "0" (0), "1" (dividend), "r" (divisor) );
	return quotient;
}

static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
	uint64_t nsecs = 1000000000LL;
	int32_t  shift = 0;
	uint64_t tps64;
	uint32_t tps32;

	tps64 = tsc_khz * 1000LL;
	while (tps64 > nsecs*2) {
		tps64 >>= 1;
		shift--;
	}

	tps32 = (uint32_t)tps64;
	while (tps32 <= (uint32_t)nsecs) {
		tps32 <<= 1;
		shift++;
	}

	hv_clock->tsc_shift = shift;
	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);

	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
		 __func__, tsc_khz, hv_clock->tsc_shift,
		 hv_clock->tsc_to_system_mul);
}

static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);

static void kvm_write_guest_time(struct kvm_vcpu *v)
{
	struct timespec ts;
	unsigned long flags;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;
	unsigned long this_tsc_khz;

	if ((!vcpu->time_page))
		return;

	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
		vcpu->hv_clock_tsc_khz = this_tsc_khz;
	}
	put_cpu_var(cpu_tsc_khz);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
	ktime_get_ts(&ts);
	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	vcpu->hv_clock.system_time = ts.tv_nsec +
			(NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;

	/*
	 * The interface expects us to write an even number signaling that the
	 * update is finished. Since the guest won't see the intermediate
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;

	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
	       sizeof(vcpu->hv_clock));

	kunmap_atomic(shared_kaddr, KM_USER0);

	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}

static int kvm_request_guest_time_update(struct kvm_vcpu *v)
{
	struct kvm_vcpu_arch *vcpu = &v->arch;

	if (!vcpu->time_page)
		return 0;
	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
	return 1;
}

static bool msr_mtrr_valid(unsigned msr)
{
	switch (msr) {
	case 0x200 ...
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 803 case MSR_MTRRfix64K_00000: 804 case MSR_MTRRfix16K_80000: 805 case MSR_MTRRfix16K_A0000: 806 case MSR_MTRRfix4K_C0000: 807 case MSR_MTRRfix4K_C8000: 808 case MSR_MTRRfix4K_D0000: 809 case MSR_MTRRfix4K_D8000: 810 case MSR_MTRRfix4K_E0000: 811 case MSR_MTRRfix4K_E8000: 812 case MSR_MTRRfix4K_F0000: 813 case MSR_MTRRfix4K_F8000: 814 case MSR_MTRRdefType: 815 case MSR_IA32_CR_PAT: 816 return true; 817 case 0x2f8: 818 return true; 819 } 820 return false; 821 } 822 823 static bool valid_pat_type(unsigned t) 824 { 825 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 826 } 827 828 static bool valid_mtrr_type(unsigned t) 829 { 830 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 831 } 832 833 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 834 { 835 int i; 836 837 if (!msr_mtrr_valid(msr)) 838 return false; 839 840 if (msr == MSR_IA32_CR_PAT) { 841 for (i = 0; i < 8; i++) 842 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 843 return false; 844 return true; 845 } else if (msr == MSR_MTRRdefType) { 846 if (data & ~0xcff) 847 return false; 848 return valid_mtrr_type(data & 0xff); 849 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 850 for (i = 0; i < 8 ; i++) 851 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 852 return false; 853 return true; 854 } 855 856 /* variable MTRRs */ 857 return valid_mtrr_type(data & 0xff); 858 } 859 860 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 861 { 862 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 863 864 if (!mtrr_valid(vcpu, msr, data)) 865 return 1; 866 867 if (msr == MSR_MTRRdefType) { 868 vcpu->arch.mtrr_state.def_type = data; 869 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 870 } else if (msr == MSR_MTRRfix64K_00000) 871 p[0] = data; 872 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 873 p[1 + msr - MSR_MTRRfix16K_80000] = data; 874 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 875 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 876 else if (msr == MSR_IA32_CR_PAT) 877 vcpu->arch.pat = data; 878 else { /* Variable MTRRs */ 879 int idx, is_mtrr_mask; 880 u64 *pt; 881 882 idx = (msr - 0x200) / 2; 883 is_mtrr_mask = msr - 0x200 - 2 * idx; 884 if (!is_mtrr_mask) 885 pt = 886 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 887 else 888 pt = 889 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 890 *pt = data; 891 } 892 893 kvm_mmu_reset_context(vcpu); 894 return 0; 895 } 896 897 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 898 { 899 u64 mcg_cap = vcpu->arch.mcg_cap; 900 unsigned bank_num = mcg_cap & 0xff; 901 902 switch (msr) { 903 case MSR_IA32_MCG_STATUS: 904 vcpu->arch.mcg_status = data; 905 break; 906 case MSR_IA32_MCG_CTL: 907 if (!(mcg_cap & MCG_CTL_P)) 908 return 1; 909 if (data != 0 && data != ~(u64)0) 910 return -1; 911 vcpu->arch.mcg_ctl = data; 912 break; 913 default: 914 if (msr >= MSR_IA32_MC0_CTL && 915 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 916 u32 offset = msr - MSR_IA32_MC0_CTL; 917 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 918 if ((offset & 0x3) == 0 && 919 data != 0 && data != ~(u64)0) 920 return -1; 921 vcpu->arch.mce_banks[offset] = data; 922 break; 923 } 924 return 1; 925 } 926 return 0; 927 } 928 929 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 930 { 931 struct kvm *kvm = vcpu->kvm; 932 int lm = is_long_mode(vcpu); 933 u8 *blob_addr = lm ? 
(u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 934 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 935 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 936 : kvm->arch.xen_hvm_config.blob_size_32; 937 u32 page_num = data & ~PAGE_MASK; 938 u64 page_addr = data & PAGE_MASK; 939 u8 *page; 940 int r; 941 942 r = -E2BIG; 943 if (page_num >= blob_size) 944 goto out; 945 r = -ENOMEM; 946 page = kzalloc(PAGE_SIZE, GFP_KERNEL); 947 if (!page) 948 goto out; 949 r = -EFAULT; 950 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) 951 goto out_free; 952 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 953 goto out_free; 954 r = 0; 955 out_free: 956 kfree(page); 957 out: 958 return r; 959 } 960 961 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 962 { 963 switch (msr) { 964 case MSR_EFER: 965 set_efer(vcpu, data); 966 break; 967 case MSR_K7_HWCR: 968 data &= ~(u64)0x40; /* ignore flush filter disable */ 969 if (data != 0) { 970 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 971 data); 972 return 1; 973 } 974 break; 975 case MSR_FAM10H_MMIO_CONF_BASE: 976 if (data != 0) { 977 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 978 "0x%llx\n", data); 979 return 1; 980 } 981 break; 982 case MSR_AMD64_NB_CFG: 983 break; 984 case MSR_IA32_DEBUGCTLMSR: 985 if (!data) { 986 /* We support the non-activated case already */ 987 break; 988 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 989 /* Values other than LBR and BTF are vendor-specific, 990 thus reserved and should throw a #GP */ 991 return 1; 992 } 993 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 994 __func__, data); 995 break; 996 case MSR_IA32_UCODE_REV: 997 case MSR_IA32_UCODE_WRITE: 998 case MSR_VM_HSAVE_PA: 999 case MSR_AMD64_PATCH_LOADER: 1000 break; 1001 case 0x200 ... 0x2ff: 1002 return set_msr_mtrr(vcpu, msr, data); 1003 case MSR_IA32_APICBASE: 1004 kvm_set_apic_base(vcpu, data); 1005 break; 1006 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1007 return kvm_x2apic_msr_write(vcpu, msr, data); 1008 case MSR_IA32_MISC_ENABLE: 1009 vcpu->arch.ia32_misc_enable_msr = data; 1010 break; 1011 case MSR_KVM_WALL_CLOCK: 1012 vcpu->kvm->arch.wall_clock = data; 1013 kvm_write_wall_clock(vcpu->kvm, data); 1014 break; 1015 case MSR_KVM_SYSTEM_TIME: { 1016 if (vcpu->arch.time_page) { 1017 kvm_release_page_dirty(vcpu->arch.time_page); 1018 vcpu->arch.time_page = NULL; 1019 } 1020 1021 vcpu->arch.time = data; 1022 1023 /* we verify if the enable bit is set... */ 1024 if (!(data & 1)) 1025 break; 1026 1027 /* ...but clean it before doing the actual write */ 1028 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 1029 1030 vcpu->arch.time_page = 1031 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1032 1033 if (is_error_page(vcpu->arch.time_page)) { 1034 kvm_release_page_clean(vcpu->arch.time_page); 1035 vcpu->arch.time_page = NULL; 1036 } 1037 1038 kvm_request_guest_time_update(vcpu); 1039 break; 1040 } 1041 case MSR_IA32_MCG_CTL: 1042 case MSR_IA32_MCG_STATUS: 1043 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1044 return set_msr_mce(vcpu, msr, data); 1045 1046 /* Performance counters are not protected by a CPUID bit, 1047 * so we should check all of them in the generic path for the sake of 1048 * cross vendor migration. 1049 * Writing a zero into the event select MSRs disables them, 1050 * which we perfectly emulate ;-). Any other value should be at least 1051 * reported, some guests depend on them. 
1052 */ 1053 case MSR_P6_EVNTSEL0: 1054 case MSR_P6_EVNTSEL1: 1055 case MSR_K7_EVNTSEL0: 1056 case MSR_K7_EVNTSEL1: 1057 case MSR_K7_EVNTSEL2: 1058 case MSR_K7_EVNTSEL3: 1059 if (data != 0) 1060 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1061 "0x%x data 0x%llx\n", msr, data); 1062 break; 1063 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1064 * so we ignore writes to make it happy. 1065 */ 1066 case MSR_P6_PERFCTR0: 1067 case MSR_P6_PERFCTR1: 1068 case MSR_K7_PERFCTR0: 1069 case MSR_K7_PERFCTR1: 1070 case MSR_K7_PERFCTR2: 1071 case MSR_K7_PERFCTR3: 1072 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1073 "0x%x data 0x%llx\n", msr, data); 1074 break; 1075 default: 1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1077 return xen_hvm_config(vcpu, data); 1078 if (!ignore_msrs) { 1079 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1080 msr, data); 1081 return 1; 1082 } else { 1083 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1084 msr, data); 1085 break; 1086 } 1087 } 1088 return 0; 1089 } 1090 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 1091 1092 1093 /* 1094 * Reads an msr value (of 'msr_index') into 'pdata'. 1095 * Returns 0 on success, non-0 otherwise. 1096 * Assumes vcpu_load() was already called. 1097 */ 1098 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1099 { 1100 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 1101 } 1102 1103 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1104 { 1105 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1106 1107 if (!msr_mtrr_valid(msr)) 1108 return 1; 1109 1110 if (msr == MSR_MTRRdefType) 1111 *pdata = vcpu->arch.mtrr_state.def_type + 1112 (vcpu->arch.mtrr_state.enabled << 10); 1113 else if (msr == MSR_MTRRfix64K_00000) 1114 *pdata = p[0]; 1115 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1116 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 1117 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1118 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 1119 else if (msr == MSR_IA32_CR_PAT) 1120 *pdata = vcpu->arch.pat; 1121 else { /* Variable MTRRs */ 1122 int idx, is_mtrr_mask; 1123 u64 *pt; 1124 1125 idx = (msr - 0x200) / 2; 1126 is_mtrr_mask = msr - 0x200 - 2 * idx; 1127 if (!is_mtrr_mask) 1128 pt = 1129 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1130 else 1131 pt = 1132 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1133 *pdata = *pt; 1134 } 1135 1136 return 0; 1137 } 1138 1139 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1140 { 1141 u64 data; 1142 u64 mcg_cap = vcpu->arch.mcg_cap; 1143 unsigned bank_num = mcg_cap & 0xff; 1144 1145 switch (msr) { 1146 case MSR_IA32_P5_MC_ADDR: 1147 case MSR_IA32_P5_MC_TYPE: 1148 data = 0; 1149 break; 1150 case MSR_IA32_MCG_CAP: 1151 data = vcpu->arch.mcg_cap; 1152 break; 1153 case MSR_IA32_MCG_CTL: 1154 if (!(mcg_cap & MCG_CTL_P)) 1155 return 1; 1156 data = vcpu->arch.mcg_ctl; 1157 break; 1158 case MSR_IA32_MCG_STATUS: 1159 data = vcpu->arch.mcg_status; 1160 break; 1161 default: 1162 if (msr >= MSR_IA32_MC0_CTL && 1163 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1164 u32 offset = msr - MSR_IA32_MC0_CTL; 1165 data = vcpu->arch.mce_banks[offset]; 1166 break; 1167 } 1168 return 1; 1169 } 1170 *pdata = data; 1171 return 0; 1172 } 1173 1174 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1175 { 1176 u64 data; 1177 1178 switch (msr) { 1179 case MSR_IA32_PLATFORM_ID: 1180 case MSR_IA32_UCODE_REV: 1181 case MSR_IA32_EBL_CR_POWERON: 1182 case 
MSR_IA32_DEBUGCTLMSR: 1183 case MSR_IA32_LASTBRANCHFROMIP: 1184 case MSR_IA32_LASTBRANCHTOIP: 1185 case MSR_IA32_LASTINTFROMIP: 1186 case MSR_IA32_LASTINTTOIP: 1187 case MSR_K8_SYSCFG: 1188 case MSR_K7_HWCR: 1189 case MSR_VM_HSAVE_PA: 1190 case MSR_P6_PERFCTR0: 1191 case MSR_P6_PERFCTR1: 1192 case MSR_P6_EVNTSEL0: 1193 case MSR_P6_EVNTSEL1: 1194 case MSR_K7_EVNTSEL0: 1195 case MSR_K7_PERFCTR0: 1196 case MSR_K8_INT_PENDING_MSG: 1197 case MSR_AMD64_NB_CFG: 1198 case MSR_FAM10H_MMIO_CONF_BASE: 1199 data = 0; 1200 break; 1201 case MSR_MTRRcap: 1202 data = 0x500 | KVM_NR_VAR_MTRR; 1203 break; 1204 case 0x200 ... 0x2ff: 1205 return get_msr_mtrr(vcpu, msr, pdata); 1206 case 0xcd: /* fsb frequency */ 1207 data = 3; 1208 break; 1209 case MSR_IA32_APICBASE: 1210 data = kvm_get_apic_base(vcpu); 1211 break; 1212 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1213 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1214 break; 1215 case MSR_IA32_MISC_ENABLE: 1216 data = vcpu->arch.ia32_misc_enable_msr; 1217 break; 1218 case MSR_IA32_PERF_STATUS: 1219 /* TSC increment by tick */ 1220 data = 1000ULL; 1221 /* CPU multiplier */ 1222 data |= (((uint64_t)4ULL) << 40); 1223 break; 1224 case MSR_EFER: 1225 data = vcpu->arch.shadow_efer; 1226 break; 1227 case MSR_KVM_WALL_CLOCK: 1228 data = vcpu->kvm->arch.wall_clock; 1229 break; 1230 case MSR_KVM_SYSTEM_TIME: 1231 data = vcpu->arch.time; 1232 break; 1233 case MSR_IA32_P5_MC_ADDR: 1234 case MSR_IA32_P5_MC_TYPE: 1235 case MSR_IA32_MCG_CAP: 1236 case MSR_IA32_MCG_CTL: 1237 case MSR_IA32_MCG_STATUS: 1238 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1239 return get_msr_mce(vcpu, msr, pdata); 1240 default: 1241 if (!ignore_msrs) { 1242 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1243 return 1; 1244 } else { 1245 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 1246 data = 0; 1247 } 1248 break; 1249 } 1250 *pdata = data; 1251 return 0; 1252 } 1253 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1254 1255 /* 1256 * Read or write a bunch of msrs. All parameters are kernel addresses. 1257 * 1258 * @return number of msrs set successfully. 1259 */ 1260 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 1261 struct kvm_msr_entry *entries, 1262 int (*do_msr)(struct kvm_vcpu *vcpu, 1263 unsigned index, u64 *data)) 1264 { 1265 int i; 1266 1267 vcpu_load(vcpu); 1268 1269 down_read(&vcpu->kvm->slots_lock); 1270 for (i = 0; i < msrs->nmsrs; ++i) 1271 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1272 break; 1273 up_read(&vcpu->kvm->slots_lock); 1274 1275 vcpu_put(vcpu); 1276 1277 return i; 1278 } 1279 1280 /* 1281 * Read or write a bunch of msrs. Parameters are user addresses. 1282 * 1283 * @return number of msrs set successfully. 
1284 */ 1285 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 1286 int (*do_msr)(struct kvm_vcpu *vcpu, 1287 unsigned index, u64 *data), 1288 int writeback) 1289 { 1290 struct kvm_msrs msrs; 1291 struct kvm_msr_entry *entries; 1292 int r, n; 1293 unsigned size; 1294 1295 r = -EFAULT; 1296 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 1297 goto out; 1298 1299 r = -E2BIG; 1300 if (msrs.nmsrs >= MAX_IO_MSRS) 1301 goto out; 1302 1303 r = -ENOMEM; 1304 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1305 entries = vmalloc(size); 1306 if (!entries) 1307 goto out; 1308 1309 r = -EFAULT; 1310 if (copy_from_user(entries, user_msrs->entries, size)) 1311 goto out_free; 1312 1313 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1314 if (r < 0) 1315 goto out_free; 1316 1317 r = -EFAULT; 1318 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1319 goto out_free; 1320 1321 r = n; 1322 1323 out_free: 1324 vfree(entries); 1325 out: 1326 return r; 1327 } 1328 1329 int kvm_dev_ioctl_check_extension(long ext) 1330 { 1331 int r; 1332 1333 switch (ext) { 1334 case KVM_CAP_IRQCHIP: 1335 case KVM_CAP_HLT: 1336 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1337 case KVM_CAP_SET_TSS_ADDR: 1338 case KVM_CAP_EXT_CPUID: 1339 case KVM_CAP_CLOCKSOURCE: 1340 case KVM_CAP_PIT: 1341 case KVM_CAP_NOP_IO_DELAY: 1342 case KVM_CAP_MP_STATE: 1343 case KVM_CAP_SYNC_MMU: 1344 case KVM_CAP_REINJECT_CONTROL: 1345 case KVM_CAP_IRQ_INJECT_STATUS: 1346 case KVM_CAP_ASSIGN_DEV_IRQ: 1347 case KVM_CAP_IRQFD: 1348 case KVM_CAP_IOEVENTFD: 1349 case KVM_CAP_PIT2: 1350 case KVM_CAP_PIT_STATE2: 1351 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1352 case KVM_CAP_XEN_HVM: 1353 case KVM_CAP_ADJUST_CLOCK: 1354 case KVM_CAP_VCPU_EVENTS: 1355 r = 1; 1356 break; 1357 case KVM_CAP_COALESCED_MMIO: 1358 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1359 break; 1360 case KVM_CAP_VAPIC: 1361 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1362 break; 1363 case KVM_CAP_NR_VCPUS: 1364 r = KVM_MAX_VCPUS; 1365 break; 1366 case KVM_CAP_NR_MEMSLOTS: 1367 r = KVM_MEMORY_SLOTS; 1368 break; 1369 case KVM_CAP_PV_MMU: /* obsolete */ 1370 r = 0; 1371 break; 1372 case KVM_CAP_IOMMU: 1373 r = iommu_found(); 1374 break; 1375 case KVM_CAP_MCE: 1376 r = KVM_MAX_MCE_BANKS; 1377 break; 1378 default: 1379 r = 0; 1380 break; 1381 } 1382 return r; 1383 1384 } 1385 1386 long kvm_arch_dev_ioctl(struct file *filp, 1387 unsigned int ioctl, unsigned long arg) 1388 { 1389 void __user *argp = (void __user *)arg; 1390 long r; 1391 1392 switch (ioctl) { 1393 case KVM_GET_MSR_INDEX_LIST: { 1394 struct kvm_msr_list __user *user_msr_list = argp; 1395 struct kvm_msr_list msr_list; 1396 unsigned n; 1397 1398 r = -EFAULT; 1399 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1400 goto out; 1401 n = msr_list.nmsrs; 1402 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1403 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1404 goto out; 1405 r = -E2BIG; 1406 if (n < msr_list.nmsrs) 1407 goto out; 1408 r = -EFAULT; 1409 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1410 num_msrs_to_save * sizeof(u32))) 1411 goto out; 1412 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 1413 &emulated_msrs, 1414 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1415 goto out; 1416 r = 0; 1417 break; 1418 } 1419 case KVM_GET_SUPPORTED_CPUID: { 1420 struct kvm_cpuid2 __user *cpuid_arg = argp; 1421 struct kvm_cpuid2 cpuid; 1422 1423 r = -EFAULT; 1424 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1425 goto out; 1426 r = 
kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1427 cpuid_arg->entries); 1428 if (r) 1429 goto out; 1430 1431 r = -EFAULT; 1432 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1433 goto out; 1434 r = 0; 1435 break; 1436 } 1437 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 1438 u64 mce_cap; 1439 1440 mce_cap = KVM_MCE_CAP_SUPPORTED; 1441 r = -EFAULT; 1442 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 1443 goto out; 1444 r = 0; 1445 break; 1446 } 1447 default: 1448 r = -EINVAL; 1449 } 1450 out: 1451 return r; 1452 } 1453 1454 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1455 { 1456 kvm_x86_ops->vcpu_load(vcpu, cpu); 1457 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1458 unsigned long khz = cpufreq_quick_get(cpu); 1459 if (!khz) 1460 khz = tsc_khz; 1461 per_cpu(cpu_tsc_khz, cpu) = khz; 1462 } 1463 kvm_request_guest_time_update(vcpu); 1464 } 1465 1466 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1467 { 1468 kvm_x86_ops->vcpu_put(vcpu); 1469 kvm_put_guest_fpu(vcpu); 1470 } 1471 1472 static int is_efer_nx(void) 1473 { 1474 unsigned long long efer = 0; 1475 1476 rdmsrl_safe(MSR_EFER, &efer); 1477 return efer & EFER_NX; 1478 } 1479 1480 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1481 { 1482 int i; 1483 struct kvm_cpuid_entry2 *e, *entry; 1484 1485 entry = NULL; 1486 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1487 e = &vcpu->arch.cpuid_entries[i]; 1488 if (e->function == 0x80000001) { 1489 entry = e; 1490 break; 1491 } 1492 } 1493 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1494 entry->edx &= ~(1 << 20); 1495 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1496 } 1497 } 1498 1499 /* when an old userspace process fills a new kernel module */ 1500 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1501 struct kvm_cpuid *cpuid, 1502 struct kvm_cpuid_entry __user *entries) 1503 { 1504 int r, i; 1505 struct kvm_cpuid_entry *cpuid_entries; 1506 1507 r = -E2BIG; 1508 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1509 goto out; 1510 r = -ENOMEM; 1511 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1512 if (!cpuid_entries) 1513 goto out; 1514 r = -EFAULT; 1515 if (copy_from_user(cpuid_entries, entries, 1516 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1517 goto out_free; 1518 for (i = 0; i < cpuid->nent; i++) { 1519 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1520 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1521 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1522 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1523 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1524 vcpu->arch.cpuid_entries[i].index = 0; 1525 vcpu->arch.cpuid_entries[i].flags = 0; 1526 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1527 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1528 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1529 } 1530 vcpu->arch.cpuid_nent = cpuid->nent; 1531 cpuid_fix_nx_cap(vcpu); 1532 r = 0; 1533 kvm_apic_set_version(vcpu); 1534 1535 out_free: 1536 vfree(cpuid_entries); 1537 out: 1538 return r; 1539 } 1540 1541 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1542 struct kvm_cpuid2 *cpuid, 1543 struct kvm_cpuid_entry2 __user *entries) 1544 { 1545 int r; 1546 1547 r = -E2BIG; 1548 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1549 goto out; 1550 r = -EFAULT; 1551 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1552 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1553 goto out; 1554 vcpu->arch.cpuid_nent = cpuid->nent; 1555 kvm_apic_set_version(vcpu); 1556 return 0; 1557 1558 
out: 1559 return r; 1560 } 1561 1562 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1563 struct kvm_cpuid2 *cpuid, 1564 struct kvm_cpuid_entry2 __user *entries) 1565 { 1566 int r; 1567 1568 r = -E2BIG; 1569 if (cpuid->nent < vcpu->arch.cpuid_nent) 1570 goto out; 1571 r = -EFAULT; 1572 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1573 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1574 goto out; 1575 return 0; 1576 1577 out: 1578 cpuid->nent = vcpu->arch.cpuid_nent; 1579 return r; 1580 } 1581 1582 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1583 u32 index) 1584 { 1585 entry->function = function; 1586 entry->index = index; 1587 cpuid_count(entry->function, entry->index, 1588 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1589 entry->flags = 0; 1590 } 1591 1592 #define F(x) bit(X86_FEATURE_##x) 1593 1594 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1595 u32 index, int *nent, int maxnent) 1596 { 1597 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1598 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; 1599 #ifdef CONFIG_X86_64 1600 unsigned f_lm = F(LM); 1601 #else 1602 unsigned f_lm = 0; 1603 #endif 1604 1605 /* cpuid 1.edx */ 1606 const u32 kvm_supported_word0_x86_features = 1607 F(FPU) | F(VME) | F(DE) | F(PSE) | 1608 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1609 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 1610 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1611 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 1612 0 /* Reserved, DS, ACPI */ | F(MMX) | 1613 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 1614 0 /* HTT, TM, Reserved, PBE */; 1615 /* cpuid 0x80000001.edx */ 1616 const u32 kvm_supported_word1_x86_features = 1617 F(FPU) | F(VME) | F(DE) | F(PSE) | 1618 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1619 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 1620 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1621 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1622 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1623 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1624 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1625 /* cpuid 1.ecx */ 1626 const u32 kvm_supported_word4_x86_features = 1627 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1628 0 /* DS-CPL, VMX, SMX, EST */ | 1629 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1630 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1631 0 /* Reserved, DCA */ | F(XMM4_1) | 1632 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1633 0 /* Reserved, XSAVE, OSXSAVE */; 1634 /* cpuid 0x80000001.ecx */ 1635 const u32 kvm_supported_word6_x86_features = 1636 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1637 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1638 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 1639 0 /* SKINIT */ | 0 /* WDT */; 1640 1641 /* all calls to cpuid_count() should be made on the same cpu */ 1642 get_cpu(); 1643 do_cpuid_1_ent(entry, function, index); 1644 ++*nent; 1645 1646 switch (function) { 1647 case 0: 1648 entry->eax = min(entry->eax, (u32)0xb); 1649 break; 1650 case 1: 1651 entry->edx &= kvm_supported_word0_x86_features; 1652 entry->ecx &= kvm_supported_word4_x86_features; 1653 /* we support x2apic emulation even if host does not support 1654 * it since we emulate x2apic in software */ 1655 entry->ecx |= F(X2APIC); 1656 break; 1657 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1658 * may return different values. 
This forces us to get_cpu() before 1659 * issuing the first command, and also to emulate this annoying behavior 1660 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1661 case 2: { 1662 int t, times = entry->eax & 0xff; 1663 1664 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1665 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1666 for (t = 1; t < times && *nent < maxnent; ++t) { 1667 do_cpuid_1_ent(&entry[t], function, 0); 1668 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1669 ++*nent; 1670 } 1671 break; 1672 } 1673 /* function 4 and 0xb have additional index. */ 1674 case 4: { 1675 int i, cache_type; 1676 1677 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1678 /* read more entries until cache_type is zero */ 1679 for (i = 1; *nent < maxnent; ++i) { 1680 cache_type = entry[i - 1].eax & 0x1f; 1681 if (!cache_type) 1682 break; 1683 do_cpuid_1_ent(&entry[i], function, i); 1684 entry[i].flags |= 1685 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1686 ++*nent; 1687 } 1688 break; 1689 } 1690 case 0xb: { 1691 int i, level_type; 1692 1693 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1694 /* read more entries until level_type is zero */ 1695 for (i = 1; *nent < maxnent; ++i) { 1696 level_type = entry[i - 1].ecx & 0xff00; 1697 if (!level_type) 1698 break; 1699 do_cpuid_1_ent(&entry[i], function, i); 1700 entry[i].flags |= 1701 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1702 ++*nent; 1703 } 1704 break; 1705 } 1706 case 0x80000000: 1707 entry->eax = min(entry->eax, 0x8000001a); 1708 break; 1709 case 0x80000001: 1710 entry->edx &= kvm_supported_word1_x86_features; 1711 entry->ecx &= kvm_supported_word6_x86_features; 1712 break; 1713 } 1714 put_cpu(); 1715 } 1716 1717 #undef F 1718 1719 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1720 struct kvm_cpuid_entry2 __user *entries) 1721 { 1722 struct kvm_cpuid_entry2 *cpuid_entries; 1723 int limit, nent = 0, r = -E2BIG; 1724 u32 func; 1725 1726 if (cpuid->nent < 1) 1727 goto out; 1728 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1729 cpuid->nent = KVM_MAX_CPUID_ENTRIES; 1730 r = -ENOMEM; 1731 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1732 if (!cpuid_entries) 1733 goto out; 1734 1735 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1736 limit = cpuid_entries[0].eax; 1737 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1738 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1739 &nent, cpuid->nent); 1740 r = -E2BIG; 1741 if (nent >= cpuid->nent) 1742 goto out_free; 1743 1744 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1745 limit = cpuid_entries[nent - 1].eax; 1746 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1747 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1748 &nent, cpuid->nent); 1749 r = -E2BIG; 1750 if (nent >= cpuid->nent) 1751 goto out_free; 1752 1753 r = -EFAULT; 1754 if (copy_to_user(entries, cpuid_entries, 1755 nent * sizeof(struct kvm_cpuid_entry2))) 1756 goto out_free; 1757 cpuid->nent = nent; 1758 r = 0; 1759 1760 out_free: 1761 vfree(cpuid_entries); 1762 out: 1763 return r; 1764 } 1765 1766 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1767 struct kvm_lapic_state *s) 1768 { 1769 vcpu_load(vcpu); 1770 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1771 vcpu_put(vcpu); 1772 1773 return 0; 1774 } 1775 1776 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1777 struct kvm_lapic_state *s) 1778 { 1779 vcpu_load(vcpu); 1780 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1781 
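	/* Let the in-kernel APIC rebuild the state it derives from the
	 * registers that were just copied in (e.g. its timer). */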
kvm_apic_post_state_restore(vcpu); 1782 update_cr8_intercept(vcpu); 1783 vcpu_put(vcpu); 1784 1785 return 0; 1786 } 1787 1788 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1789 struct kvm_interrupt *irq) 1790 { 1791 if (irq->irq < 0 || irq->irq >= 256) 1792 return -EINVAL; 1793 if (irqchip_in_kernel(vcpu->kvm)) 1794 return -ENXIO; 1795 vcpu_load(vcpu); 1796 1797 kvm_queue_interrupt(vcpu, irq->irq, false); 1798 1799 vcpu_put(vcpu); 1800 1801 return 0; 1802 } 1803 1804 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1805 { 1806 vcpu_load(vcpu); 1807 kvm_inject_nmi(vcpu); 1808 vcpu_put(vcpu); 1809 1810 return 0; 1811 } 1812 1813 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1814 struct kvm_tpr_access_ctl *tac) 1815 { 1816 if (tac->flags) 1817 return -EINVAL; 1818 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1819 return 0; 1820 } 1821 1822 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 1823 u64 mcg_cap) 1824 { 1825 int r; 1826 unsigned bank_num = mcg_cap & 0xff, bank; 1827 1828 r = -EINVAL; 1829 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 1830 goto out; 1831 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 1832 goto out; 1833 r = 0; 1834 vcpu->arch.mcg_cap = mcg_cap; 1835 /* Init IA32_MCG_CTL to all 1s */ 1836 if (mcg_cap & MCG_CTL_P) 1837 vcpu->arch.mcg_ctl = ~(u64)0; 1838 /* Init IA32_MCi_CTL to all 1s */ 1839 for (bank = 0; bank < bank_num; bank++) 1840 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 1841 out: 1842 return r; 1843 } 1844 1845 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 1846 struct kvm_x86_mce *mce) 1847 { 1848 u64 mcg_cap = vcpu->arch.mcg_cap; 1849 unsigned bank_num = mcg_cap & 0xff; 1850 u64 *banks = vcpu->arch.mce_banks; 1851 1852 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 1853 return -EINVAL; 1854 /* 1855 * if IA32_MCG_CTL is not all 1s, the uncorrected error 1856 * reporting is disabled 1857 */ 1858 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 1859 vcpu->arch.mcg_ctl != ~(u64)0) 1860 return 0; 1861 banks += 4 * mce->bank; 1862 /* 1863 * if IA32_MCi_CTL is not all 1s, the uncorrected error 1864 * reporting is disabled for the bank 1865 */ 1866 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 1867 return 0; 1868 if (mce->status & MCI_STATUS_UC) { 1869 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 1870 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 1871 printk(KERN_DEBUG "kvm: set_mce: " 1872 "injects mce exception while " 1873 "previous one is in progress!\n"); 1874 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 1875 return 0; 1876 } 1877 if (banks[1] & MCI_STATUS_VAL) 1878 mce->status |= MCI_STATUS_OVER; 1879 banks[2] = mce->addr; 1880 banks[3] = mce->misc; 1881 vcpu->arch.mcg_status = mce->mcg_status; 1882 banks[1] = mce->status; 1883 kvm_queue_exception(vcpu, MC_VECTOR); 1884 } else if (!(banks[1] & MCI_STATUS_VAL) 1885 || !(banks[1] & MCI_STATUS_UC)) { 1886 if (banks[1] & MCI_STATUS_VAL) 1887 mce->status |= MCI_STATUS_OVER; 1888 banks[2] = mce->addr; 1889 banks[3] = mce->misc; 1890 banks[1] = mce->status; 1891 } else 1892 banks[1] |= MCI_STATUS_OVER; 1893 return 0; 1894 } 1895 1896 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 1897 struct kvm_vcpu_events *events) 1898 { 1899 vcpu_load(vcpu); 1900 1901 events->exception.injected = vcpu->arch.exception.pending; 1902 events->exception.nr = vcpu->arch.exception.nr; 1903 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 1904 events->exception.error_code = 
vcpu->arch.exception.error_code; 1905 1906 events->interrupt.injected = vcpu->arch.interrupt.pending; 1907 events->interrupt.nr = vcpu->arch.interrupt.nr; 1908 events->interrupt.soft = vcpu->arch.interrupt.soft; 1909 1910 events->nmi.injected = vcpu->arch.nmi_injected; 1911 events->nmi.pending = vcpu->arch.nmi_pending; 1912 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 1913 1914 events->sipi_vector = vcpu->arch.sipi_vector; 1915 1916 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 1917 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 1918 1919 vcpu_put(vcpu); 1920 } 1921 1922 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 1923 struct kvm_vcpu_events *events) 1924 { 1925 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 1926 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 1927 return -EINVAL; 1928 1929 vcpu_load(vcpu); 1930 1931 vcpu->arch.exception.pending = events->exception.injected; 1932 vcpu->arch.exception.nr = events->exception.nr; 1933 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 1934 vcpu->arch.exception.error_code = events->exception.error_code; 1935 1936 vcpu->arch.interrupt.pending = events->interrupt.injected; 1937 vcpu->arch.interrupt.nr = events->interrupt.nr; 1938 vcpu->arch.interrupt.soft = events->interrupt.soft; 1939 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 1940 kvm_pic_clear_isr_ack(vcpu->kvm); 1941 1942 vcpu->arch.nmi_injected = events->nmi.injected; 1943 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 1944 vcpu->arch.nmi_pending = events->nmi.pending; 1945 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 1946 1947 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 1948 vcpu->arch.sipi_vector = events->sipi_vector; 1949 1950 vcpu_put(vcpu); 1951 1952 return 0; 1953 } 1954 1955 long kvm_arch_vcpu_ioctl(struct file *filp, 1956 unsigned int ioctl, unsigned long arg) 1957 { 1958 struct kvm_vcpu *vcpu = filp->private_data; 1959 void __user *argp = (void __user *)arg; 1960 int r; 1961 struct kvm_lapic_state *lapic = NULL; 1962 1963 switch (ioctl) { 1964 case KVM_GET_LAPIC: { 1965 r = -EINVAL; 1966 if (!vcpu->arch.apic) 1967 goto out; 1968 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1969 1970 r = -ENOMEM; 1971 if (!lapic) 1972 goto out; 1973 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1974 if (r) 1975 goto out; 1976 r = -EFAULT; 1977 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1978 goto out; 1979 r = 0; 1980 break; 1981 } 1982 case KVM_SET_LAPIC: { 1983 r = -EINVAL; 1984 if (!vcpu->arch.apic) 1985 goto out; 1986 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1987 r = -ENOMEM; 1988 if (!lapic) 1989 goto out; 1990 r = -EFAULT; 1991 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1992 goto out; 1993 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1994 if (r) 1995 goto out; 1996 r = 0; 1997 break; 1998 } 1999 case KVM_INTERRUPT: { 2000 struct kvm_interrupt irq; 2001 2002 r = -EFAULT; 2003 if (copy_from_user(&irq, argp, sizeof irq)) 2004 goto out; 2005 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 2006 if (r) 2007 goto out; 2008 r = 0; 2009 break; 2010 } 2011 case KVM_NMI: { 2012 r = kvm_vcpu_ioctl_nmi(vcpu); 2013 if (r) 2014 goto out; 2015 r = 0; 2016 break; 2017 } 2018 case KVM_SET_CPUID: { 2019 struct kvm_cpuid __user *cpuid_arg = argp; 2020 struct kvm_cpuid cpuid; 2021 2022 r = -EFAULT; 2023 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2024 goto out; 2025 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 2026 if (r) 2027 goto 
out; 2028 break; 2029 } 2030 case KVM_SET_CPUID2: { 2031 struct kvm_cpuid2 __user *cpuid_arg = argp; 2032 struct kvm_cpuid2 cpuid; 2033 2034 r = -EFAULT; 2035 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2036 goto out; 2037 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 2038 cpuid_arg->entries); 2039 if (r) 2040 goto out; 2041 break; 2042 } 2043 case KVM_GET_CPUID2: { 2044 struct kvm_cpuid2 __user *cpuid_arg = argp; 2045 struct kvm_cpuid2 cpuid; 2046 2047 r = -EFAULT; 2048 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2049 goto out; 2050 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 2051 cpuid_arg->entries); 2052 if (r) 2053 goto out; 2054 r = -EFAULT; 2055 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2056 goto out; 2057 r = 0; 2058 break; 2059 } 2060 case KVM_GET_MSRS: 2061 r = msr_io(vcpu, argp, kvm_get_msr, 1); 2062 break; 2063 case KVM_SET_MSRS: 2064 r = msr_io(vcpu, argp, do_set_msr, 0); 2065 break; 2066 case KVM_TPR_ACCESS_REPORTING: { 2067 struct kvm_tpr_access_ctl tac; 2068 2069 r = -EFAULT; 2070 if (copy_from_user(&tac, argp, sizeof tac)) 2071 goto out; 2072 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 2073 if (r) 2074 goto out; 2075 r = -EFAULT; 2076 if (copy_to_user(argp, &tac, sizeof tac)) 2077 goto out; 2078 r = 0; 2079 break; 2080 }; 2081 case KVM_SET_VAPIC_ADDR: { 2082 struct kvm_vapic_addr va; 2083 2084 r = -EINVAL; 2085 if (!irqchip_in_kernel(vcpu->kvm)) 2086 goto out; 2087 r = -EFAULT; 2088 if (copy_from_user(&va, argp, sizeof va)) 2089 goto out; 2090 r = 0; 2091 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 2092 break; 2093 } 2094 case KVM_X86_SETUP_MCE: { 2095 u64 mcg_cap; 2096 2097 r = -EFAULT; 2098 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 2099 goto out; 2100 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 2101 break; 2102 } 2103 case KVM_X86_SET_MCE: { 2104 struct kvm_x86_mce mce; 2105 2106 r = -EFAULT; 2107 if (copy_from_user(&mce, argp, sizeof mce)) 2108 goto out; 2109 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2110 break; 2111 } 2112 case KVM_GET_VCPU_EVENTS: { 2113 struct kvm_vcpu_events events; 2114 2115 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 2116 2117 r = -EFAULT; 2118 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 2119 break; 2120 r = 0; 2121 break; 2122 } 2123 case KVM_SET_VCPU_EVENTS: { 2124 struct kvm_vcpu_events events; 2125 2126 r = -EFAULT; 2127 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 2128 break; 2129 2130 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2131 break; 2132 } 2133 default: 2134 r = -EINVAL; 2135 } 2136 out: 2137 kfree(lapic); 2138 return r; 2139 } 2140 2141 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 2142 { 2143 int ret; 2144 2145 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 2146 return -1; 2147 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 2148 return ret; 2149 } 2150 2151 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 2152 u64 ident_addr) 2153 { 2154 kvm->arch.ept_identity_map_addr = ident_addr; 2155 return 0; 2156 } 2157 2158 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 2159 u32 kvm_nr_mmu_pages) 2160 { 2161 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2162 return -EINVAL; 2163 2164 down_write(&kvm->slots_lock); 2165 spin_lock(&kvm->mmu_lock); 2166 2167 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2168 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2169 2170 spin_unlock(&kvm->mmu_lock); 2171 up_write(&kvm->slots_lock); 2172 return 0; 2173 } 2174 2175 static int 
kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 2176 { 2177 return kvm->arch.n_alloc_mmu_pages; 2178 } 2179 2180 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2181 { 2182 int i; 2183 struct kvm_mem_alias *alias; 2184 2185 for (i = 0; i < kvm->arch.naliases; ++i) { 2186 alias = &kvm->arch.aliases[i]; 2187 if (gfn >= alias->base_gfn 2188 && gfn < alias->base_gfn + alias->npages) 2189 return alias->target_gfn + gfn - alias->base_gfn; 2190 } 2191 return gfn; 2192 } 2193 2194 /* 2195 * Set a new alias region. Aliases map a portion of physical memory into 2196 * another portion. This is useful for memory windows, for example the PC 2197 * VGA region. 2198 */ 2199 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 2200 struct kvm_memory_alias *alias) 2201 { 2202 int r, n; 2203 struct kvm_mem_alias *p; 2204 2205 r = -EINVAL; 2206 /* General sanity checks */ 2207 if (alias->memory_size & (PAGE_SIZE - 1)) 2208 goto out; 2209 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 2210 goto out; 2211 if (alias->slot >= KVM_ALIAS_SLOTS) 2212 goto out; 2213 if (alias->guest_phys_addr + alias->memory_size 2214 < alias->guest_phys_addr) 2215 goto out; 2216 if (alias->target_phys_addr + alias->memory_size 2217 < alias->target_phys_addr) 2218 goto out; 2219 2220 down_write(&kvm->slots_lock); 2221 spin_lock(&kvm->mmu_lock); 2222 2223 p = &kvm->arch.aliases[alias->slot]; 2224 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2225 p->npages = alias->memory_size >> PAGE_SHIFT; 2226 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2227 2228 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2229 if (kvm->arch.aliases[n - 1].npages) 2230 break; 2231 kvm->arch.naliases = n; 2232 2233 spin_unlock(&kvm->mmu_lock); 2234 kvm_mmu_zap_all(kvm); 2235 2236 up_write(&kvm->slots_lock); 2237 2238 return 0; 2239 2240 out: 2241 return r; 2242 } 2243 2244 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2245 { 2246 int r; 2247 2248 r = 0; 2249 switch (chip->chip_id) { 2250 case KVM_IRQCHIP_PIC_MASTER: 2251 memcpy(&chip->chip.pic, 2252 &pic_irqchip(kvm)->pics[0], 2253 sizeof(struct kvm_pic_state)); 2254 break; 2255 case KVM_IRQCHIP_PIC_SLAVE: 2256 memcpy(&chip->chip.pic, 2257 &pic_irqchip(kvm)->pics[1], 2258 sizeof(struct kvm_pic_state)); 2259 break; 2260 case KVM_IRQCHIP_IOAPIC: 2261 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 2262 break; 2263 default: 2264 r = -EINVAL; 2265 break; 2266 } 2267 return r; 2268 } 2269 2270 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2271 { 2272 int r; 2273 2274 r = 0; 2275 switch (chip->chip_id) { 2276 case KVM_IRQCHIP_PIC_MASTER: 2277 spin_lock(&pic_irqchip(kvm)->lock); 2278 memcpy(&pic_irqchip(kvm)->pics[0], 2279 &chip->chip.pic, 2280 sizeof(struct kvm_pic_state)); 2281 spin_unlock(&pic_irqchip(kvm)->lock); 2282 break; 2283 case KVM_IRQCHIP_PIC_SLAVE: 2284 spin_lock(&pic_irqchip(kvm)->lock); 2285 memcpy(&pic_irqchip(kvm)->pics[1], 2286 &chip->chip.pic, 2287 sizeof(struct kvm_pic_state)); 2288 spin_unlock(&pic_irqchip(kvm)->lock); 2289 break; 2290 case KVM_IRQCHIP_IOAPIC: 2291 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2292 break; 2293 default: 2294 r = -EINVAL; 2295 break; 2296 } 2297 kvm_pic_update_irq(pic_irqchip(kvm)); 2298 return r; 2299 } 2300 2301 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2302 { 2303 int r = 0; 2304 2305 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2306 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 2307 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2308 
return r; 2309 } 2310 2311 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2312 { 2313 int r = 0; 2314 2315 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2316 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2317 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 2318 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2319 return r; 2320 } 2321 2322 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2323 { 2324 int r = 0; 2325 2326 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2327 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 2328 sizeof(ps->channels)); 2329 ps->flags = kvm->arch.vpit->pit_state.flags; 2330 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2331 return r; 2332 } 2333 2334 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2335 { 2336 int r = 0, start = 0; 2337 u32 prev_legacy, cur_legacy; 2338 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2339 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 2340 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 2341 if (!prev_legacy && cur_legacy) 2342 start = 1; 2343 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 2344 sizeof(kvm->arch.vpit->pit_state.channels)); 2345 kvm->arch.vpit->pit_state.flags = ps->flags; 2346 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 2347 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2348 return r; 2349 } 2350 2351 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 2352 struct kvm_reinject_control *control) 2353 { 2354 if (!kvm->arch.vpit) 2355 return -ENXIO; 2356 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2357 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2358 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2359 return 0; 2360 } 2361 2362 /* 2363 * Get (and clear) the dirty memory log for a memory slot. 2364 */ 2365 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2366 struct kvm_dirty_log *log) 2367 { 2368 int r; 2369 int n; 2370 struct kvm_memory_slot *memslot; 2371 int is_dirty = 0; 2372 2373 down_write(&kvm->slots_lock); 2374 2375 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2376 if (r) 2377 goto out; 2378 2379 /* If nothing is dirty, don't bother messing with page tables. */ 2380 if (is_dirty) { 2381 spin_lock(&kvm->mmu_lock); 2382 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2383 spin_unlock(&kvm->mmu_lock); 2384 memslot = &kvm->memslots[log->slot]; 2385 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2386 memset(memslot->dirty_bitmap, 0, n); 2387 } 2388 r = 0; 2389 out: 2390 up_write(&kvm->slots_lock); 2391 return r; 2392 } 2393 2394 long kvm_arch_vm_ioctl(struct file *filp, 2395 unsigned int ioctl, unsigned long arg) 2396 { 2397 struct kvm *kvm = filp->private_data; 2398 void __user *argp = (void __user *)arg; 2399 int r = -ENOTTY; 2400 /* 2401 * This union makes it completely explicit to gcc-3.x 2402 * that these two variables' stack usage should be 2403 * combined, not added together. 
2404 */ 2405 union { 2406 struct kvm_pit_state ps; 2407 struct kvm_pit_state2 ps2; 2408 struct kvm_memory_alias alias; 2409 struct kvm_pit_config pit_config; 2410 } u; 2411 2412 switch (ioctl) { 2413 case KVM_SET_TSS_ADDR: 2414 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 2415 if (r < 0) 2416 goto out; 2417 break; 2418 case KVM_SET_IDENTITY_MAP_ADDR: { 2419 u64 ident_addr; 2420 2421 r = -EFAULT; 2422 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 2423 goto out; 2424 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 2425 if (r < 0) 2426 goto out; 2427 break; 2428 } 2429 case KVM_SET_MEMORY_REGION: { 2430 struct kvm_memory_region kvm_mem; 2431 struct kvm_userspace_memory_region kvm_userspace_mem; 2432 2433 r = -EFAULT; 2434 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2435 goto out; 2436 kvm_userspace_mem.slot = kvm_mem.slot; 2437 kvm_userspace_mem.flags = kvm_mem.flags; 2438 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 2439 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 2440 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 2441 if (r) 2442 goto out; 2443 break; 2444 } 2445 case KVM_SET_NR_MMU_PAGES: 2446 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2447 if (r) 2448 goto out; 2449 break; 2450 case KVM_GET_NR_MMU_PAGES: 2451 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2452 break; 2453 case KVM_SET_MEMORY_ALIAS: 2454 r = -EFAULT; 2455 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 2456 goto out; 2457 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 2458 if (r) 2459 goto out; 2460 break; 2461 case KVM_CREATE_IRQCHIP: { 2462 struct kvm_pic *vpic; 2463 2464 mutex_lock(&kvm->lock); 2465 r = -EEXIST; 2466 if (kvm->arch.vpic) 2467 goto create_irqchip_unlock; 2468 r = -ENOMEM; 2469 vpic = kvm_create_pic(kvm); 2470 if (vpic) { 2471 r = kvm_ioapic_init(kvm); 2472 if (r) { 2473 kfree(vpic); 2474 goto create_irqchip_unlock; 2475 } 2476 } else 2477 goto create_irqchip_unlock; 2478 smp_wmb(); 2479 kvm->arch.vpic = vpic; 2480 smp_wmb(); 2481 r = kvm_setup_default_irq_routing(kvm); 2482 if (r) { 2483 mutex_lock(&kvm->irq_lock); 2484 kfree(kvm->arch.vpic); 2485 kfree(kvm->arch.vioapic); 2486 kvm->arch.vpic = NULL; 2487 kvm->arch.vioapic = NULL; 2488 mutex_unlock(&kvm->irq_lock); 2489 } 2490 create_irqchip_unlock: 2491 mutex_unlock(&kvm->lock); 2492 break; 2493 } 2494 case KVM_CREATE_PIT: 2495 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2496 goto create_pit; 2497 case KVM_CREATE_PIT2: 2498 r = -EFAULT; 2499 if (copy_from_user(&u.pit_config, argp, 2500 sizeof(struct kvm_pit_config))) 2501 goto out; 2502 create_pit: 2503 down_write(&kvm->slots_lock); 2504 r = -EEXIST; 2505 if (kvm->arch.vpit) 2506 goto create_pit_unlock; 2507 r = -ENOMEM; 2508 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 2509 if (kvm->arch.vpit) 2510 r = 0; 2511 create_pit_unlock: 2512 up_write(&kvm->slots_lock); 2513 break; 2514 case KVM_IRQ_LINE_STATUS: 2515 case KVM_IRQ_LINE: { 2516 struct kvm_irq_level irq_event; 2517 2518 r = -EFAULT; 2519 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2520 goto out; 2521 if (irqchip_in_kernel(kvm)) { 2522 __s32 status; 2523 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2524 irq_event.irq, irq_event.level); 2525 if (ioctl == KVM_IRQ_LINE_STATUS) { 2526 irq_event.status = status; 2527 if (copy_to_user(argp, &irq_event, 2528 sizeof irq_event)) 2529 goto out; 2530 } 2531 r = 0; 2532 } 2533 break; 2534 } 2535 case KVM_GET_IRQCHIP: { 2536 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2537 struct kvm_irqchip 
*chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2538 2539 r = -ENOMEM; 2540 if (!chip) 2541 goto out; 2542 r = -EFAULT; 2543 if (copy_from_user(chip, argp, sizeof *chip)) 2544 goto get_irqchip_out; 2545 r = -ENXIO; 2546 if (!irqchip_in_kernel(kvm)) 2547 goto get_irqchip_out; 2548 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 2549 if (r) 2550 goto get_irqchip_out; 2551 r = -EFAULT; 2552 if (copy_to_user(argp, chip, sizeof *chip)) 2553 goto get_irqchip_out; 2554 r = 0; 2555 get_irqchip_out: 2556 kfree(chip); 2557 if (r) 2558 goto out; 2559 break; 2560 } 2561 case KVM_SET_IRQCHIP: { 2562 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2563 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2564 2565 r = -ENOMEM; 2566 if (!chip) 2567 goto out; 2568 r = -EFAULT; 2569 if (copy_from_user(chip, argp, sizeof *chip)) 2570 goto set_irqchip_out; 2571 r = -ENXIO; 2572 if (!irqchip_in_kernel(kvm)) 2573 goto set_irqchip_out; 2574 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 2575 if (r) 2576 goto set_irqchip_out; 2577 r = 0; 2578 set_irqchip_out: 2579 kfree(chip); 2580 if (r) 2581 goto out; 2582 break; 2583 } 2584 case KVM_GET_PIT: { 2585 r = -EFAULT; 2586 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 2587 goto out; 2588 r = -ENXIO; 2589 if (!kvm->arch.vpit) 2590 goto out; 2591 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 2592 if (r) 2593 goto out; 2594 r = -EFAULT; 2595 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 2596 goto out; 2597 r = 0; 2598 break; 2599 } 2600 case KVM_SET_PIT: { 2601 r = -EFAULT; 2602 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 2603 goto out; 2604 r = -ENXIO; 2605 if (!kvm->arch.vpit) 2606 goto out; 2607 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2608 if (r) 2609 goto out; 2610 r = 0; 2611 break; 2612 } 2613 case KVM_GET_PIT2: { 2614 r = -ENXIO; 2615 if (!kvm->arch.vpit) 2616 goto out; 2617 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 2618 if (r) 2619 goto out; 2620 r = -EFAULT; 2621 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 2622 goto out; 2623 r = 0; 2624 break; 2625 } 2626 case KVM_SET_PIT2: { 2627 r = -EFAULT; 2628 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 2629 goto out; 2630 r = -ENXIO; 2631 if (!kvm->arch.vpit) 2632 goto out; 2633 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 2634 if (r) 2635 goto out; 2636 r = 0; 2637 break; 2638 } 2639 case KVM_REINJECT_CONTROL: { 2640 struct kvm_reinject_control control; 2641 r = -EFAULT; 2642 if (copy_from_user(&control, argp, sizeof(control))) 2643 goto out; 2644 r = kvm_vm_ioctl_reinject(kvm, &control); 2645 if (r) 2646 goto out; 2647 r = 0; 2648 break; 2649 } 2650 case KVM_XEN_HVM_CONFIG: { 2651 r = -EFAULT; 2652 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 2653 sizeof(struct kvm_xen_hvm_config))) 2654 goto out; 2655 r = -EINVAL; 2656 if (kvm->arch.xen_hvm_config.flags) 2657 goto out; 2658 r = 0; 2659 break; 2660 } 2661 case KVM_SET_CLOCK: { 2662 struct timespec now; 2663 struct kvm_clock_data user_ns; 2664 u64 now_ns; 2665 s64 delta; 2666 2667 r = -EFAULT; 2668 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 2669 goto out; 2670 2671 r = -EINVAL; 2672 if (user_ns.flags) 2673 goto out; 2674 2675 r = 0; 2676 ktime_get_ts(&now); 2677 now_ns = timespec_to_ns(&now); 2678 delta = user_ns.clock - now_ns; 2679 kvm->arch.kvmclock_offset = delta; 2680 break; 2681 } 2682 case KVM_GET_CLOCK: { 2683 struct timespec now; 2684 struct kvm_clock_data user_ns; 2685 u64 now_ns; 2686 2687 ktime_get_ts(&now); 2688 now_ns = timespec_to_ns(&now); 2689 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 2690 user_ns.flags = 0; 
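		/*
		 * KVM_GET_CLOCK is the inverse of KVM_SET_CLOCK above: the
		 * value handed back to userspace is the host's monotonic time
		 * plus the per-VM kvmclock_offset that KVM_SET_CLOCK derived
		 * from the caller-supplied clock.  Userspace typically
		 * round-trips this across save/restore, roughly (illustrative
		 * sketch only, no error handling, "vm_fd" standing for the VM
		 * file descriptor):
		 *
		 *	struct kvm_clock_data data;
		 *
		 *	ioctl(vm_fd, KVM_GET_CLOCK, &data);	// at save time
		 *	...
		 *	ioctl(vm_fd, KVM_SET_CLOCK, &data);	// at restore time
		 */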
2691 2692 r = -EFAULT; 2693 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 2694 goto out; 2695 r = 0; 2696 break; 2697 } 2698 2699 default: 2700 ; 2701 } 2702 out: 2703 return r; 2704 } 2705 2706 static void kvm_init_msr_list(void) 2707 { 2708 u32 dummy[2]; 2709 unsigned i, j; 2710 2711 /* skip the first msrs in the list. KVM-specific */ 2712 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 2713 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2714 continue; 2715 if (j < i) 2716 msrs_to_save[j] = msrs_to_save[i]; 2717 j++; 2718 } 2719 num_msrs_to_save = j; 2720 } 2721 2722 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 2723 const void *v) 2724 { 2725 if (vcpu->arch.apic && 2726 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 2727 return 0; 2728 2729 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 2730 } 2731 2732 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 2733 { 2734 if (vcpu->arch.apic && 2735 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 2736 return 0; 2737 2738 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 2739 } 2740 2741 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2742 struct kvm_vcpu *vcpu) 2743 { 2744 void *data = val; 2745 int r = X86EMUL_CONTINUE; 2746 2747 while (bytes) { 2748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2749 unsigned offset = addr & (PAGE_SIZE-1); 2750 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2751 int ret; 2752 2753 if (gpa == UNMAPPED_GVA) { 2754 r = X86EMUL_PROPAGATE_FAULT; 2755 goto out; 2756 } 2757 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2758 if (ret < 0) { 2759 r = X86EMUL_UNHANDLEABLE; 2760 goto out; 2761 } 2762 2763 bytes -= toread; 2764 data += toread; 2765 addr += toread; 2766 } 2767 out: 2768 return r; 2769 } 2770 2771 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2772 struct kvm_vcpu *vcpu) 2773 { 2774 void *data = val; 2775 int r = X86EMUL_CONTINUE; 2776 2777 while (bytes) { 2778 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2779 unsigned offset = addr & (PAGE_SIZE-1); 2780 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2781 int ret; 2782 2783 if (gpa == UNMAPPED_GVA) { 2784 r = X86EMUL_PROPAGATE_FAULT; 2785 goto out; 2786 } 2787 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2788 if (ret < 0) { 2789 r = X86EMUL_UNHANDLEABLE; 2790 goto out; 2791 } 2792 2793 bytes -= towrite; 2794 data += towrite; 2795 addr += towrite; 2796 } 2797 out: 2798 return r; 2799 } 2800 2801 2802 static int emulator_read_emulated(unsigned long addr, 2803 void *val, 2804 unsigned int bytes, 2805 struct kvm_vcpu *vcpu) 2806 { 2807 gpa_t gpa; 2808 2809 if (vcpu->mmio_read_completed) { 2810 memcpy(val, vcpu->mmio_data, bytes); 2811 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 2812 vcpu->mmio_phys_addr, *(u64 *)val); 2813 vcpu->mmio_read_completed = 0; 2814 return X86EMUL_CONTINUE; 2815 } 2816 2817 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2818 2819 /* For APIC access vmexit */ 2820 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2821 goto mmio; 2822 2823 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2824 == X86EMUL_CONTINUE) 2825 return X86EMUL_CONTINUE; 2826 if (gpa == UNMAPPED_GVA) 2827 return X86EMUL_PROPAGATE_FAULT; 2828 2829 mmio: 2830 /* 2831 * Is this MMIO handled locally? 
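	 * "Locally" here means by an in-kernel device model (the local
	 * APIC or another device registered on the mmio_bus); if nobody
	 * claims the access, flag an MMIO exit so userspace can satisfy
	 * the read.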
	 */
	if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
		return X86EMUL_CONTINUE;
	}

	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);

	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 0;

	return X86EMUL_UNHANDLEABLE;
}

int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
			const void *val, int bytes)
{
	int ret;

	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
	if (ret < 0)
		return 0;
	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
	return 1;
}

static int emulator_write_emulated_onepage(unsigned long addr,
					   const void *val,
					   unsigned int bytes,
					   struct kvm_vcpu *vcpu)
{
	gpa_t gpa;

	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

	if (gpa == UNMAPPED_GVA) {
		kvm_inject_page_fault(vcpu, addr, 2);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* For APIC access vmexit */
	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto mmio;

	if (emulator_write_phys(vcpu, gpa, val, bytes))
		return X86EMUL_CONTINUE;

mmio:
	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
	/*
	 * Is this MMIO handled locally?
	 */
	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
		return X86EMUL_CONTINUE;

	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 1;
	memcpy(vcpu->mmio_data, val, bytes);

	return X86EMUL_CONTINUE;
}

int emulator_write_emulated(unsigned long addr,
			    const void *val,
			    unsigned int bytes,
			    struct kvm_vcpu *vcpu)
{
	/* Crossing a page boundary?
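	 * If so, split the access at the page boundary and emulate each
	 * page separately: gva_to_gpa() translates a single page at a
	 * time, and the two halves may map to discontiguous physical pages.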
*/ 2904 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2905 int rc, now; 2906 2907 now = -addr & ~PAGE_MASK; 2908 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2909 if (rc != X86EMUL_CONTINUE) 2910 return rc; 2911 addr += now; 2912 val += now; 2913 bytes -= now; 2914 } 2915 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2916 } 2917 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2918 2919 static int emulator_cmpxchg_emulated(unsigned long addr, 2920 const void *old, 2921 const void *new, 2922 unsigned int bytes, 2923 struct kvm_vcpu *vcpu) 2924 { 2925 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 2926 #ifndef CONFIG_X86_64 2927 /* guests cmpxchg8b have to be emulated atomically */ 2928 if (bytes == 8) { 2929 gpa_t gpa; 2930 struct page *page; 2931 char *kaddr; 2932 u64 val; 2933 2934 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2935 2936 if (gpa == UNMAPPED_GVA || 2937 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2938 goto emul_write; 2939 2940 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2941 goto emul_write; 2942 2943 val = *(u64 *)new; 2944 2945 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2946 2947 kaddr = kmap_atomic(page, KM_USER0); 2948 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2949 kunmap_atomic(kaddr, KM_USER0); 2950 kvm_release_page_dirty(page); 2951 } 2952 emul_write: 2953 #endif 2954 2955 return emulator_write_emulated(addr, new, bytes, vcpu); 2956 } 2957 2958 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2959 { 2960 return kvm_x86_ops->get_segment_base(vcpu, seg); 2961 } 2962 2963 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2964 { 2965 kvm_mmu_invlpg(vcpu, address); 2966 return X86EMUL_CONTINUE; 2967 } 2968 2969 int emulate_clts(struct kvm_vcpu *vcpu) 2970 { 2971 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2972 return X86EMUL_CONTINUE; 2973 } 2974 2975 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2976 { 2977 struct kvm_vcpu *vcpu = ctxt->vcpu; 2978 2979 switch (dr) { 2980 case 0 ... 3: 2981 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2982 return X86EMUL_CONTINUE; 2983 default: 2984 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2985 return X86EMUL_UNHANDLEABLE; 2986 } 2987 } 2988 2989 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2990 { 2991 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
		~0ULL : ~0U;
	int exception;

	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
	u8 opcodes[4];
	unsigned long rip = kvm_rip_read(vcpu);
	unsigned long rip_linear;

	if (!printk_ratelimit())
		return;

	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);

	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

static struct x86_emulate_ops emulate_ops = {
	.read_std            = kvm_read_guest_virt,
	.read_emulated       = emulator_read_emulated,
	.write_emulated      = emulator_write_emulated,
	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
};

static void cache_all_regs(struct kvm_vcpu *vcpu)
{
	kvm_register_read(vcpu, VCPU_REGS_RAX);
	kvm_register_read(vcpu, VCPU_REGS_RSP);
	kvm_register_read(vcpu, VCPU_REGS_RIP);
	vcpu->arch.regs_dirty = ~0;
}

int emulate_instruction(struct kvm_vcpu *vcpu,
			unsigned long cr2,
			u16 error_code,
			int emulation_type)
{
	int r, shadow_mask;
	struct decode_cache *c;
	struct kvm_run *run = vcpu->run;

	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
	 * TODO: fix emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses; that can save hundreds of
	 * cycles on Intel for instructions that don't read/change RSP,
	 * for example.
	 */
	cache_all_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->arch.pio.string = 0;

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->arch.emulate_ctxt.vcpu = vcpu;
		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
		vcpu->arch.emulate_ctxt.mode =
			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 : cs_db
			?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3068 3069 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3070 3071 /* Only allow emulation of specific instructions on #UD 3072 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 3073 c = &vcpu->arch.emulate_ctxt.decode; 3074 if (emulation_type & EMULTYPE_TRAP_UD) { 3075 if (!c->twobyte) 3076 return EMULATE_FAIL; 3077 switch (c->b) { 3078 case 0x01: /* VMMCALL */ 3079 if (c->modrm_mod != 3 || c->modrm_rm != 1) 3080 return EMULATE_FAIL; 3081 break; 3082 case 0x34: /* sysenter */ 3083 case 0x35: /* sysexit */ 3084 if (c->modrm_mod != 0 || c->modrm_rm != 0) 3085 return EMULATE_FAIL; 3086 break; 3087 case 0x05: /* syscall */ 3088 if (c->modrm_mod != 0 || c->modrm_rm != 0) 3089 return EMULATE_FAIL; 3090 break; 3091 default: 3092 return EMULATE_FAIL; 3093 } 3094 3095 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 3096 return EMULATE_FAIL; 3097 } 3098 3099 ++vcpu->stat.insn_emulation; 3100 if (r) { 3101 ++vcpu->stat.insn_emulation_fail; 3102 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3103 return EMULATE_DONE; 3104 return EMULATE_FAIL; 3105 } 3106 } 3107 3108 if (emulation_type & EMULTYPE_SKIP) { 3109 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 3110 return EMULATE_DONE; 3111 } 3112 3113 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3114 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3115 3116 if (r == 0) 3117 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3118 3119 if (vcpu->arch.pio.string) 3120 return EMULATE_DO_MMIO; 3121 3122 if ((r || vcpu->mmio_is_write) && run) { 3123 run->exit_reason = KVM_EXIT_MMIO; 3124 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3125 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3126 run->mmio.len = vcpu->mmio_size; 3127 run->mmio.is_write = vcpu->mmio_is_write; 3128 } 3129 3130 if (r) { 3131 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3132 return EMULATE_DONE; 3133 if (!vcpu->mmio_needed) { 3134 kvm_report_emulation_failure(vcpu, "mmio"); 3135 return EMULATE_FAIL; 3136 } 3137 return EMULATE_DO_MMIO; 3138 } 3139 3140 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3141 3142 if (vcpu->mmio_is_write) { 3143 vcpu->mmio_needed = 0; 3144 return EMULATE_DO_MMIO; 3145 } 3146 3147 return EMULATE_DONE; 3148 } 3149 EXPORT_SYMBOL_GPL(emulate_instruction); 3150 3151 static int pio_copy_data(struct kvm_vcpu *vcpu) 3152 { 3153 void *p = vcpu->arch.pio_data; 3154 gva_t q = vcpu->arch.pio.guest_gva; 3155 unsigned bytes; 3156 int ret; 3157 3158 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3159 if (vcpu->arch.pio.in) 3160 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3161 else 3162 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3163 return ret; 3164 } 3165 3166 int complete_pio(struct kvm_vcpu *vcpu) 3167 { 3168 struct kvm_pio_request *io = &vcpu->arch.pio; 3169 long delta; 3170 int r; 3171 unsigned long val; 3172 3173 if (!io->string) { 3174 if (io->in) { 3175 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3176 memcpy(&val, vcpu->arch.pio_data, io->size); 3177 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 3178 } 3179 } else { 3180 if (io->in) { 3181 r = pio_copy_data(vcpu); 3182 if (r) 3183 return r; 3184 } 3185 3186 delta = 1; 3187 if (io->rep) { 3188 delta *= io->cur_count; 3189 /* 3190 * The size of the register should really depend on 3191 * current address size. 
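			 * (with a 16-bit address size, for instance, REP uses
			 * and decrements only %cx rather than %ecx/%rcx).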
3192 */ 3193 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 3194 val -= delta; 3195 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 3196 } 3197 if (io->down) 3198 delta = -delta; 3199 delta *= io->size; 3200 if (io->in) { 3201 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 3202 val += delta; 3203 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 3204 } else { 3205 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 3206 val += delta; 3207 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3208 } 3209 } 3210 3211 io->count -= io->cur_count; 3212 io->cur_count = 0; 3213 3214 return 0; 3215 } 3216 3217 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3218 { 3219 /* TODO: String I/O for in kernel device */ 3220 int r; 3221 3222 if (vcpu->arch.pio.in) 3223 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3224 vcpu->arch.pio.size, pd); 3225 else 3226 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3227 vcpu->arch.pio.size, pd); 3228 return r; 3229 } 3230 3231 static int pio_string_write(struct kvm_vcpu *vcpu) 3232 { 3233 struct kvm_pio_request *io = &vcpu->arch.pio; 3234 void *pd = vcpu->arch.pio_data; 3235 int i, r = 0; 3236 3237 for (i = 0; i < io->cur_count; i++) { 3238 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3239 io->port, io->size, pd)) { 3240 r = -EOPNOTSUPP; 3241 break; 3242 } 3243 pd += io->size; 3244 } 3245 return r; 3246 } 3247 3248 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) 3249 { 3250 unsigned long val; 3251 3252 vcpu->run->exit_reason = KVM_EXIT_IO; 3253 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3254 vcpu->run->io.size = vcpu->arch.pio.size = size; 3255 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3256 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 3257 vcpu->run->io.port = vcpu->arch.pio.port = port; 3258 vcpu->arch.pio.in = in; 3259 vcpu->arch.pio.string = 0; 3260 vcpu->arch.pio.down = 0; 3261 vcpu->arch.pio.rep = 0; 3262 3263 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3264 size, 1); 3265 3266 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3267 memcpy(vcpu->arch.pio_data, &val, 4); 3268 3269 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3270 complete_pio(vcpu); 3271 return 1; 3272 } 3273 return 0; 3274 } 3275 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3276 3277 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3278 int size, unsigned long count, int down, 3279 gva_t address, int rep, unsigned port) 3280 { 3281 unsigned now, in_page; 3282 int ret = 0; 3283 3284 vcpu->run->exit_reason = KVM_EXIT_IO; 3285 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3286 vcpu->run->io.size = vcpu->arch.pio.size = size; 3287 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3288 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 3289 vcpu->run->io.port = vcpu->arch.pio.port = port; 3290 vcpu->arch.pio.in = in; 3291 vcpu->arch.pio.string = 1; 3292 vcpu->arch.pio.down = down; 3293 vcpu->arch.pio.rep = rep; 3294 3295 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3296 size, count); 3297 3298 if (!count) { 3299 kvm_x86_ops->skip_emulated_instruction(vcpu); 3300 return 1; 3301 } 3302 3303 if (!down) 3304 in_page = PAGE_SIZE - offset_in_page(address); 3305 else 3306 in_page = offset_in_page(address) + size; 3307 now = min(count, (unsigned long)in_page / size); 3308 if (!now) 3309 now = 1; 3310 if (down) { 3311 /* 3312 * String I/O in reverse. Yuck. Kill the guest, fix later. 
		 */
		pr_unimpl(vcpu, "guest string pio down\n");
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	vcpu->run->io.count = now;
	vcpu->arch.pio.cur_count = now;

	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
		kvm_x86_ops->skip_emulated_instruction(vcpu);

	vcpu->arch.pio.guest_gva = address;

	if (!vcpu->arch.pio.in) {
		/* string PIO write */
		ret = pio_copy_data(vcpu);
		if (ret == X86EMUL_PROPAGATE_FAULT) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		if (ret == 0 && !pio_string_write(vcpu)) {
			complete_pio(vcpu);
			if (vcpu->arch.pio.count == 0)
				ret = 1;
		}
	}
	/* no string PIO read support yet */

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);

static void bounce_off(void *info)
{
	/* nothing */
}

static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				     void *data)
{
	struct cpufreq_freqs *freq = data;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i, send_ipi = 0;

	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
		return 0;
	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
		return 0;
	per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->cpu != freq->cpu)
				continue;
			if (!kvm_request_guest_time_update(vcpu))
				continue;
			if (vcpu->cpu != smp_processor_id())
				send_ipi++;
		}
	}
	spin_unlock(&kvm_lock);

	if (freq->old < freq->new && send_ipi) {
		/*
		 * We upscale the frequency.  We must make sure the guest
		 * doesn't see old kvmclock values while running with the
		 * new frequency; otherwise we risk that the guest sees
		 * time go backwards.
		 *
		 * In case we update the frequency for another cpu
		 * (which might be in guest context) send an interrupt
		 * to kick the cpu out of guest context.  Next time
		 * guest context is entered kvmclock will be updated,
		 * so the guest will not see stale values.
3389 */ 3390 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 3391 } 3392 return 0; 3393 } 3394 3395 static struct notifier_block kvmclock_cpufreq_notifier_block = { 3396 .notifier_call = kvmclock_cpufreq_notifier 3397 }; 3398 3399 static void kvm_timer_init(void) 3400 { 3401 int cpu; 3402 3403 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 3404 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 3405 CPUFREQ_TRANSITION_NOTIFIER); 3406 for_each_online_cpu(cpu) { 3407 unsigned long khz = cpufreq_get(cpu); 3408 if (!khz) 3409 khz = tsc_khz; 3410 per_cpu(cpu_tsc_khz, cpu) = khz; 3411 } 3412 } else { 3413 for_each_possible_cpu(cpu) 3414 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 3415 } 3416 } 3417 3418 int kvm_arch_init(void *opaque) 3419 { 3420 int r; 3421 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3422 3423 if (kvm_x86_ops) { 3424 printk(KERN_ERR "kvm: already loaded the other module\n"); 3425 r = -EEXIST; 3426 goto out; 3427 } 3428 3429 if (!ops->cpu_has_kvm_support()) { 3430 printk(KERN_ERR "kvm: no hardware support\n"); 3431 r = -EOPNOTSUPP; 3432 goto out; 3433 } 3434 if (ops->disabled_by_bios()) { 3435 printk(KERN_ERR "kvm: disabled by bios\n"); 3436 r = -EOPNOTSUPP; 3437 goto out; 3438 } 3439 3440 r = kvm_mmu_module_init(); 3441 if (r) 3442 goto out; 3443 3444 kvm_init_msr_list(); 3445 3446 kvm_x86_ops = ops; 3447 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 3448 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 3449 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3450 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3451 3452 kvm_timer_init(); 3453 3454 return 0; 3455 3456 out: 3457 return r; 3458 } 3459 3460 void kvm_arch_exit(void) 3461 { 3462 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 3463 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 3464 CPUFREQ_TRANSITION_NOTIFIER); 3465 kvm_x86_ops = NULL; 3466 kvm_mmu_module_exit(); 3467 } 3468 3469 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3470 { 3471 ++vcpu->stat.halt_exits; 3472 if (irqchip_in_kernel(vcpu->kvm)) { 3473 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3474 return 1; 3475 } else { 3476 vcpu->run->exit_reason = KVM_EXIT_HLT; 3477 return 0; 3478 } 3479 } 3480 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 3481 3482 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 3483 unsigned long a1) 3484 { 3485 if (is_long_mode(vcpu)) 3486 return a0; 3487 else 3488 return a0 | ((gpa_t)a1 << 32); 3489 } 3490 3491 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3492 { 3493 unsigned long nr, a0, a1, a2, a3, ret; 3494 int r = 1; 3495 3496 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3497 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3498 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3499 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3500 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3501 3502 trace_kvm_hypercall(nr, a0, a1, a2, a3); 3503 3504 if (!is_long_mode(vcpu)) { 3505 nr &= 0xFFFFFFFF; 3506 a0 &= 0xFFFFFFFF; 3507 a1 &= 0xFFFFFFFF; 3508 a2 &= 0xFFFFFFFF; 3509 a3 &= 0xFFFFFFFF; 3510 } 3511 3512 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 3513 ret = -KVM_EPERM; 3514 goto out; 3515 } 3516 3517 switch (nr) { 3518 case KVM_HC_VAPIC_POLL_IRQ: 3519 ret = 0; 3520 break; 3521 case KVM_HC_MMU_OP: 3522 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 3523 break; 3524 default: 3525 ret = -KVM_ENOSYS; 3526 break; 3527 } 3528 out: 3529 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3530 ++vcpu->stat.hypercalls; 3531 return r; 3532 } 3533 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 3534 3535 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 
3536 { 3537 char instruction[3]; 3538 int ret = 0; 3539 unsigned long rip = kvm_rip_read(vcpu); 3540 3541 3542 /* 3543 * Blow out the MMU to ensure that no other VCPU has an active mapping 3544 * to ensure that the updated hypercall appears atomically across all 3545 * VCPUs. 3546 */ 3547 kvm_mmu_zap_all(vcpu->kvm); 3548 3549 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3550 if (emulator_write_emulated(rip, instruction, 3, vcpu) 3551 != X86EMUL_CONTINUE) 3552 ret = -EFAULT; 3553 3554 return ret; 3555 } 3556 3557 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3558 { 3559 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 3560 } 3561 3562 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3563 { 3564 struct descriptor_table dt = { limit, base }; 3565 3566 kvm_x86_ops->set_gdt(vcpu, &dt); 3567 } 3568 3569 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3570 { 3571 struct descriptor_table dt = { limit, base }; 3572 3573 kvm_x86_ops->set_idt(vcpu, &dt); 3574 } 3575 3576 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 3577 unsigned long *rflags) 3578 { 3579 kvm_lmsw(vcpu, msw); 3580 *rflags = kvm_get_rflags(vcpu); 3581 } 3582 3583 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3584 { 3585 unsigned long value; 3586 3587 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3588 switch (cr) { 3589 case 0: 3590 value = vcpu->arch.cr0; 3591 break; 3592 case 2: 3593 value = vcpu->arch.cr2; 3594 break; 3595 case 3: 3596 value = vcpu->arch.cr3; 3597 break; 3598 case 4: 3599 value = vcpu->arch.cr4; 3600 break; 3601 case 8: 3602 value = kvm_get_cr8(vcpu); 3603 break; 3604 default: 3605 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3606 return 0; 3607 } 3608 3609 return value; 3610 } 3611 3612 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3613 unsigned long *rflags) 3614 { 3615 switch (cr) { 3616 case 0: 3617 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3618 *rflags = kvm_get_rflags(vcpu); 3619 break; 3620 case 2: 3621 vcpu->arch.cr2 = val; 3622 break; 3623 case 3: 3624 kvm_set_cr3(vcpu, val); 3625 break; 3626 case 4: 3627 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 3628 break; 3629 case 8: 3630 kvm_set_cr8(vcpu, val & 0xfUL); 3631 break; 3632 default: 3633 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3634 } 3635 } 3636 3637 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 3638 { 3639 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 3640 int j, nent = vcpu->arch.cpuid_nent; 3641 3642 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 3643 /* when no next entry is found, the current entry[i] is reselected */ 3644 for (j = i + 1; ; j = (j + 1) % nent) { 3645 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 3646 if (ej->function == e->function) { 3647 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 3648 return j; 3649 } 3650 } 3651 return 0; /* silence gcc, even though control never reaches here */ 3652 } 3653 3654 /* find an entry with matching function, matching index (if needed), and that 3655 * should be read next (if it's stateful) */ 3656 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3657 u32 function, u32 index) 3658 { 3659 if (e->function != function) 3660 return 0; 3661 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3662 return 0; 3663 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3664 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3665 return 0; 3666 return 1; 3667 } 3668 3669 struct 
kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3670 u32 function, u32 index) 3671 { 3672 int i; 3673 struct kvm_cpuid_entry2 *best = NULL; 3674 3675 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3676 struct kvm_cpuid_entry2 *e; 3677 3678 e = &vcpu->arch.cpuid_entries[i]; 3679 if (is_matching_cpuid_entry(e, function, index)) { 3680 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 3681 move_to_next_stateful_cpuid_entry(vcpu, i); 3682 best = e; 3683 break; 3684 } 3685 /* 3686 * Both basic or both extended? 3687 */ 3688 if (((e->function ^ function) & 0x80000000) == 0) 3689 if (!best || e->function > best->function) 3690 best = e; 3691 } 3692 return best; 3693 } 3694 3695 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3696 { 3697 struct kvm_cpuid_entry2 *best; 3698 3699 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3700 if (best) 3701 return best->eax & 0xff; 3702 return 36; 3703 } 3704 3705 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3706 { 3707 u32 function, index; 3708 struct kvm_cpuid_entry2 *best; 3709 3710 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3711 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3712 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3713 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3714 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3715 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3716 best = kvm_find_cpuid_entry(vcpu, function, index); 3717 if (best) { 3718 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3719 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3720 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3721 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3722 } 3723 kvm_x86_ops->skip_emulated_instruction(vcpu); 3724 trace_kvm_cpuid(function, 3725 kvm_register_read(vcpu, VCPU_REGS_RAX), 3726 kvm_register_read(vcpu, VCPU_REGS_RBX), 3727 kvm_register_read(vcpu, VCPU_REGS_RCX), 3728 kvm_register_read(vcpu, VCPU_REGS_RDX)); 3729 } 3730 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3731 3732 /* 3733 * Check if userspace requested an interrupt window, and that the 3734 * interrupt window is open. 3735 * 3736 * No need to exit to userspace if we already have an interrupt queued. 
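 * ("Queued" is what kvm_cpu_has_interrupt() checks below, and the window
 * counts as open when kvm_arch_interrupt_allowed() says injection is
 * currently possible.)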
3737 */ 3738 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 3739 { 3740 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3741 vcpu->run->request_interrupt_window && 3742 kvm_arch_interrupt_allowed(vcpu)); 3743 } 3744 3745 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 3746 { 3747 struct kvm_run *kvm_run = vcpu->run; 3748 3749 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3750 kvm_run->cr8 = kvm_get_cr8(vcpu); 3751 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3752 if (irqchip_in_kernel(vcpu->kvm)) 3753 kvm_run->ready_for_interrupt_injection = 1; 3754 else 3755 kvm_run->ready_for_interrupt_injection = 3756 kvm_arch_interrupt_allowed(vcpu) && 3757 !kvm_cpu_has_interrupt(vcpu) && 3758 !kvm_event_needs_reinjection(vcpu); 3759 } 3760 3761 static void vapic_enter(struct kvm_vcpu *vcpu) 3762 { 3763 struct kvm_lapic *apic = vcpu->arch.apic; 3764 struct page *page; 3765 3766 if (!apic || !apic->vapic_addr) 3767 return; 3768 3769 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3770 3771 vcpu->arch.apic->vapic_page = page; 3772 } 3773 3774 static void vapic_exit(struct kvm_vcpu *vcpu) 3775 { 3776 struct kvm_lapic *apic = vcpu->arch.apic; 3777 3778 if (!apic || !apic->vapic_addr) 3779 return; 3780 3781 down_read(&vcpu->kvm->slots_lock); 3782 kvm_release_page_dirty(apic->vapic_page); 3783 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3784 up_read(&vcpu->kvm->slots_lock); 3785 } 3786 3787 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3788 { 3789 int max_irr, tpr; 3790 3791 if (!kvm_x86_ops->update_cr8_intercept) 3792 return; 3793 3794 if (!vcpu->arch.apic) 3795 return; 3796 3797 if (!vcpu->arch.apic->vapic_addr) 3798 max_irr = kvm_lapic_find_highest_irr(vcpu); 3799 else 3800 max_irr = -1; 3801 3802 if (max_irr != -1) 3803 max_irr >>= 4; 3804 3805 tpr = kvm_lapic_get_cr8(vcpu); 3806 3807 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3808 } 3809 3810 static void inject_pending_event(struct kvm_vcpu *vcpu) 3811 { 3812 /* try to reinject previous events if any */ 3813 if (vcpu->arch.exception.pending) { 3814 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 3815 vcpu->arch.exception.has_error_code, 3816 vcpu->arch.exception.error_code); 3817 return; 3818 } 3819 3820 if (vcpu->arch.nmi_injected) { 3821 kvm_x86_ops->set_nmi(vcpu); 3822 return; 3823 } 3824 3825 if (vcpu->arch.interrupt.pending) { 3826 kvm_x86_ops->set_irq(vcpu); 3827 return; 3828 } 3829 3830 /* try to inject new event if pending */ 3831 if (vcpu->arch.nmi_pending) { 3832 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3833 vcpu->arch.nmi_pending = false; 3834 vcpu->arch.nmi_injected = true; 3835 kvm_x86_ops->set_nmi(vcpu); 3836 } 3837 } else if (kvm_cpu_has_interrupt(vcpu)) { 3838 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3839 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3840 false); 3841 kvm_x86_ops->set_irq(vcpu); 3842 } 3843 } 3844 } 3845 3846 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 3847 { 3848 int r; 3849 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3850 vcpu->run->request_interrupt_window; 3851 3852 if (vcpu->requests) 3853 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3854 kvm_mmu_unload(vcpu); 3855 3856 r = kvm_mmu_reload(vcpu); 3857 if (unlikely(r)) 3858 goto out; 3859 3860 if (vcpu->requests) { 3861 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3862 __kvm_migrate_timers(vcpu); 3863 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3864 
kvm_write_guest_time(vcpu); 3865 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3866 kvm_mmu_sync_roots(vcpu); 3867 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3868 kvm_x86_ops->tlb_flush(vcpu); 3869 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3870 &vcpu->requests)) { 3871 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 3872 r = 0; 3873 goto out; 3874 } 3875 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3876 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 3877 r = 0; 3878 goto out; 3879 } 3880 } 3881 3882 preempt_disable(); 3883 3884 kvm_x86_ops->prepare_guest_switch(vcpu); 3885 kvm_load_guest_fpu(vcpu); 3886 3887 local_irq_disable(); 3888 3889 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3890 smp_mb__after_clear_bit(); 3891 3892 if (vcpu->requests || need_resched() || signal_pending(current)) { 3893 set_bit(KVM_REQ_KICK, &vcpu->requests); 3894 local_irq_enable(); 3895 preempt_enable(); 3896 r = 1; 3897 goto out; 3898 } 3899 3900 inject_pending_event(vcpu); 3901 3902 /* enable NMI/IRQ window open exits if needed */ 3903 if (vcpu->arch.nmi_pending) 3904 kvm_x86_ops->enable_nmi_window(vcpu); 3905 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3906 kvm_x86_ops->enable_irq_window(vcpu); 3907 3908 if (kvm_lapic_enabled(vcpu)) { 3909 update_cr8_intercept(vcpu); 3910 kvm_lapic_sync_to_vapic(vcpu); 3911 } 3912 3913 up_read(&vcpu->kvm->slots_lock); 3914 3915 kvm_guest_enter(); 3916 3917 if (unlikely(vcpu->arch.switch_db_regs)) { 3918 set_debugreg(0, 7); 3919 set_debugreg(vcpu->arch.eff_db[0], 0); 3920 set_debugreg(vcpu->arch.eff_db[1], 1); 3921 set_debugreg(vcpu->arch.eff_db[2], 2); 3922 set_debugreg(vcpu->arch.eff_db[3], 3); 3923 } 3924 3925 trace_kvm_entry(vcpu->vcpu_id); 3926 kvm_x86_ops->run(vcpu); 3927 3928 /* 3929 * If the guest has used debug registers, at least dr7 3930 * will be disabled while returning to the host. 3931 * If we don't have active breakpoints in the host, we don't 3932 * care about the messed up debug address registers. But if 3933 * we have some of them active, restore the old state. 3934 */ 3935 if (hw_breakpoint_active()) 3936 hw_breakpoint_restore(); 3937 3938 set_bit(KVM_REQ_KICK, &vcpu->requests); 3939 local_irq_enable(); 3940 3941 ++vcpu->stat.exits; 3942 3943 /* 3944 * We must have an instruction between local_irq_enable() and 3945 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3946 * the interrupt shadow. The stat.exits increment will do nicely. 
3947 * But we need to prevent reordering, hence this barrier(): 3948 */ 3949 barrier(); 3950 3951 kvm_guest_exit(); 3952 3953 preempt_enable(); 3954 3955 down_read(&vcpu->kvm->slots_lock); 3956 3957 /* 3958 * Profile KVM exit RIPs: 3959 */ 3960 if (unlikely(prof_on == KVM_PROFILING)) { 3961 unsigned long rip = kvm_rip_read(vcpu); 3962 profile_hit(KVM_PROFILING, (void *)rip); 3963 } 3964 3965 3966 kvm_lapic_sync_from_vapic(vcpu); 3967 3968 r = kvm_x86_ops->handle_exit(vcpu); 3969 out: 3970 return r; 3971 } 3972 3973 3974 static int __vcpu_run(struct kvm_vcpu *vcpu) 3975 { 3976 int r; 3977 3978 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3979 pr_debug("vcpu %d received sipi with vector # %x\n", 3980 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3981 kvm_lapic_reset(vcpu); 3982 r = kvm_arch_vcpu_reset(vcpu); 3983 if (r) 3984 return r; 3985 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3986 } 3987 3988 down_read(&vcpu->kvm->slots_lock); 3989 vapic_enter(vcpu); 3990 3991 r = 1; 3992 while (r > 0) { 3993 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3994 r = vcpu_enter_guest(vcpu); 3995 else { 3996 up_read(&vcpu->kvm->slots_lock); 3997 kvm_vcpu_block(vcpu); 3998 down_read(&vcpu->kvm->slots_lock); 3999 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4000 { 4001 switch(vcpu->arch.mp_state) { 4002 case KVM_MP_STATE_HALTED: 4003 vcpu->arch.mp_state = 4004 KVM_MP_STATE_RUNNABLE; 4005 case KVM_MP_STATE_RUNNABLE: 4006 break; 4007 case KVM_MP_STATE_SIPI_RECEIVED: 4008 default: 4009 r = -EINTR; 4010 break; 4011 } 4012 } 4013 } 4014 4015 if (r <= 0) 4016 break; 4017 4018 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 4019 if (kvm_cpu_has_pending_timer(vcpu)) 4020 kvm_inject_pending_timer_irqs(vcpu); 4021 4022 if (dm_request_for_irq_injection(vcpu)) { 4023 r = -EINTR; 4024 vcpu->run->exit_reason = KVM_EXIT_INTR; 4025 ++vcpu->stat.request_irq_exits; 4026 } 4027 if (signal_pending(current)) { 4028 r = -EINTR; 4029 vcpu->run->exit_reason = KVM_EXIT_INTR; 4030 ++vcpu->stat.signal_exits; 4031 } 4032 if (need_resched()) { 4033 up_read(&vcpu->kvm->slots_lock); 4034 kvm_resched(vcpu); 4035 down_read(&vcpu->kvm->slots_lock); 4036 } 4037 } 4038 4039 up_read(&vcpu->kvm->slots_lock); 4040 post_kvm_run_save(vcpu); 4041 4042 vapic_exit(vcpu); 4043 4044 return r; 4045 } 4046 4047 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4048 { 4049 int r; 4050 sigset_t sigsaved; 4051 4052 vcpu_load(vcpu); 4053 4054 if (vcpu->sigset_active) 4055 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4056 4057 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 4058 kvm_vcpu_block(vcpu); 4059 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 4060 r = -EAGAIN; 4061 goto out; 4062 } 4063 4064 /* re-sync apic's tpr */ 4065 if (!irqchip_in_kernel(vcpu->kvm)) 4066 kvm_set_cr8(vcpu, kvm_run->cr8); 4067 4068 if (vcpu->arch.pio.cur_count) { 4069 r = complete_pio(vcpu); 4070 if (r) 4071 goto out; 4072 } 4073 if (vcpu->mmio_needed) { 4074 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4075 vcpu->mmio_read_completed = 1; 4076 vcpu->mmio_needed = 0; 4077 4078 down_read(&vcpu->kvm->slots_lock); 4079 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4080 EMULTYPE_NO_DECODE); 4081 up_read(&vcpu->kvm->slots_lock); 4082 if (r == EMULATE_DO_MMIO) { 4083 /* 4084 * Read-modify-write. Back to userspace. 
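			 * The emulator has consumed the completed MMIO read,
			 * but the instruction's store phase also targets MMIO,
			 * so exit once more and let userspace finish the write.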
4085 */ 4086 r = 0; 4087 goto out; 4088 } 4089 } 4090 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4091 kvm_register_write(vcpu, VCPU_REGS_RAX, 4092 kvm_run->hypercall.ret); 4093 4094 r = __vcpu_run(vcpu); 4095 4096 out: 4097 if (vcpu->sigset_active) 4098 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4099 4100 vcpu_put(vcpu); 4101 return r; 4102 } 4103 4104 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4105 { 4106 vcpu_load(vcpu); 4107 4108 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4109 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4110 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4111 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4112 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4113 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4114 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4115 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4116 #ifdef CONFIG_X86_64 4117 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 4118 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 4119 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 4120 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 4121 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 4122 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 4123 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 4124 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 4125 #endif 4126 4127 regs->rip = kvm_rip_read(vcpu); 4128 regs->rflags = kvm_get_rflags(vcpu); 4129 4130 vcpu_put(vcpu); 4131 4132 return 0; 4133 } 4134 4135 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4136 { 4137 vcpu_load(vcpu); 4138 4139 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4140 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4141 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4142 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 4143 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 4144 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 4145 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 4146 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 4147 #ifdef CONFIG_X86_64 4148 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 4149 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 4150 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 4151 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 4152 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 4153 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4154 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4155 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4156 #endif 4157 4158 kvm_rip_write(vcpu, regs->rip); 4159 kvm_set_rflags(vcpu, regs->rflags); 4160 4161 vcpu->arch.exception.pending = false; 4162 4163 vcpu_put(vcpu); 4164 4165 return 0; 4166 } 4167 4168 void kvm_get_segment(struct kvm_vcpu *vcpu, 4169 struct kvm_segment *var, int seg) 4170 { 4171 kvm_x86_ops->get_segment(vcpu, var, seg); 4172 } 4173 4174 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4175 { 4176 struct kvm_segment cs; 4177 4178 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 4179 *db = cs.db; 4180 *l = cs.l; 4181 } 4182 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 4183 4184 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4185 struct kvm_sregs *sregs) 4186 { 4187 struct descriptor_table dt; 4188 4189 vcpu_load(vcpu); 4190 4191 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4192 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4193 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4194 
kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4195 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4196 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4197 4198 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4199 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4200 4201 kvm_x86_ops->get_idt(vcpu, &dt); 4202 sregs->idt.limit = dt.limit; 4203 sregs->idt.base = dt.base; 4204 kvm_x86_ops->get_gdt(vcpu, &dt); 4205 sregs->gdt.limit = dt.limit; 4206 sregs->gdt.base = dt.base; 4207 4208 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4209 sregs->cr0 = vcpu->arch.cr0; 4210 sregs->cr2 = vcpu->arch.cr2; 4211 sregs->cr3 = vcpu->arch.cr3; 4212 sregs->cr4 = vcpu->arch.cr4; 4213 sregs->cr8 = kvm_get_cr8(vcpu); 4214 sregs->efer = vcpu->arch.shadow_efer; 4215 sregs->apic_base = kvm_get_apic_base(vcpu); 4216 4217 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4218 4219 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 4220 set_bit(vcpu->arch.interrupt.nr, 4221 (unsigned long *)sregs->interrupt_bitmap); 4222 4223 vcpu_put(vcpu); 4224 4225 return 0; 4226 } 4227 4228 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 4229 struct kvm_mp_state *mp_state) 4230 { 4231 vcpu_load(vcpu); 4232 mp_state->mp_state = vcpu->arch.mp_state; 4233 vcpu_put(vcpu); 4234 return 0; 4235 } 4236 4237 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 4238 struct kvm_mp_state *mp_state) 4239 { 4240 vcpu_load(vcpu); 4241 vcpu->arch.mp_state = mp_state->mp_state; 4242 vcpu_put(vcpu); 4243 return 0; 4244 } 4245 4246 static void kvm_set_segment(struct kvm_vcpu *vcpu, 4247 struct kvm_segment *var, int seg) 4248 { 4249 kvm_x86_ops->set_segment(vcpu, var, seg); 4250 } 4251 4252 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 4253 struct kvm_segment *kvm_desct) 4254 { 4255 kvm_desct->base = get_desc_base(seg_desc); 4256 kvm_desct->limit = get_desc_limit(seg_desc); 4257 if (seg_desc->g) { 4258 kvm_desct->limit <<= 12; 4259 kvm_desct->limit |= 0xfff; 4260 } 4261 kvm_desct->selector = selector; 4262 kvm_desct->type = seg_desc->type; 4263 kvm_desct->present = seg_desc->p; 4264 kvm_desct->dpl = seg_desc->dpl; 4265 kvm_desct->db = seg_desc->d; 4266 kvm_desct->s = seg_desc->s; 4267 kvm_desct->l = seg_desc->l; 4268 kvm_desct->g = seg_desc->g; 4269 kvm_desct->avl = seg_desc->avl; 4270 if (!selector) 4271 kvm_desct->unusable = 1; 4272 else 4273 kvm_desct->unusable = 0; 4274 kvm_desct->padding = 0; 4275 } 4276 4277 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 4278 u16 selector, 4279 struct descriptor_table *dtable) 4280 { 4281 if (selector & 1 << 2) { 4282 struct kvm_segment kvm_seg; 4283 4284 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 4285 4286 if (kvm_seg.unusable) 4287 dtable->limit = 0; 4288 else 4289 dtable->limit = kvm_seg.limit; 4290 dtable->base = kvm_seg.base; 4291 } 4292 else 4293 kvm_x86_ops->get_gdt(vcpu, dtable); 4294 } 4295 4296 /* allowed just for 8 bytes segments */ 4297 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4298 struct desc_struct *seg_desc) 4299 { 4300 struct descriptor_table dtable; 4301 u16 index = selector >> 3; 4302 4303 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4304 4305 if (dtable.limit < index * 8 + 7) { 4306 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4307 return 1; 4308 } 4309 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4310 } 4311 4312 /* allowed just for 8 bytes segments */ 4313 static int 
save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4314 struct desc_struct *seg_desc) 4315 { 4316 struct descriptor_table dtable; 4317 u16 index = selector >> 3; 4318 4319 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4320 4321 if (dtable.limit < index * 8 + 7) 4322 return 1; 4323 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4324 } 4325 4326 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4327 struct desc_struct *seg_desc) 4328 { 4329 u32 base_addr = get_desc_base(seg_desc); 4330 4331 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4332 } 4333 4334 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4335 { 4336 struct kvm_segment kvm_seg; 4337 4338 kvm_get_segment(vcpu, &kvm_seg, seg); 4339 return kvm_seg.selector; 4340 } 4341 4342 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 4343 u16 selector, 4344 struct kvm_segment *kvm_seg) 4345 { 4346 struct desc_struct seg_desc; 4347 4348 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 4349 return 1; 4350 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 4351 return 0; 4352 } 4353 4354 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4355 { 4356 struct kvm_segment segvar = { 4357 .base = selector << 4, 4358 .limit = 0xffff, 4359 .selector = selector, 4360 .type = 3, 4361 .present = 1, 4362 .dpl = 3, 4363 .db = 0, 4364 .s = 1, 4365 .l = 0, 4366 .g = 0, 4367 .avl = 0, 4368 .unusable = 0, 4369 }; 4370 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4371 return 0; 4372 } 4373 4374 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4375 { 4376 return (seg != VCPU_SREG_LDTR) && 4377 (seg != VCPU_SREG_TR) && 4378 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); 4379 } 4380 4381 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4382 int type_bits, int seg) 4383 { 4384 struct kvm_segment kvm_seg; 4385 4386 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4387 return kvm_load_realmode_segment(vcpu, selector, seg); 4388 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4389 return 1; 4390 kvm_seg.type |= type_bits; 4391 4392 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4393 seg != VCPU_SREG_LDTR) 4394 if (!kvm_seg.s) 4395 kvm_seg.unusable = 1; 4396 4397 kvm_set_segment(vcpu, &kvm_seg, seg); 4398 return 0; 4399 } 4400 4401 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4402 struct tss_segment_32 *tss) 4403 { 4404 tss->cr3 = vcpu->arch.cr3; 4405 tss->eip = kvm_rip_read(vcpu); 4406 tss->eflags = kvm_get_rflags(vcpu); 4407 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4408 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4409 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4410 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4411 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4412 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4413 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4414 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4415 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4416 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4417 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4418 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4419 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 4420 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 4421 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4422 } 4423 4424 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4425 
struct tss_segment_32 *tss) 4426 { 4427 kvm_set_cr3(vcpu, tss->cr3); 4428 4429 kvm_rip_write(vcpu, tss->eip); 4430 kvm_set_rflags(vcpu, tss->eflags | 2); 4431 4432 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4433 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4434 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 4435 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 4436 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 4437 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 4438 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4439 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4440 4441 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4442 return 1; 4443 4444 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4445 return 1; 4446 4447 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4448 return 1; 4449 4450 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4451 return 1; 4452 4453 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4454 return 1; 4455 4456 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4457 return 1; 4458 4459 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4460 return 1; 4461 return 0; 4462 } 4463 4464 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 4465 struct tss_segment_16 *tss) 4466 { 4467 tss->ip = kvm_rip_read(vcpu); 4468 tss->flag = kvm_get_rflags(vcpu); 4469 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4470 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4471 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4472 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4473 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4474 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4475 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 4476 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 4477 4478 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4479 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4480 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4481 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4482 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4483 } 4484 4485 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4486 struct tss_segment_16 *tss) 4487 { 4488 kvm_rip_write(vcpu, tss->ip); 4489 kvm_set_rflags(vcpu, tss->flag | 2); 4490 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4491 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4492 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4493 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 4494 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 4495 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 4496 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 4497 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 4498 4499 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 4500 return 1; 4501 4502 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4503 return 1; 4504 4505 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4506 return 1; 4507 4508 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4509 return 1; 4510 4511 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4512 return 1; 4513 return 0; 4514 } 4515 4516 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 4517 u16 old_tss_sel, u32 old_tss_base, 4518 struct desc_struct *nseg_desc) 4519 { 4520 struct tss_segment_16 tss_segment_16; 4521 int ret = 0; 4522 4523 if (kvm_read_guest(vcpu->kvm, 
old_tss_base, &tss_segment_16, 4524 sizeof tss_segment_16)) 4525 goto out; 4526 4527 save_state_to_tss16(vcpu, &tss_segment_16); 4528 4529 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 4530 sizeof tss_segment_16)) 4531 goto out; 4532 4533 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 4534 &tss_segment_16, sizeof tss_segment_16)) 4535 goto out; 4536 4537 if (old_tss_sel != 0xffff) { 4538 tss_segment_16.prev_task_link = old_tss_sel; 4539 4540 if (kvm_write_guest(vcpu->kvm, 4541 get_tss_base_addr(vcpu, nseg_desc), 4542 &tss_segment_16.prev_task_link, 4543 sizeof tss_segment_16.prev_task_link)) 4544 goto out; 4545 } 4546 4547 if (load_state_from_tss16(vcpu, &tss_segment_16)) 4548 goto out; 4549 4550 ret = 1; 4551 out: 4552 return ret; 4553 } 4554 4555 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 4556 u16 old_tss_sel, u32 old_tss_base, 4557 struct desc_struct *nseg_desc) 4558 { 4559 struct tss_segment_32 tss_segment_32; 4560 int ret = 0; 4561 4562 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 4563 sizeof tss_segment_32)) 4564 goto out; 4565 4566 save_state_to_tss32(vcpu, &tss_segment_32); 4567 4568 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 4569 sizeof tss_segment_32)) 4570 goto out; 4571 4572 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 4573 &tss_segment_32, sizeof tss_segment_32)) 4574 goto out; 4575 4576 if (old_tss_sel != 0xffff) { 4577 tss_segment_32.prev_task_link = old_tss_sel; 4578 4579 if (kvm_write_guest(vcpu->kvm, 4580 get_tss_base_addr(vcpu, nseg_desc), 4581 &tss_segment_32.prev_task_link, 4582 sizeof tss_segment_32.prev_task_link)) 4583 goto out; 4584 } 4585 4586 if (load_state_from_tss32(vcpu, &tss_segment_32)) 4587 goto out; 4588 4589 ret = 1; 4590 out: 4591 return ret; 4592 } 4593 4594 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 4595 { 4596 struct kvm_segment tr_seg; 4597 struct desc_struct cseg_desc; 4598 struct desc_struct nseg_desc; 4599 int ret = 0; 4600 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 4601 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 4602 4603 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 4604 4605 /* FIXME: Handle errors. Failure to read either TSS or their 4606 * descriptors should generate a pagefault. 
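 * As it stands, load_guest_segment_descriptor() only injects #GP for an
 * out-of-limit selector; a failure of the guest-virtual read itself is
 * just returned to the caller without queueing any fault.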
4607 */ 4608 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 4609 goto out; 4610 4611 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 4612 goto out; 4613 4614 if (reason != TASK_SWITCH_IRET) { 4615 int cpl; 4616 4617 cpl = kvm_x86_ops->get_cpl(vcpu); 4618 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 4619 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4620 return 1; 4621 } 4622 } 4623 4624 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 4625 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 4626 return 1; 4627 } 4628 4629 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 4630 cseg_desc.type &= ~(1 << 1); /* clear the B flag */ 4631 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 4632 } 4633 4634 if (reason == TASK_SWITCH_IRET) { 4635 u32 eflags = kvm_get_rflags(vcpu); 4636 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4637 } 4638 4639 /* set back link to prev task only if NT bit is set in eflags; 4640 note that old_tss_sel is not used after this point */ 4641 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4642 old_tss_sel = 0xffff; 4643 4644 if (nseg_desc.type & 8) 4645 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4646 old_tss_base, &nseg_desc); 4647 else 4648 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, 4649 old_tss_base, &nseg_desc); 4650 4651 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4652 u32 eflags = kvm_get_rflags(vcpu); 4653 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4654 } 4655 4656 if (reason != TASK_SWITCH_IRET) { 4657 nseg_desc.type |= (1 << 1); 4658 save_guest_segment_descriptor(vcpu, tss_selector, 4659 &nseg_desc); 4660 } 4661 4662 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 4663 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 4664 tr_seg.type = 11; 4665 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 4666 out: 4667 return ret; 4668 } 4669 EXPORT_SYMBOL_GPL(kvm_task_switch); 4670 4671 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 4672 struct kvm_sregs *sregs) 4673 { 4674 int mmu_reset_needed = 0; 4675 int pending_vec, max_bits; 4676 struct descriptor_table dt; 4677 4678 vcpu_load(vcpu); 4679 4680 dt.limit = sregs->idt.limit; 4681 dt.base = sregs->idt.base; 4682 kvm_x86_ops->set_idt(vcpu, &dt); 4683 dt.limit = sregs->gdt.limit; 4684 dt.base = sregs->gdt.base; 4685 kvm_x86_ops->set_gdt(vcpu, &dt); 4686 4687 vcpu->arch.cr2 = sregs->cr2; 4688 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4689 vcpu->arch.cr3 = sregs->cr3; 4690 4691 kvm_set_cr8(vcpu, sregs->cr8); 4692 4693 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 4694 kvm_x86_ops->set_efer(vcpu, sregs->efer); 4695 kvm_set_apic_base(vcpu, sregs->apic_base); 4696 4697 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4698 4699 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 4700 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 4701 vcpu->arch.cr0 = sregs->cr0; 4702 4703 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4704 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4705 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 4706 load_pdptrs(vcpu, vcpu->arch.cr3); 4707 mmu_reset_needed = 1; 4708 } 4709 4710 if (mmu_reset_needed) 4711 kvm_mmu_reset_context(vcpu); 4712 4713 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 4714 pending_vec = find_first_bit( 4715 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 4716 if (pending_vec < max_bits) { 4717 kvm_queue_interrupt(vcpu, pending_vec, false); 4718 pr_debug("Set back pending irq %d\n",
pending_vec); 4719 if (irqchip_in_kernel(vcpu->kvm)) 4720 kvm_pic_clear_isr_ack(vcpu->kvm); 4721 } 4722 4723 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4724 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4725 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4726 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4727 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4728 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4729 4730 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4731 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4732 4733 update_cr8_intercept(vcpu); 4734 4735 /* Older userspace won't unhalt the vcpu on reset. */ 4736 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 4737 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4738 !(vcpu->arch.cr0 & X86_CR0_PE)) 4739 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4740 4741 vcpu_put(vcpu); 4742 4743 return 0; 4744 } 4745 4746 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4747 struct kvm_guest_debug *dbg) 4748 { 4749 unsigned long rflags; 4750 int i, r; 4751 4752 vcpu_load(vcpu); 4753 4754 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 4755 r = -EBUSY; 4756 if (vcpu->arch.exception.pending) 4757 goto unlock_out; 4758 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4759 kvm_queue_exception(vcpu, DB_VECTOR); 4760 else 4761 kvm_queue_exception(vcpu, BP_VECTOR); 4762 } 4763 4764 /* 4765 * Read rflags as long as potentially injected trace flags are still 4766 * filtered out. 4767 */ 4768 rflags = kvm_get_rflags(vcpu); 4769 4770 vcpu->guest_debug = dbg->control; 4771 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 4772 vcpu->guest_debug = 0; 4773 4774 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 4775 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4776 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4777 vcpu->arch.switch_db_regs = 4778 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4779 } else { 4780 for (i = 0; i < KVM_NR_DB_REGS; i++) 4781 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4782 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4783 } 4784 4785 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 4786 vcpu->arch.singlestep_cs = 4787 get_segment_selector(vcpu, VCPU_SREG_CS); 4788 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu); 4789 } 4790 4791 /* 4792 * Trigger an rflags update that will inject or remove the trace 4793 * flags. 4794 */ 4795 kvm_set_rflags(vcpu, rflags); 4796 4797 kvm_x86_ops->set_guest_debug(vcpu, dbg); 4798 4799 r = 0; 4800 4801 unlock_out: 4802 vcpu_put(vcpu); 4803 4804 return r; 4805 } 4806 4807 /* 4808 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 4809 * we have asm/x86/processor.h 4810 */ 4811 struct fxsave { 4812 u16 cwd; 4813 u16 swd; 4814 u16 twd; 4815 u16 fop; 4816 u64 rip; 4817 u64 rdp; 4818 u32 mxcsr; 4819 u32 mxcsr_mask; 4820 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4821 #ifdef CONFIG_X86_64 4822 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4823 #else 4824 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4825 #endif 4826 }; 4827 4828 /* 4829 * Translate a guest virtual address to a guest physical address. 
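 * The lookup goes through the vcpu's current MMU context (gva_to_gpa), so
 * it reflects whatever paging mode the guest is in; an unmapped address is
 * reported through tr->valid rather than as an error.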
4830 */ 4831 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 4832 struct kvm_translation *tr) 4833 { 4834 unsigned long vaddr = tr->linear_address; 4835 gpa_t gpa; 4836 4837 vcpu_load(vcpu); 4838 down_read(&vcpu->kvm->slots_lock); 4839 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 4840 up_read(&vcpu->kvm->slots_lock); 4841 tr->physical_address = gpa; 4842 tr->valid = gpa != UNMAPPED_GVA; 4843 tr->writeable = 1; 4844 tr->usermode = 0; 4845 vcpu_put(vcpu); 4846 4847 return 0; 4848 } 4849 4850 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4851 { 4852 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4853 4854 vcpu_load(vcpu); 4855 4856 memcpy(fpu->fpr, fxsave->st_space, 128); 4857 fpu->fcw = fxsave->cwd; 4858 fpu->fsw = fxsave->swd; 4859 fpu->ftwx = fxsave->twd; 4860 fpu->last_opcode = fxsave->fop; 4861 fpu->last_ip = fxsave->rip; 4862 fpu->last_dp = fxsave->rdp; 4863 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 4864 4865 vcpu_put(vcpu); 4866 4867 return 0; 4868 } 4869 4870 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4871 { 4872 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4873 4874 vcpu_load(vcpu); 4875 4876 memcpy(fxsave->st_space, fpu->fpr, 128); 4877 fxsave->cwd = fpu->fcw; 4878 fxsave->swd = fpu->fsw; 4879 fxsave->twd = fpu->ftwx; 4880 fxsave->fop = fpu->last_opcode; 4881 fxsave->rip = fpu->last_ip; 4882 fxsave->rdp = fpu->last_dp; 4883 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 4884 4885 vcpu_put(vcpu); 4886 4887 return 0; 4888 } 4889 4890 void fx_init(struct kvm_vcpu *vcpu) 4891 { 4892 unsigned after_mxcsr_mask; 4893 4894 /* 4895 * Touch the fpu the first time in a non-atomic context: if this is 4896 * the first fpu instruction, the exception handler will fire before 4897 * the instruction returns and it'll have to 4898 * allocate ram with GFP_KERNEL.
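 * The actual guest-image reset below runs under preempt_disable(), where
 * that allocation would not be allowed.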
4899 */ 4900 if (!used_math()) 4901 kvm_fx_save(&vcpu->arch.host_fx_image); 4902 4903 /* Initialize guest FPU by resetting ours and saving into guest's */ 4904 preempt_disable(); 4905 kvm_fx_save(&vcpu->arch.host_fx_image); 4906 kvm_fx_finit(); 4907 kvm_fx_save(&vcpu->arch.guest_fx_image); 4908 kvm_fx_restore(&vcpu->arch.host_fx_image); 4909 preempt_enable(); 4910 4911 vcpu->arch.cr0 |= X86_CR0_ET; 4912 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 4913 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 4914 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 4915 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 4916 } 4917 EXPORT_SYMBOL_GPL(fx_init); 4918 4919 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 4920 { 4921 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 4922 return; 4923 4924 vcpu->guest_fpu_loaded = 1; 4925 kvm_fx_save(&vcpu->arch.host_fx_image); 4926 kvm_fx_restore(&vcpu->arch.guest_fx_image); 4927 } 4928 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 4929 4930 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 4931 { 4932 if (!vcpu->guest_fpu_loaded) 4933 return; 4934 4935 vcpu->guest_fpu_loaded = 0; 4936 kvm_fx_save(&vcpu->arch.guest_fx_image); 4937 kvm_fx_restore(&vcpu->arch.host_fx_image); 4938 ++vcpu->stat.fpu_reload; 4939 } 4940 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 4941 4942 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 4943 { 4944 if (vcpu->arch.time_page) { 4945 kvm_release_page_dirty(vcpu->arch.time_page); 4946 vcpu->arch.time_page = NULL; 4947 } 4948 4949 kvm_x86_ops->vcpu_free(vcpu); 4950 } 4951 4952 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 4953 unsigned int id) 4954 { 4955 return kvm_x86_ops->vcpu_create(kvm, id); 4956 } 4957 4958 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 4959 { 4960 int r; 4961 4962 /* We do fxsave: this must be aligned. */ 4963 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 4964 4965 vcpu->arch.mtrr_state.have_fixed = 1; 4966 vcpu_load(vcpu); 4967 r = kvm_arch_vcpu_reset(vcpu); 4968 if (r == 0) 4969 r = kvm_mmu_setup(vcpu); 4970 vcpu_put(vcpu); 4971 if (r < 0) 4972 goto free_vcpu; 4973 4974 return 0; 4975 free_vcpu: 4976 kvm_x86_ops->vcpu_free(vcpu); 4977 return r; 4978 } 4979 4980 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 4981 { 4982 vcpu_load(vcpu); 4983 kvm_mmu_unload(vcpu); 4984 vcpu_put(vcpu); 4985 4986 kvm_x86_ops->vcpu_free(vcpu); 4987 } 4988 4989 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4990 { 4991 vcpu->arch.nmi_pending = false; 4992 vcpu->arch.nmi_injected = false; 4993 4994 vcpu->arch.switch_db_regs = 0; 4995 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 4996 vcpu->arch.dr6 = DR6_FIXED_1; 4997 vcpu->arch.dr7 = DR7_FIXED_1; 4998 4999 return kvm_x86_ops->vcpu_reset(vcpu); 5000 } 5001 5002 int kvm_arch_hardware_enable(void *garbage) 5003 { 5004 /* 5005 * Since this may be called from a hotplug notification, 5006 * we can't get the CPU frequency directly.
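 * Clearing cpu_tsc_khz here just defers the lookup: it is recomputed the
 * next time a vcpu is loaded on this CPU.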
5007 */ 5008 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 5009 int cpu = raw_smp_processor_id(); 5010 per_cpu(cpu_tsc_khz, cpu) = 0; 5011 } 5012 5013 kvm_shared_msr_cpu_online(); 5014 5015 return kvm_x86_ops->hardware_enable(garbage); 5016 } 5017 5018 void kvm_arch_hardware_disable(void *garbage) 5019 { 5020 kvm_x86_ops->hardware_disable(garbage); 5021 drop_user_return_notifiers(garbage); 5022 } 5023 5024 int kvm_arch_hardware_setup(void) 5025 { 5026 return kvm_x86_ops->hardware_setup(); 5027 } 5028 5029 void kvm_arch_hardware_unsetup(void) 5030 { 5031 kvm_x86_ops->hardware_unsetup(); 5032 } 5033 5034 void kvm_arch_check_processor_compat(void *rtn) 5035 { 5036 kvm_x86_ops->check_processor_compatibility(rtn); 5037 } 5038 5039 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 5040 { 5041 struct page *page; 5042 struct kvm *kvm; 5043 int r; 5044 5045 BUG_ON(vcpu->kvm == NULL); 5046 kvm = vcpu->kvm; 5047 5048 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5049 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5050 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5051 else 5052 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 5053 5054 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 5055 if (!page) { 5056 r = -ENOMEM; 5057 goto fail; 5058 } 5059 vcpu->arch.pio_data = page_address(page); 5060 5061 r = kvm_mmu_create(vcpu); 5062 if (r < 0) 5063 goto fail_free_pio_data; 5064 5065 if (irqchip_in_kernel(kvm)) { 5066 r = kvm_create_lapic(vcpu); 5067 if (r < 0) 5068 goto fail_mmu_destroy; 5069 } 5070 5071 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 5072 GFP_KERNEL); 5073 if (!vcpu->arch.mce_banks) { 5074 r = -ENOMEM; 5075 goto fail_mmu_destroy; 5076 } 5077 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5078 5079 return 0; 5080 5081 fail_mmu_destroy: 5082 kvm_mmu_destroy(vcpu); 5083 fail_free_pio_data: 5084 free_page((unsigned long)vcpu->arch.pio_data); 5085 fail: 5086 return r; 5087 } 5088 5089 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5090 { 5091 kvm_free_lapic(vcpu); 5092 down_read(&vcpu->kvm->slots_lock); 5093 kvm_mmu_destroy(vcpu); 5094 up_read(&vcpu->kvm->slots_lock); 5095 free_page((unsigned long)vcpu->arch.pio_data); 5096 } 5097 5098 struct kvm *kvm_arch_create_vm(void) 5099 { 5100 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 5101 5102 if (!kvm) 5103 return ERR_PTR(-ENOMEM); 5104 5105 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5106 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5107 5108 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 5109 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 5110 5111 rdtscll(kvm->arch.vm_init_tsc); 5112 5113 return kvm; 5114 } 5115 5116 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 5117 { 5118 vcpu_load(vcpu); 5119 kvm_mmu_unload(vcpu); 5120 vcpu_put(vcpu); 5121 } 5122 5123 static void kvm_free_vcpus(struct kvm *kvm) 5124 { 5125 unsigned int i; 5126 struct kvm_vcpu *vcpu; 5127 5128 /* 5129 * Unpin any mmu pages first. 
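 * kvm_mmu_unload() drops each vcpu's reference to its shadow roots, so all
 * of them are released before the vcpus themselves are freed below.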
5130 */ 5131 kvm_for_each_vcpu(i, vcpu, kvm) 5132 kvm_unload_vcpu_mmu(vcpu); 5133 kvm_for_each_vcpu(i, vcpu, kvm) 5134 kvm_arch_vcpu_free(vcpu); 5135 5136 mutex_lock(&kvm->lock); 5137 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) 5138 kvm->vcpus[i] = NULL; 5139 5140 atomic_set(&kvm->online_vcpus, 0); 5141 mutex_unlock(&kvm->lock); 5142 } 5143 5144 void kvm_arch_sync_events(struct kvm *kvm) 5145 { 5146 kvm_free_all_assigned_devices(kvm); 5147 } 5148 5149 void kvm_arch_destroy_vm(struct kvm *kvm) 5150 { 5151 kvm_iommu_unmap_guest(kvm); 5152 kvm_free_pit(kvm); 5153 kfree(kvm->arch.vpic); 5154 kfree(kvm->arch.vioapic); 5155 kvm_free_vcpus(kvm); 5156 kvm_free_physmem(kvm); 5157 if (kvm->arch.apic_access_page) 5158 put_page(kvm->arch.apic_access_page); 5159 if (kvm->arch.ept_identity_pagetable) 5160 put_page(kvm->arch.ept_identity_pagetable); 5161 kfree(kvm); 5162 } 5163 5164 int kvm_arch_set_memory_region(struct kvm *kvm, 5165 struct kvm_userspace_memory_region *mem, 5166 struct kvm_memory_slot old, 5167 int user_alloc) 5168 { 5169 int npages = mem->memory_size >> PAGE_SHIFT; 5170 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 5171 5172 /* To keep backward compatibility with older userspace, 5173 * x86 needs to handle the !user_alloc case. 5174 */ 5175 if (!user_alloc) { 5176 if (npages && !old.rmap) { 5177 unsigned long userspace_addr; 5178 5179 down_write(&current->mm->mmap_sem); 5180 userspace_addr = do_mmap(NULL, 0, 5181 npages * PAGE_SIZE, 5182 PROT_READ | PROT_WRITE, 5183 MAP_PRIVATE | MAP_ANONYMOUS, 5184 0); 5185 up_write(&current->mm->mmap_sem); 5186 5187 if (IS_ERR((void *)userspace_addr)) 5188 return PTR_ERR((void *)userspace_addr); 5189 5190 /* set userspace_addr atomically for kvm_hva_to_rmapp */ 5191 spin_lock(&kvm->mmu_lock); 5192 memslot->userspace_addr = userspace_addr; 5193 spin_unlock(&kvm->mmu_lock); 5194 } else { 5195 if (!old.user_alloc && old.rmap) { 5196 int ret; 5197 5198 down_write(&current->mm->mmap_sem); 5199 ret = do_munmap(current->mm, old.userspace_addr, 5200 old.npages * PAGE_SIZE); 5201 up_write(&current->mm->mmap_sem); 5202 if (ret < 0) 5203 printk(KERN_WARNING 5204 "kvm_vm_ioctl_set_memory_region: " 5205 "failed to munmap memory\n"); 5206 } 5207 } 5208 } 5209 5210 spin_lock(&kvm->mmu_lock); 5211 if (!kvm->arch.n_requested_mmu_pages) { 5212 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5213 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 5214 } 5215 5216 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5217 spin_unlock(&kvm->mmu_lock); 5218 5219 return 0; 5220 } 5221 5222 void kvm_arch_flush_shadow(struct kvm *kvm) 5223 { 5224 kvm_mmu_zap_all(kvm); 5225 kvm_reload_remote_mmus(kvm); 5226 } 5227 5228 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 5229 { 5230 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 5231 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 5232 || vcpu->arch.nmi_pending || 5233 (kvm_arch_interrupt_allowed(vcpu) && 5234 kvm_cpu_has_interrupt(vcpu)); 5235 } 5236 5237 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 5238 { 5239 int me; 5240 int cpu = vcpu->cpu; 5241 5242 if (waitqueue_active(&vcpu->wq)) { 5243 wake_up_interruptible(&vcpu->wq); 5244 ++vcpu->stat.halt_wakeup; 5245 } 5246 5247 me = get_cpu(); 5248 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5249 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5250 smp_send_reschedule(cpu); 5251 put_cpu(); 5252 } 5253 5254 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 5255 { 5256 return kvm_x86_ops->interrupt_allowed(vcpu); 5257 } 5258 5259 unsigned long
kvm_get_rflags(struct kvm_vcpu *vcpu) 5260 { 5261 unsigned long rflags; 5262 5263 rflags = kvm_x86_ops->get_rflags(vcpu); 5264 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5265 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5266 return rflags; 5267 } 5268 EXPORT_SYMBOL_GPL(kvm_get_rflags); 5269 5270 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5271 { 5272 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5273 vcpu->arch.singlestep_cs == 5274 get_segment_selector(vcpu, VCPU_SREG_CS) && 5275 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) 5276 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; 5277 kvm_x86_ops->set_rflags(vcpu, rflags); 5278 } 5279 EXPORT_SYMBOL_GPL(kvm_set_rflags); 5280 5281 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5282 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5283 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5284 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5285 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5286 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 5287 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 5288 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 5289 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5290 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5291 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5292