/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)

#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P

/* EFER defaults:
 * - enable syscall by default because it's emulated by KVM
 * - enable LME and LMA by default on 64-bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload",
	  VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	kvm_get_gdt(&gdt);
	table_base = gdt.base;

	if (selector & 4) {		/* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	/* The APIC base is cached in vcpu->arch for both the in-kernel
	 * and the userspace irqchip case. */
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;

	if (vcpu->arch.exception.pending) {
		switch (vcpu->arch.exception.nr) {
		case DF_VECTOR:
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
			return;
		case PF_VECTOR:
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
			return;
		default:
			/* replace the previous exception with a new one in the
			   hope that instruction re-execution will regenerate
			   the lost exception */
			vcpu->arch.exception.pending = false;
			break;
		}
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true,
 * return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

/*
 * Load the PAE pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
				  offset * sizeof(u64), sizeof(pdpte));
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
	bool changed = true;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;

			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;

	kvm_mmu_reset_context(vcpu);
	return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
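	/*
	 * Note: pdptr_bits below collects the CR4 bits (PGE, PSE, PAE)
	 * whose modification, while PAE paging is active, requires the
	 * cached PDPTRs to be re-read and re-validated via load_pdptrs().
	 */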
	unsigned long old_cr4 = vcpu->arch.cr4;
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
		vcpu->arch.cr3 = cr3;
		vcpu->arch.mmu.new_cr3(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
		return;
	}
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static inline u32 bit(int bitno)
{
	return 1 << (bitno & 31);
}

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in the beginning of the list.
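 * (KVM_SAVE_MSRS_BEGIN below counts those leading kvm-specific entries,
 * i.e. MSR_KVM_SYSTEM_TIME and MSR_KVM_WALL_CLOCK, so the host-capability
 * filtering starts after them.)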
 */

#define KVM_SAVE_MSRS_BEGIN	2
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	static int version;
	struct pvclock_wall_clock wc;
	struct timespec now, sys, boot;

	if (!wall_clock)
		return;

	version++;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_write_guest_time below) to the
	 * wall clock specified here.  Guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
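	 *
	 * (Illustrative example: if the host wall clock reads 1,000,000 s
	 *  and the host has been up for 400 s on the monotonic clock,
	 *  boot = 999,600 s is written below; the guest adds its kvmclock
	 *  system time, i.e. nanoseconds since host boot, to recover the
	 *  current wall clock time.)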
600 */ 601 now = current_kernel_time(); 602 ktime_get_ts(&sys); 603 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 604 605 wc.sec = boot.tv_sec; 606 wc.nsec = boot.tv_nsec; 607 wc.version = version; 608 609 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 610 611 version++; 612 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 613 } 614 615 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 616 { 617 uint32_t quotient, remainder; 618 619 /* Don't try to replace with do_div(), this one calculates 620 * "(dividend << 32) / divisor" */ 621 __asm__ ( "divl %4" 622 : "=a" (quotient), "=d" (remainder) 623 : "0" (0), "1" (dividend), "r" (divisor) ); 624 return quotient; 625 } 626 627 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 628 { 629 uint64_t nsecs = 1000000000LL; 630 int32_t shift = 0; 631 uint64_t tps64; 632 uint32_t tps32; 633 634 tps64 = tsc_khz * 1000LL; 635 while (tps64 > nsecs*2) { 636 tps64 >>= 1; 637 shift--; 638 } 639 640 tps32 = (uint32_t)tps64; 641 while (tps32 <= (uint32_t)nsecs) { 642 tps32 <<= 1; 643 shift++; 644 } 645 646 hv_clock->tsc_shift = shift; 647 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 648 649 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 650 __func__, tsc_khz, hv_clock->tsc_shift, 651 hv_clock->tsc_to_system_mul); 652 } 653 654 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 655 656 static void kvm_write_guest_time(struct kvm_vcpu *v) 657 { 658 struct timespec ts; 659 unsigned long flags; 660 struct kvm_vcpu_arch *vcpu = &v->arch; 661 void *shared_kaddr; 662 unsigned long this_tsc_khz; 663 664 if ((!vcpu->time_page)) 665 return; 666 667 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 668 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 669 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 670 vcpu->hv_clock_tsc_khz = this_tsc_khz; 671 } 672 put_cpu_var(cpu_tsc_khz); 673 674 /* Keep irq disabled to prevent changes to the clock */ 675 local_irq_save(flags); 676 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 677 ktime_get_ts(&ts); 678 local_irq_restore(flags); 679 680 /* With all the info we got, fill in the values */ 681 682 vcpu->hv_clock.system_time = ts.tv_nsec + 683 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 684 685 /* 686 * The interface expects us to write an even number signaling that the 687 * update is finished. Since the guest won't see the intermediate 688 * state, we just increase by 2 at the end. 689 */ 690 vcpu->hv_clock.version += 2; 691 692 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 693 694 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 695 sizeof(vcpu->hv_clock)); 696 697 kunmap_atomic(shared_kaddr, KM_USER0); 698 699 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 700 } 701 702 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 703 { 704 struct kvm_vcpu_arch *vcpu = &v->arch; 705 706 if (!vcpu->time_page) 707 return 0; 708 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 709 return 1; 710 } 711 712 static bool msr_mtrr_valid(unsigned msr) 713 { 714 switch (msr) { 715 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 716 case MSR_MTRRfix64K_00000: 717 case MSR_MTRRfix16K_80000: 718 case MSR_MTRRfix16K_A0000: 719 case MSR_MTRRfix4K_C0000: 720 case MSR_MTRRfix4K_C8000: 721 case MSR_MTRRfix4K_D0000: 722 case MSR_MTRRfix4K_D8000: 723 case MSR_MTRRfix4K_E0000: 724 case MSR_MTRRfix4K_E8000: 725 case MSR_MTRRfix4K_F0000: 726 case MSR_MTRRfix4K_F8000: 727 case MSR_MTRRdefType: 728 case MSR_IA32_CR_PAT: 729 return true; 730 case 0x2f8: 731 return true; 732 } 733 return false; 734 } 735 736 static bool valid_pat_type(unsigned t) 737 { 738 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 739 } 740 741 static bool valid_mtrr_type(unsigned t) 742 { 743 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 744 } 745 746 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 747 { 748 int i; 749 750 if (!msr_mtrr_valid(msr)) 751 return false; 752 753 if (msr == MSR_IA32_CR_PAT) { 754 for (i = 0; i < 8; i++) 755 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 756 return false; 757 return true; 758 } else if (msr == MSR_MTRRdefType) { 759 if (data & ~0xcff) 760 return false; 761 return valid_mtrr_type(data & 0xff); 762 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 763 for (i = 0; i < 8 ; i++) 764 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 765 return false; 766 return true; 767 } 768 769 /* variable MTRRs */ 770 return valid_mtrr_type(data & 0xff); 771 } 772 773 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 774 { 775 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 776 777 if (!mtrr_valid(vcpu, msr, data)) 778 return 1; 779 780 if (msr == MSR_MTRRdefType) { 781 vcpu->arch.mtrr_state.def_type = data; 782 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 783 } else if (msr == MSR_MTRRfix64K_00000) 784 p[0] = data; 785 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 786 p[1 + msr - MSR_MTRRfix16K_80000] = data; 787 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 788 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 789 else if (msr == MSR_IA32_CR_PAT) 790 vcpu->arch.pat = data; 791 else { /* Variable MTRRs */ 792 int idx, is_mtrr_mask; 793 u64 *pt; 794 795 idx = (msr - 0x200) / 2; 796 is_mtrr_mask = msr - 0x200 - 2 * idx; 797 if (!is_mtrr_mask) 798 pt = 799 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 800 else 801 pt = 802 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 803 *pt = data; 804 } 805 806 kvm_mmu_reset_context(vcpu); 807 return 0; 808 } 809 810 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 811 { 812 u64 mcg_cap = vcpu->arch.mcg_cap; 813 unsigned bank_num = mcg_cap & 0xff; 814 815 switch (msr) { 816 case MSR_IA32_MCG_STATUS: 817 vcpu->arch.mcg_status = data; 818 break; 819 case MSR_IA32_MCG_CTL: 820 if (!(mcg_cap & MCG_CTL_P)) 821 return 1; 822 if (data != 0 && data != ~(u64)0) 823 return -1; 824 vcpu->arch.mcg_ctl = data; 825 break; 826 default: 827 if (msr >= MSR_IA32_MC0_CTL && 828 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 829 u32 offset = msr - MSR_IA32_MC0_CTL; 830 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 831 if ((offset & 0x3) == 0 && 832 data != 0 && data != ~(u64)0) 833 return -1; 834 vcpu->arch.mce_banks[offset] = data; 835 break; 836 } 837 return 1; 838 } 839 return 0; 840 } 841 842 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 843 { 844 struct kvm *kvm = vcpu->kvm; 845 int lm = is_long_mode(vcpu); 846 u8 *blob_addr = lm ? 
(u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 847 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 848 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 849 : kvm->arch.xen_hvm_config.blob_size_32; 850 u32 page_num = data & ~PAGE_MASK; 851 u64 page_addr = data & PAGE_MASK; 852 u8 *page; 853 int r; 854 855 r = -E2BIG; 856 if (page_num >= blob_size) 857 goto out; 858 r = -ENOMEM; 859 page = kzalloc(PAGE_SIZE, GFP_KERNEL); 860 if (!page) 861 goto out; 862 r = -EFAULT; 863 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) 864 goto out_free; 865 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 866 goto out_free; 867 r = 0; 868 out_free: 869 kfree(page); 870 out: 871 return r; 872 } 873 874 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 875 { 876 switch (msr) { 877 case MSR_EFER: 878 set_efer(vcpu, data); 879 break; 880 case MSR_K7_HWCR: 881 data &= ~(u64)0x40; /* ignore flush filter disable */ 882 if (data != 0) { 883 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 884 data); 885 return 1; 886 } 887 break; 888 case MSR_FAM10H_MMIO_CONF_BASE: 889 if (data != 0) { 890 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 891 "0x%llx\n", data); 892 return 1; 893 } 894 break; 895 case MSR_AMD64_NB_CFG: 896 break; 897 case MSR_IA32_DEBUGCTLMSR: 898 if (!data) { 899 /* We support the non-activated case already */ 900 break; 901 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 902 /* Values other than LBR and BTF are vendor-specific, 903 thus reserved and should throw a #GP */ 904 return 1; 905 } 906 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 907 __func__, data); 908 break; 909 case MSR_IA32_UCODE_REV: 910 case MSR_IA32_UCODE_WRITE: 911 case MSR_VM_HSAVE_PA: 912 case MSR_AMD64_PATCH_LOADER: 913 break; 914 case 0x200 ... 0x2ff: 915 return set_msr_mtrr(vcpu, msr, data); 916 case MSR_IA32_APICBASE: 917 kvm_set_apic_base(vcpu, data); 918 break; 919 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 920 return kvm_x2apic_msr_write(vcpu, msr, data); 921 case MSR_IA32_MISC_ENABLE: 922 vcpu->arch.ia32_misc_enable_msr = data; 923 break; 924 case MSR_KVM_WALL_CLOCK: 925 vcpu->kvm->arch.wall_clock = data; 926 kvm_write_wall_clock(vcpu->kvm, data); 927 break; 928 case MSR_KVM_SYSTEM_TIME: { 929 if (vcpu->arch.time_page) { 930 kvm_release_page_dirty(vcpu->arch.time_page); 931 vcpu->arch.time_page = NULL; 932 } 933 934 vcpu->arch.time = data; 935 936 /* we verify if the enable bit is set... */ 937 if (!(data & 1)) 938 break; 939 940 /* ...but clean it before doing the actual write */ 941 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 942 943 vcpu->arch.time_page = 944 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 945 946 if (is_error_page(vcpu->arch.time_page)) { 947 kvm_release_page_clean(vcpu->arch.time_page); 948 vcpu->arch.time_page = NULL; 949 } 950 951 kvm_request_guest_time_update(vcpu); 952 break; 953 } 954 case MSR_IA32_MCG_CTL: 955 case MSR_IA32_MCG_STATUS: 956 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 957 return set_msr_mce(vcpu, msr, data); 958 959 /* Performance counters are not protected by a CPUID bit, 960 * so we should check all of them in the generic path for the sake of 961 * cross vendor migration. 962 * Writing a zero into the event select MSRs disables them, 963 * which we perfectly emulate ;-). Any other value should be at least 964 * reported, some guests depend on them. 
965 */ 966 case MSR_P6_EVNTSEL0: 967 case MSR_P6_EVNTSEL1: 968 case MSR_K7_EVNTSEL0: 969 case MSR_K7_EVNTSEL1: 970 case MSR_K7_EVNTSEL2: 971 case MSR_K7_EVNTSEL3: 972 if (data != 0) 973 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 974 "0x%x data 0x%llx\n", msr, data); 975 break; 976 /* at least RHEL 4 unconditionally writes to the perfctr registers, 977 * so we ignore writes to make it happy. 978 */ 979 case MSR_P6_PERFCTR0: 980 case MSR_P6_PERFCTR1: 981 case MSR_K7_PERFCTR0: 982 case MSR_K7_PERFCTR1: 983 case MSR_K7_PERFCTR2: 984 case MSR_K7_PERFCTR3: 985 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 986 "0x%x data 0x%llx\n", msr, data); 987 break; 988 default: 989 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 990 return xen_hvm_config(vcpu, data); 991 if (!ignore_msrs) { 992 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 993 msr, data); 994 return 1; 995 } else { 996 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 997 msr, data); 998 break; 999 } 1000 } 1001 return 0; 1002 } 1003 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 1004 1005 1006 /* 1007 * Reads an msr value (of 'msr_index') into 'pdata'. 1008 * Returns 0 on success, non-0 otherwise. 1009 * Assumes vcpu_load() was already called. 1010 */ 1011 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1012 { 1013 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 1014 } 1015 1016 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1017 { 1018 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1019 1020 if (!msr_mtrr_valid(msr)) 1021 return 1; 1022 1023 if (msr == MSR_MTRRdefType) 1024 *pdata = vcpu->arch.mtrr_state.def_type + 1025 (vcpu->arch.mtrr_state.enabled << 10); 1026 else if (msr == MSR_MTRRfix64K_00000) 1027 *pdata = p[0]; 1028 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1029 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 1030 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1031 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 1032 else if (msr == MSR_IA32_CR_PAT) 1033 *pdata = vcpu->arch.pat; 1034 else { /* Variable MTRRs */ 1035 int idx, is_mtrr_mask; 1036 u64 *pt; 1037 1038 idx = (msr - 0x200) / 2; 1039 is_mtrr_mask = msr - 0x200 - 2 * idx; 1040 if (!is_mtrr_mask) 1041 pt = 1042 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1043 else 1044 pt = 1045 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1046 *pdata = *pt; 1047 } 1048 1049 return 0; 1050 } 1051 1052 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1053 { 1054 u64 data; 1055 u64 mcg_cap = vcpu->arch.mcg_cap; 1056 unsigned bank_num = mcg_cap & 0xff; 1057 1058 switch (msr) { 1059 case MSR_IA32_P5_MC_ADDR: 1060 case MSR_IA32_P5_MC_TYPE: 1061 data = 0; 1062 break; 1063 case MSR_IA32_MCG_CAP: 1064 data = vcpu->arch.mcg_cap; 1065 break; 1066 case MSR_IA32_MCG_CTL: 1067 if (!(mcg_cap & MCG_CTL_P)) 1068 return 1; 1069 data = vcpu->arch.mcg_ctl; 1070 break; 1071 case MSR_IA32_MCG_STATUS: 1072 data = vcpu->arch.mcg_status; 1073 break; 1074 default: 1075 if (msr >= MSR_IA32_MC0_CTL && 1076 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1077 u32 offset = msr - MSR_IA32_MC0_CTL; 1078 data = vcpu->arch.mce_banks[offset]; 1079 break; 1080 } 1081 return 1; 1082 } 1083 *pdata = data; 1084 return 0; 1085 } 1086 1087 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1088 { 1089 u64 data; 1090 1091 switch (msr) { 1092 case MSR_IA32_PLATFORM_ID: 1093 case MSR_IA32_UCODE_REV: 1094 case MSR_IA32_EBL_CR_POWERON: 1095 case MSR_IA32_DEBUGCTLMSR: 1096 case 
MSR_IA32_LASTBRANCHFROMIP: 1097 case MSR_IA32_LASTBRANCHTOIP: 1098 case MSR_IA32_LASTINTFROMIP: 1099 case MSR_IA32_LASTINTTOIP: 1100 case MSR_K8_SYSCFG: 1101 case MSR_K7_HWCR: 1102 case MSR_VM_HSAVE_PA: 1103 case MSR_P6_PERFCTR0: 1104 case MSR_P6_PERFCTR1: 1105 case MSR_P6_EVNTSEL0: 1106 case MSR_P6_EVNTSEL1: 1107 case MSR_K7_EVNTSEL0: 1108 case MSR_K7_PERFCTR0: 1109 case MSR_K8_INT_PENDING_MSG: 1110 case MSR_AMD64_NB_CFG: 1111 case MSR_FAM10H_MMIO_CONF_BASE: 1112 data = 0; 1113 break; 1114 case MSR_MTRRcap: 1115 data = 0x500 | KVM_NR_VAR_MTRR; 1116 break; 1117 case 0x200 ... 0x2ff: 1118 return get_msr_mtrr(vcpu, msr, pdata); 1119 case 0xcd: /* fsb frequency */ 1120 data = 3; 1121 break; 1122 case MSR_IA32_APICBASE: 1123 data = kvm_get_apic_base(vcpu); 1124 break; 1125 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1126 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1127 break; 1128 case MSR_IA32_MISC_ENABLE: 1129 data = vcpu->arch.ia32_misc_enable_msr; 1130 break; 1131 case MSR_IA32_PERF_STATUS: 1132 /* TSC increment by tick */ 1133 data = 1000ULL; 1134 /* CPU multiplier */ 1135 data |= (((uint64_t)4ULL) << 40); 1136 break; 1137 case MSR_EFER: 1138 data = vcpu->arch.shadow_efer; 1139 break; 1140 case MSR_KVM_WALL_CLOCK: 1141 data = vcpu->kvm->arch.wall_clock; 1142 break; 1143 case MSR_KVM_SYSTEM_TIME: 1144 data = vcpu->arch.time; 1145 break; 1146 case MSR_IA32_P5_MC_ADDR: 1147 case MSR_IA32_P5_MC_TYPE: 1148 case MSR_IA32_MCG_CAP: 1149 case MSR_IA32_MCG_CTL: 1150 case MSR_IA32_MCG_STATUS: 1151 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1152 return get_msr_mce(vcpu, msr, pdata); 1153 default: 1154 if (!ignore_msrs) { 1155 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1156 return 1; 1157 } else { 1158 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 1159 data = 0; 1160 } 1161 break; 1162 } 1163 *pdata = data; 1164 return 0; 1165 } 1166 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1167 1168 /* 1169 * Read or write a bunch of msrs. All parameters are kernel addresses. 1170 * 1171 * @return number of msrs set successfully. 1172 */ 1173 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 1174 struct kvm_msr_entry *entries, 1175 int (*do_msr)(struct kvm_vcpu *vcpu, 1176 unsigned index, u64 *data)) 1177 { 1178 int i; 1179 1180 vcpu_load(vcpu); 1181 1182 down_read(&vcpu->kvm->slots_lock); 1183 for (i = 0; i < msrs->nmsrs; ++i) 1184 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1185 break; 1186 up_read(&vcpu->kvm->slots_lock); 1187 1188 vcpu_put(vcpu); 1189 1190 return i; 1191 } 1192 1193 /* 1194 * Read or write a bunch of msrs. Parameters are user addresses. 1195 * 1196 * @return number of msrs set successfully. 
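 * (A return value smaller than the requested nmsrs means processing
 * stopped at the first MSR the handler rejected; the preceding entries
 * were applied.)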
1197 */ 1198 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 1199 int (*do_msr)(struct kvm_vcpu *vcpu, 1200 unsigned index, u64 *data), 1201 int writeback) 1202 { 1203 struct kvm_msrs msrs; 1204 struct kvm_msr_entry *entries; 1205 int r, n; 1206 unsigned size; 1207 1208 r = -EFAULT; 1209 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 1210 goto out; 1211 1212 r = -E2BIG; 1213 if (msrs.nmsrs >= MAX_IO_MSRS) 1214 goto out; 1215 1216 r = -ENOMEM; 1217 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1218 entries = vmalloc(size); 1219 if (!entries) 1220 goto out; 1221 1222 r = -EFAULT; 1223 if (copy_from_user(entries, user_msrs->entries, size)) 1224 goto out_free; 1225 1226 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1227 if (r < 0) 1228 goto out_free; 1229 1230 r = -EFAULT; 1231 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1232 goto out_free; 1233 1234 r = n; 1235 1236 out_free: 1237 vfree(entries); 1238 out: 1239 return r; 1240 } 1241 1242 int kvm_dev_ioctl_check_extension(long ext) 1243 { 1244 int r; 1245 1246 switch (ext) { 1247 case KVM_CAP_IRQCHIP: 1248 case KVM_CAP_HLT: 1249 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1250 case KVM_CAP_SET_TSS_ADDR: 1251 case KVM_CAP_EXT_CPUID: 1252 case KVM_CAP_CLOCKSOURCE: 1253 case KVM_CAP_PIT: 1254 case KVM_CAP_NOP_IO_DELAY: 1255 case KVM_CAP_MP_STATE: 1256 case KVM_CAP_SYNC_MMU: 1257 case KVM_CAP_REINJECT_CONTROL: 1258 case KVM_CAP_IRQ_INJECT_STATUS: 1259 case KVM_CAP_ASSIGN_DEV_IRQ: 1260 case KVM_CAP_IRQFD: 1261 case KVM_CAP_IOEVENTFD: 1262 case KVM_CAP_PIT2: 1263 case KVM_CAP_PIT_STATE2: 1264 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1265 case KVM_CAP_XEN_HVM: 1266 case KVM_CAP_ADJUST_CLOCK: 1267 r = 1; 1268 break; 1269 case KVM_CAP_COALESCED_MMIO: 1270 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1271 break; 1272 case KVM_CAP_VAPIC: 1273 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1274 break; 1275 case KVM_CAP_NR_VCPUS: 1276 r = KVM_MAX_VCPUS; 1277 break; 1278 case KVM_CAP_NR_MEMSLOTS: 1279 r = KVM_MEMORY_SLOTS; 1280 break; 1281 case KVM_CAP_PV_MMU: /* obsolete */ 1282 r = 0; 1283 break; 1284 case KVM_CAP_IOMMU: 1285 r = iommu_found(); 1286 break; 1287 case KVM_CAP_MCE: 1288 r = KVM_MAX_MCE_BANKS; 1289 break; 1290 default: 1291 r = 0; 1292 break; 1293 } 1294 return r; 1295 1296 } 1297 1298 long kvm_arch_dev_ioctl(struct file *filp, 1299 unsigned int ioctl, unsigned long arg) 1300 { 1301 void __user *argp = (void __user *)arg; 1302 long r; 1303 1304 switch (ioctl) { 1305 case KVM_GET_MSR_INDEX_LIST: { 1306 struct kvm_msr_list __user *user_msr_list = argp; 1307 struct kvm_msr_list msr_list; 1308 unsigned n; 1309 1310 r = -EFAULT; 1311 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1312 goto out; 1313 n = msr_list.nmsrs; 1314 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1315 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1316 goto out; 1317 r = -E2BIG; 1318 if (n < msr_list.nmsrs) 1319 goto out; 1320 r = -EFAULT; 1321 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1322 num_msrs_to_save * sizeof(u32))) 1323 goto out; 1324 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 1325 &emulated_msrs, 1326 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1327 goto out; 1328 r = 0; 1329 break; 1330 } 1331 case KVM_GET_SUPPORTED_CPUID: { 1332 struct kvm_cpuid2 __user *cpuid_arg = argp; 1333 struct kvm_cpuid2 cpuid; 1334 1335 r = -EFAULT; 1336 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1337 goto out; 1338 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1339 
cpuid_arg->entries); 1340 if (r) 1341 goto out; 1342 1343 r = -EFAULT; 1344 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1345 goto out; 1346 r = 0; 1347 break; 1348 } 1349 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 1350 u64 mce_cap; 1351 1352 mce_cap = KVM_MCE_CAP_SUPPORTED; 1353 r = -EFAULT; 1354 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 1355 goto out; 1356 r = 0; 1357 break; 1358 } 1359 default: 1360 r = -EINVAL; 1361 } 1362 out: 1363 return r; 1364 } 1365 1366 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1367 { 1368 kvm_x86_ops->vcpu_load(vcpu, cpu); 1369 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1370 unsigned long khz = cpufreq_quick_get(cpu); 1371 if (!khz) 1372 khz = tsc_khz; 1373 per_cpu(cpu_tsc_khz, cpu) = khz; 1374 } 1375 kvm_request_guest_time_update(vcpu); 1376 } 1377 1378 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1379 { 1380 kvm_x86_ops->vcpu_put(vcpu); 1381 kvm_put_guest_fpu(vcpu); 1382 } 1383 1384 static int is_efer_nx(void) 1385 { 1386 unsigned long long efer = 0; 1387 1388 rdmsrl_safe(MSR_EFER, &efer); 1389 return efer & EFER_NX; 1390 } 1391 1392 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1393 { 1394 int i; 1395 struct kvm_cpuid_entry2 *e, *entry; 1396 1397 entry = NULL; 1398 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1399 e = &vcpu->arch.cpuid_entries[i]; 1400 if (e->function == 0x80000001) { 1401 entry = e; 1402 break; 1403 } 1404 } 1405 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1406 entry->edx &= ~(1 << 20); 1407 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1408 } 1409 } 1410 1411 /* when an old userspace process fills a new kernel module */ 1412 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1413 struct kvm_cpuid *cpuid, 1414 struct kvm_cpuid_entry __user *entries) 1415 { 1416 int r, i; 1417 struct kvm_cpuid_entry *cpuid_entries; 1418 1419 r = -E2BIG; 1420 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1421 goto out; 1422 r = -ENOMEM; 1423 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1424 if (!cpuid_entries) 1425 goto out; 1426 r = -EFAULT; 1427 if (copy_from_user(cpuid_entries, entries, 1428 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1429 goto out_free; 1430 for (i = 0; i < cpuid->nent; i++) { 1431 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1432 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1433 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1434 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1435 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1436 vcpu->arch.cpuid_entries[i].index = 0; 1437 vcpu->arch.cpuid_entries[i].flags = 0; 1438 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1439 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1440 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1441 } 1442 vcpu->arch.cpuid_nent = cpuid->nent; 1443 cpuid_fix_nx_cap(vcpu); 1444 r = 0; 1445 kvm_apic_set_version(vcpu); 1446 1447 out_free: 1448 vfree(cpuid_entries); 1449 out: 1450 return r; 1451 } 1452 1453 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1454 struct kvm_cpuid2 *cpuid, 1455 struct kvm_cpuid_entry2 __user *entries) 1456 { 1457 int r; 1458 1459 r = -E2BIG; 1460 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1461 goto out; 1462 r = -EFAULT; 1463 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1464 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1465 goto out; 1466 vcpu->arch.cpuid_nent = cpuid->nent; 1467 kvm_apic_set_version(vcpu); 1468 return 0; 1469 1470 out: 1471 return r; 1472 } 1473 1474 static int 
kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1475 struct kvm_cpuid2 *cpuid, 1476 struct kvm_cpuid_entry2 __user *entries) 1477 { 1478 int r; 1479 1480 r = -E2BIG; 1481 if (cpuid->nent < vcpu->arch.cpuid_nent) 1482 goto out; 1483 r = -EFAULT; 1484 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1485 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1486 goto out; 1487 return 0; 1488 1489 out: 1490 cpuid->nent = vcpu->arch.cpuid_nent; 1491 return r; 1492 } 1493 1494 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1495 u32 index) 1496 { 1497 entry->function = function; 1498 entry->index = index; 1499 cpuid_count(entry->function, entry->index, 1500 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1501 entry->flags = 0; 1502 } 1503 1504 #define F(x) bit(X86_FEATURE_##x) 1505 1506 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1507 u32 index, int *nent, int maxnent) 1508 { 1509 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1510 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; 1511 #ifdef CONFIG_X86_64 1512 unsigned f_lm = F(LM); 1513 #else 1514 unsigned f_lm = 0; 1515 #endif 1516 1517 /* cpuid 1.edx */ 1518 const u32 kvm_supported_word0_x86_features = 1519 F(FPU) | F(VME) | F(DE) | F(PSE) | 1520 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1521 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 1522 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1523 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 1524 0 /* Reserved, DS, ACPI */ | F(MMX) | 1525 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 1526 0 /* HTT, TM, Reserved, PBE */; 1527 /* cpuid 0x80000001.edx */ 1528 const u32 kvm_supported_word1_x86_features = 1529 F(FPU) | F(VME) | F(DE) | F(PSE) | 1530 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1531 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 1532 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1533 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1534 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1535 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1536 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1537 /* cpuid 1.ecx */ 1538 const u32 kvm_supported_word4_x86_features = 1539 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1540 0 /* DS-CPL, VMX, SMX, EST */ | 1541 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1542 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1543 0 /* Reserved, DCA */ | F(XMM4_1) | 1544 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1545 0 /* Reserved, XSAVE, OSXSAVE */; 1546 /* cpuid 0x80000001.ecx */ 1547 const u32 kvm_supported_word6_x86_features = 1548 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1549 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1550 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 1551 0 /* SKINIT */ | 0 /* WDT */; 1552 1553 /* all calls to cpuid_count() should be made on the same cpu */ 1554 get_cpu(); 1555 do_cpuid_1_ent(entry, function, index); 1556 ++*nent; 1557 1558 switch (function) { 1559 case 0: 1560 entry->eax = min(entry->eax, (u32)0xb); 1561 break; 1562 case 1: 1563 entry->edx &= kvm_supported_word0_x86_features; 1564 entry->ecx &= kvm_supported_word4_x86_features; 1565 /* we support x2apic emulation even if host does not support 1566 * it since we emulate x2apic in software */ 1567 entry->ecx |= F(X2APIC); 1568 break; 1569 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1570 * may return different values. 
This forces us to get_cpu() before 1571 * issuing the first command, and also to emulate this annoying behavior 1572 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1573 case 2: { 1574 int t, times = entry->eax & 0xff; 1575 1576 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1577 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1578 for (t = 1; t < times && *nent < maxnent; ++t) { 1579 do_cpuid_1_ent(&entry[t], function, 0); 1580 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1581 ++*nent; 1582 } 1583 break; 1584 } 1585 /* function 4 and 0xb have additional index. */ 1586 case 4: { 1587 int i, cache_type; 1588 1589 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1590 /* read more entries until cache_type is zero */ 1591 for (i = 1; *nent < maxnent; ++i) { 1592 cache_type = entry[i - 1].eax & 0x1f; 1593 if (!cache_type) 1594 break; 1595 do_cpuid_1_ent(&entry[i], function, i); 1596 entry[i].flags |= 1597 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1598 ++*nent; 1599 } 1600 break; 1601 } 1602 case 0xb: { 1603 int i, level_type; 1604 1605 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1606 /* read more entries until level_type is zero */ 1607 for (i = 1; *nent < maxnent; ++i) { 1608 level_type = entry[i - 1].ecx & 0xff00; 1609 if (!level_type) 1610 break; 1611 do_cpuid_1_ent(&entry[i], function, i); 1612 entry[i].flags |= 1613 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1614 ++*nent; 1615 } 1616 break; 1617 } 1618 case 0x80000000: 1619 entry->eax = min(entry->eax, 0x8000001a); 1620 break; 1621 case 0x80000001: 1622 entry->edx &= kvm_supported_word1_x86_features; 1623 entry->ecx &= kvm_supported_word6_x86_features; 1624 break; 1625 } 1626 put_cpu(); 1627 } 1628 1629 #undef F 1630 1631 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1632 struct kvm_cpuid_entry2 __user *entries) 1633 { 1634 struct kvm_cpuid_entry2 *cpuid_entries; 1635 int limit, nent = 0, r = -E2BIG; 1636 u32 func; 1637 1638 if (cpuid->nent < 1) 1639 goto out; 1640 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1641 cpuid->nent = KVM_MAX_CPUID_ENTRIES; 1642 r = -ENOMEM; 1643 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1644 if (!cpuid_entries) 1645 goto out; 1646 1647 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1648 limit = cpuid_entries[0].eax; 1649 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1650 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1651 &nent, cpuid->nent); 1652 r = -E2BIG; 1653 if (nent >= cpuid->nent) 1654 goto out_free; 1655 1656 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1657 limit = cpuid_entries[nent - 1].eax; 1658 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1659 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1660 &nent, cpuid->nent); 1661 r = -E2BIG; 1662 if (nent >= cpuid->nent) 1663 goto out_free; 1664 1665 r = -EFAULT; 1666 if (copy_to_user(entries, cpuid_entries, 1667 nent * sizeof(struct kvm_cpuid_entry2))) 1668 goto out_free; 1669 cpuid->nent = nent; 1670 r = 0; 1671 1672 out_free: 1673 vfree(cpuid_entries); 1674 out: 1675 return r; 1676 } 1677 1678 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1679 struct kvm_lapic_state *s) 1680 { 1681 vcpu_load(vcpu); 1682 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1683 vcpu_put(vcpu); 1684 1685 return 0; 1686 } 1687 1688 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1689 struct kvm_lapic_state *s) 1690 { 1691 vcpu_load(vcpu); 1692 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1693 
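	/* Let the lapic code recompute state derived from the raw register
	 * image we just copied in (e.g. timer and priority state), then
	 * refresh the CR8/TPR intercept for the restored TPR value. */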
kvm_apic_post_state_restore(vcpu); 1694 update_cr8_intercept(vcpu); 1695 vcpu_put(vcpu); 1696 1697 return 0; 1698 } 1699 1700 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1701 struct kvm_interrupt *irq) 1702 { 1703 if (irq->irq < 0 || irq->irq >= 256) 1704 return -EINVAL; 1705 if (irqchip_in_kernel(vcpu->kvm)) 1706 return -ENXIO; 1707 vcpu_load(vcpu); 1708 1709 kvm_queue_interrupt(vcpu, irq->irq, false); 1710 1711 vcpu_put(vcpu); 1712 1713 return 0; 1714 } 1715 1716 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1717 { 1718 vcpu_load(vcpu); 1719 kvm_inject_nmi(vcpu); 1720 vcpu_put(vcpu); 1721 1722 return 0; 1723 } 1724 1725 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1726 struct kvm_tpr_access_ctl *tac) 1727 { 1728 if (tac->flags) 1729 return -EINVAL; 1730 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1731 return 0; 1732 } 1733 1734 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 1735 u64 mcg_cap) 1736 { 1737 int r; 1738 unsigned bank_num = mcg_cap & 0xff, bank; 1739 1740 r = -EINVAL; 1741 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 1742 goto out; 1743 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 1744 goto out; 1745 r = 0; 1746 vcpu->arch.mcg_cap = mcg_cap; 1747 /* Init IA32_MCG_CTL to all 1s */ 1748 if (mcg_cap & MCG_CTL_P) 1749 vcpu->arch.mcg_ctl = ~(u64)0; 1750 /* Init IA32_MCi_CTL to all 1s */ 1751 for (bank = 0; bank < bank_num; bank++) 1752 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 1753 out: 1754 return r; 1755 } 1756 1757 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 1758 struct kvm_x86_mce *mce) 1759 { 1760 u64 mcg_cap = vcpu->arch.mcg_cap; 1761 unsigned bank_num = mcg_cap & 0xff; 1762 u64 *banks = vcpu->arch.mce_banks; 1763 1764 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 1765 return -EINVAL; 1766 /* 1767 * if IA32_MCG_CTL is not all 1s, the uncorrected error 1768 * reporting is disabled 1769 */ 1770 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 1771 vcpu->arch.mcg_ctl != ~(u64)0) 1772 return 0; 1773 banks += 4 * mce->bank; 1774 /* 1775 * if IA32_MCi_CTL is not all 1s, the uncorrected error 1776 * reporting is disabled for the bank 1777 */ 1778 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 1779 return 0; 1780 if (mce->status & MCI_STATUS_UC) { 1781 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 1782 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 1783 printk(KERN_DEBUG "kvm: set_mce: " 1784 "injects mce exception while " 1785 "previous one is in progress!\n"); 1786 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 1787 return 0; 1788 } 1789 if (banks[1] & MCI_STATUS_VAL) 1790 mce->status |= MCI_STATUS_OVER; 1791 banks[2] = mce->addr; 1792 banks[3] = mce->misc; 1793 vcpu->arch.mcg_status = mce->mcg_status; 1794 banks[1] = mce->status; 1795 kvm_queue_exception(vcpu, MC_VECTOR); 1796 } else if (!(banks[1] & MCI_STATUS_VAL) 1797 || !(banks[1] & MCI_STATUS_UC)) { 1798 if (banks[1] & MCI_STATUS_VAL) 1799 mce->status |= MCI_STATUS_OVER; 1800 banks[2] = mce->addr; 1801 banks[3] = mce->misc; 1802 banks[1] = mce->status; 1803 } else 1804 banks[1] |= MCI_STATUS_OVER; 1805 return 0; 1806 } 1807 1808 long kvm_arch_vcpu_ioctl(struct file *filp, 1809 unsigned int ioctl, unsigned long arg) 1810 { 1811 struct kvm_vcpu *vcpu = filp->private_data; 1812 void __user *argp = (void __user *)arg; 1813 int r; 1814 struct kvm_lapic_state *lapic = NULL; 1815 1816 switch (ioctl) { 1817 case KVM_GET_LAPIC: { 1818 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1819 1820 r = 
-ENOMEM; 1821 if (!lapic) 1822 goto out; 1823 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1824 if (r) 1825 goto out; 1826 r = -EFAULT; 1827 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1828 goto out; 1829 r = 0; 1830 break; 1831 } 1832 case KVM_SET_LAPIC: { 1833 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1834 r = -ENOMEM; 1835 if (!lapic) 1836 goto out; 1837 r = -EFAULT; 1838 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1839 goto out; 1840 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1841 if (r) 1842 goto out; 1843 r = 0; 1844 break; 1845 } 1846 case KVM_INTERRUPT: { 1847 struct kvm_interrupt irq; 1848 1849 r = -EFAULT; 1850 if (copy_from_user(&irq, argp, sizeof irq)) 1851 goto out; 1852 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1853 if (r) 1854 goto out; 1855 r = 0; 1856 break; 1857 } 1858 case KVM_NMI: { 1859 r = kvm_vcpu_ioctl_nmi(vcpu); 1860 if (r) 1861 goto out; 1862 r = 0; 1863 break; 1864 } 1865 case KVM_SET_CPUID: { 1866 struct kvm_cpuid __user *cpuid_arg = argp; 1867 struct kvm_cpuid cpuid; 1868 1869 r = -EFAULT; 1870 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1871 goto out; 1872 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1873 if (r) 1874 goto out; 1875 break; 1876 } 1877 case KVM_SET_CPUID2: { 1878 struct kvm_cpuid2 __user *cpuid_arg = argp; 1879 struct kvm_cpuid2 cpuid; 1880 1881 r = -EFAULT; 1882 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1883 goto out; 1884 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1885 cpuid_arg->entries); 1886 if (r) 1887 goto out; 1888 break; 1889 } 1890 case KVM_GET_CPUID2: { 1891 struct kvm_cpuid2 __user *cpuid_arg = argp; 1892 struct kvm_cpuid2 cpuid; 1893 1894 r = -EFAULT; 1895 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1896 goto out; 1897 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1898 cpuid_arg->entries); 1899 if (r) 1900 goto out; 1901 r = -EFAULT; 1902 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1903 goto out; 1904 r = 0; 1905 break; 1906 } 1907 case KVM_GET_MSRS: 1908 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1909 break; 1910 case KVM_SET_MSRS: 1911 r = msr_io(vcpu, argp, do_set_msr, 0); 1912 break; 1913 case KVM_TPR_ACCESS_REPORTING: { 1914 struct kvm_tpr_access_ctl tac; 1915 1916 r = -EFAULT; 1917 if (copy_from_user(&tac, argp, sizeof tac)) 1918 goto out; 1919 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1920 if (r) 1921 goto out; 1922 r = -EFAULT; 1923 if (copy_to_user(argp, &tac, sizeof tac)) 1924 goto out; 1925 r = 0; 1926 break; 1927 }; 1928 case KVM_SET_VAPIC_ADDR: { 1929 struct kvm_vapic_addr va; 1930 1931 r = -EINVAL; 1932 if (!irqchip_in_kernel(vcpu->kvm)) 1933 goto out; 1934 r = -EFAULT; 1935 if (copy_from_user(&va, argp, sizeof va)) 1936 goto out; 1937 r = 0; 1938 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1939 break; 1940 } 1941 case KVM_X86_SETUP_MCE: { 1942 u64 mcg_cap; 1943 1944 r = -EFAULT; 1945 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 1946 goto out; 1947 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 1948 break; 1949 } 1950 case KVM_X86_SET_MCE: { 1951 struct kvm_x86_mce mce; 1952 1953 r = -EFAULT; 1954 if (copy_from_user(&mce, argp, sizeof mce)) 1955 goto out; 1956 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 1957 break; 1958 } 1959 default: 1960 r = -EINVAL; 1961 } 1962 out: 1963 kfree(lapic); 1964 return r; 1965 } 1966 1967 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1968 { 1969 int ret; 1970 1971 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1972 return -1; 1973 ret = 
kvm_x86_ops->set_tss_addr(kvm, addr); 1974 return ret; 1975 } 1976 1977 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 1978 u64 ident_addr) 1979 { 1980 kvm->arch.ept_identity_map_addr = ident_addr; 1981 return 0; 1982 } 1983 1984 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1985 u32 kvm_nr_mmu_pages) 1986 { 1987 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1988 return -EINVAL; 1989 1990 down_write(&kvm->slots_lock); 1991 spin_lock(&kvm->mmu_lock); 1992 1993 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1994 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1995 1996 spin_unlock(&kvm->mmu_lock); 1997 up_write(&kvm->slots_lock); 1998 return 0; 1999 } 2000 2001 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 2002 { 2003 return kvm->arch.n_alloc_mmu_pages; 2004 } 2005 2006 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2007 { 2008 int i; 2009 struct kvm_mem_alias *alias; 2010 2011 for (i = 0; i < kvm->arch.naliases; ++i) { 2012 alias = &kvm->arch.aliases[i]; 2013 if (gfn >= alias->base_gfn 2014 && gfn < alias->base_gfn + alias->npages) 2015 return alias->target_gfn + gfn - alias->base_gfn; 2016 } 2017 return gfn; 2018 } 2019 2020 /* 2021 * Set a new alias region. Aliases map a portion of physical memory into 2022 * another portion. This is useful for memory windows, for example the PC 2023 * VGA region. 2024 */ 2025 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 2026 struct kvm_memory_alias *alias) 2027 { 2028 int r, n; 2029 struct kvm_mem_alias *p; 2030 2031 r = -EINVAL; 2032 /* General sanity checks */ 2033 if (alias->memory_size & (PAGE_SIZE - 1)) 2034 goto out; 2035 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 2036 goto out; 2037 if (alias->slot >= KVM_ALIAS_SLOTS) 2038 goto out; 2039 if (alias->guest_phys_addr + alias->memory_size 2040 < alias->guest_phys_addr) 2041 goto out; 2042 if (alias->target_phys_addr + alias->memory_size 2043 < alias->target_phys_addr) 2044 goto out; 2045 2046 down_write(&kvm->slots_lock); 2047 spin_lock(&kvm->mmu_lock); 2048 2049 p = &kvm->arch.aliases[alias->slot]; 2050 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2051 p->npages = alias->memory_size >> PAGE_SHIFT; 2052 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2053 2054 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2055 if (kvm->arch.aliases[n - 1].npages) 2056 break; 2057 kvm->arch.naliases = n; 2058 2059 spin_unlock(&kvm->mmu_lock); 2060 kvm_mmu_zap_all(kvm); 2061 2062 up_write(&kvm->slots_lock); 2063 2064 return 0; 2065 2066 out: 2067 return r; 2068 } 2069 2070 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2071 { 2072 int r; 2073 2074 r = 0; 2075 switch (chip->chip_id) { 2076 case KVM_IRQCHIP_PIC_MASTER: 2077 memcpy(&chip->chip.pic, 2078 &pic_irqchip(kvm)->pics[0], 2079 sizeof(struct kvm_pic_state)); 2080 break; 2081 case KVM_IRQCHIP_PIC_SLAVE: 2082 memcpy(&chip->chip.pic, 2083 &pic_irqchip(kvm)->pics[1], 2084 sizeof(struct kvm_pic_state)); 2085 break; 2086 case KVM_IRQCHIP_IOAPIC: 2087 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 2088 break; 2089 default: 2090 r = -EINVAL; 2091 break; 2092 } 2093 return r; 2094 } 2095 2096 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2097 { 2098 int r; 2099 2100 r = 0; 2101 switch (chip->chip_id) { 2102 case KVM_IRQCHIP_PIC_MASTER: 2103 spin_lock(&pic_irqchip(kvm)->lock); 2104 memcpy(&pic_irqchip(kvm)->pics[0], 2105 &chip->chip.pic, 2106 sizeof(struct kvm_pic_state)); 2107 spin_unlock(&pic_irqchip(kvm)->lock); 2108 break; 2109 case 
KVM_IRQCHIP_PIC_SLAVE: 2110 spin_lock(&pic_irqchip(kvm)->lock); 2111 memcpy(&pic_irqchip(kvm)->pics[1], 2112 &chip->chip.pic, 2113 sizeof(struct kvm_pic_state)); 2114 spin_unlock(&pic_irqchip(kvm)->lock); 2115 break; 2116 case KVM_IRQCHIP_IOAPIC: 2117 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2118 break; 2119 default: 2120 r = -EINVAL; 2121 break; 2122 } 2123 kvm_pic_update_irq(pic_irqchip(kvm)); 2124 return r; 2125 } 2126 2127 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2128 { 2129 int r = 0; 2130 2131 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2132 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 2133 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2134 return r; 2135 } 2136 2137 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2138 { 2139 int r = 0; 2140 2141 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2142 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2143 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 2144 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2145 return r; 2146 } 2147 2148 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2149 { 2150 int r = 0; 2151 2152 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2153 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 2154 sizeof(ps->channels)); 2155 ps->flags = kvm->arch.vpit->pit_state.flags; 2156 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2157 return r; 2158 } 2159 2160 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2161 { 2162 int r = 0, start = 0; 2163 u32 prev_legacy, cur_legacy; 2164 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2165 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 2166 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 2167 if (!prev_legacy && cur_legacy) 2168 start = 1; 2169 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 2170 sizeof(kvm->arch.vpit->pit_state.channels)); 2171 kvm->arch.vpit->pit_state.flags = ps->flags; 2172 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 2173 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2174 return r; 2175 } 2176 2177 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 2178 struct kvm_reinject_control *control) 2179 { 2180 if (!kvm->arch.vpit) 2181 return -ENXIO; 2182 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2183 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2184 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2185 return 0; 2186 } 2187 2188 /* 2189 * Get (and clear) the dirty memory log for a memory slot. 2190 */ 2191 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2192 struct kvm_dirty_log *log) 2193 { 2194 int r; 2195 int n; 2196 struct kvm_memory_slot *memslot; 2197 int is_dirty = 0; 2198 2199 down_write(&kvm->slots_lock); 2200 2201 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2202 if (r) 2203 goto out; 2204 2205 /* If nothing is dirty, don't bother messing with page tables. 
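         * When something is dirty, write-protect the whole memory slot so that
         * future guest writes fault again and repopulate the dirty bitmap, then
         * clear the bitmap just reported to userspace by kvm_get_dirty_log().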
*/ 2206 if (is_dirty) { 2207 spin_lock(&kvm->mmu_lock); 2208 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2209 spin_unlock(&kvm->mmu_lock); 2210 memslot = &kvm->memslots[log->slot]; 2211 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2212 memset(memslot->dirty_bitmap, 0, n); 2213 } 2214 r = 0; 2215 out: 2216 up_write(&kvm->slots_lock); 2217 return r; 2218 } 2219 2220 long kvm_arch_vm_ioctl(struct file *filp, 2221 unsigned int ioctl, unsigned long arg) 2222 { 2223 struct kvm *kvm = filp->private_data; 2224 void __user *argp = (void __user *)arg; 2225 int r = -ENOTTY; 2226 /* 2227 * This union makes it completely explicit to gcc-3.x 2228 * that these two variables' stack usage should be 2229 * combined, not added together. 2230 */ 2231 union { 2232 struct kvm_pit_state ps; 2233 struct kvm_pit_state2 ps2; 2234 struct kvm_memory_alias alias; 2235 struct kvm_pit_config pit_config; 2236 } u; 2237 2238 switch (ioctl) { 2239 case KVM_SET_TSS_ADDR: 2240 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 2241 if (r < 0) 2242 goto out; 2243 break; 2244 case KVM_SET_IDENTITY_MAP_ADDR: { 2245 u64 ident_addr; 2246 2247 r = -EFAULT; 2248 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 2249 goto out; 2250 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 2251 if (r < 0) 2252 goto out; 2253 break; 2254 } 2255 case KVM_SET_MEMORY_REGION: { 2256 struct kvm_memory_region kvm_mem; 2257 struct kvm_userspace_memory_region kvm_userspace_mem; 2258 2259 r = -EFAULT; 2260 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2261 goto out; 2262 kvm_userspace_mem.slot = kvm_mem.slot; 2263 kvm_userspace_mem.flags = kvm_mem.flags; 2264 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 2265 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 2266 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 2267 if (r) 2268 goto out; 2269 break; 2270 } 2271 case KVM_SET_NR_MMU_PAGES: 2272 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2273 if (r) 2274 goto out; 2275 break; 2276 case KVM_GET_NR_MMU_PAGES: 2277 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2278 break; 2279 case KVM_SET_MEMORY_ALIAS: 2280 r = -EFAULT; 2281 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 2282 goto out; 2283 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 2284 if (r) 2285 goto out; 2286 break; 2287 case KVM_CREATE_IRQCHIP: 2288 r = -ENOMEM; 2289 kvm->arch.vpic = kvm_create_pic(kvm); 2290 if (kvm->arch.vpic) { 2291 r = kvm_ioapic_init(kvm); 2292 if (r) { 2293 kfree(kvm->arch.vpic); 2294 kvm->arch.vpic = NULL; 2295 goto out; 2296 } 2297 } else 2298 goto out; 2299 r = kvm_setup_default_irq_routing(kvm); 2300 if (r) { 2301 kfree(kvm->arch.vpic); 2302 kfree(kvm->arch.vioapic); 2303 goto out; 2304 } 2305 break; 2306 case KVM_CREATE_PIT: 2307 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2308 goto create_pit; 2309 case KVM_CREATE_PIT2: 2310 r = -EFAULT; 2311 if (copy_from_user(&u.pit_config, argp, 2312 sizeof(struct kvm_pit_config))) 2313 goto out; 2314 create_pit: 2315 down_write(&kvm->slots_lock); 2316 r = -EEXIST; 2317 if (kvm->arch.vpit) 2318 goto create_pit_unlock; 2319 r = -ENOMEM; 2320 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 2321 if (kvm->arch.vpit) 2322 r = 0; 2323 create_pit_unlock: 2324 up_write(&kvm->slots_lock); 2325 break; 2326 case KVM_IRQ_LINE_STATUS: 2327 case KVM_IRQ_LINE: { 2328 struct kvm_irq_level irq_event; 2329 2330 r = -EFAULT; 2331 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2332 goto out; 2333 if (irqchip_in_kernel(kvm)) { 2334 __s32 status; 2335 status = 
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2336 irq_event.irq, irq_event.level); 2337 if (ioctl == KVM_IRQ_LINE_STATUS) { 2338 irq_event.status = status; 2339 if (copy_to_user(argp, &irq_event, 2340 sizeof irq_event)) 2341 goto out; 2342 } 2343 r = 0; 2344 } 2345 break; 2346 } 2347 case KVM_GET_IRQCHIP: { 2348 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2349 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2350 2351 r = -ENOMEM; 2352 if (!chip) 2353 goto out; 2354 r = -EFAULT; 2355 if (copy_from_user(chip, argp, sizeof *chip)) 2356 goto get_irqchip_out; 2357 r = -ENXIO; 2358 if (!irqchip_in_kernel(kvm)) 2359 goto get_irqchip_out; 2360 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 2361 if (r) 2362 goto get_irqchip_out; 2363 r = -EFAULT; 2364 if (copy_to_user(argp, chip, sizeof *chip)) 2365 goto get_irqchip_out; 2366 r = 0; 2367 get_irqchip_out: 2368 kfree(chip); 2369 if (r) 2370 goto out; 2371 break; 2372 } 2373 case KVM_SET_IRQCHIP: { 2374 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2375 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2376 2377 r = -ENOMEM; 2378 if (!chip) 2379 goto out; 2380 r = -EFAULT; 2381 if (copy_from_user(chip, argp, sizeof *chip)) 2382 goto set_irqchip_out; 2383 r = -ENXIO; 2384 if (!irqchip_in_kernel(kvm)) 2385 goto set_irqchip_out; 2386 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 2387 if (r) 2388 goto set_irqchip_out; 2389 r = 0; 2390 set_irqchip_out: 2391 kfree(chip); 2392 if (r) 2393 goto out; 2394 break; 2395 } 2396 case KVM_GET_PIT: { 2397 r = -EFAULT; 2398 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 2399 goto out; 2400 r = -ENXIO; 2401 if (!kvm->arch.vpit) 2402 goto out; 2403 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 2404 if (r) 2405 goto out; 2406 r = -EFAULT; 2407 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 2408 goto out; 2409 r = 0; 2410 break; 2411 } 2412 case KVM_SET_PIT: { 2413 r = -EFAULT; 2414 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 2415 goto out; 2416 r = -ENXIO; 2417 if (!kvm->arch.vpit) 2418 goto out; 2419 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2420 if (r) 2421 goto out; 2422 r = 0; 2423 break; 2424 } 2425 case KVM_GET_PIT2: { 2426 r = -ENXIO; 2427 if (!kvm->arch.vpit) 2428 goto out; 2429 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 2430 if (r) 2431 goto out; 2432 r = -EFAULT; 2433 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 2434 goto out; 2435 r = 0; 2436 break; 2437 } 2438 case KVM_SET_PIT2: { 2439 r = -EFAULT; 2440 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 2441 goto out; 2442 r = -ENXIO; 2443 if (!kvm->arch.vpit) 2444 goto out; 2445 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 2446 if (r) 2447 goto out; 2448 r = 0; 2449 break; 2450 } 2451 case KVM_REINJECT_CONTROL: { 2452 struct kvm_reinject_control control; 2453 r = -EFAULT; 2454 if (copy_from_user(&control, argp, sizeof(control))) 2455 goto out; 2456 r = kvm_vm_ioctl_reinject(kvm, &control); 2457 if (r) 2458 goto out; 2459 r = 0; 2460 break; 2461 } 2462 case KVM_XEN_HVM_CONFIG: { 2463 r = -EFAULT; 2464 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 2465 sizeof(struct kvm_xen_hvm_config))) 2466 goto out; 2467 r = -EINVAL; 2468 if (kvm->arch.xen_hvm_config.flags) 2469 goto out; 2470 r = 0; 2471 break; 2472 } 2473 case KVM_SET_CLOCK: { 2474 struct timespec now; 2475 struct kvm_clock_data user_ns; 2476 u64 now_ns; 2477 s64 delta; 2478 2479 r = -EFAULT; 2480 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 2481 goto out; 2482 2483 r = -EINVAL; 2484 if (user_ns.flags) 2485 goto out; 2486 2487 r = 0; 2488 ktime_get_ts(&now); 
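                /*
                 * Only an offset is stored here:
                 *   kvmclock_offset = requested guest clock - host monotonic time
                 * so that KVM_GET_CLOCK below can reconstruct the guest view as
                 *   guest clock = kvmclock_offset + host monotonic time.
                 * For example, if the host monotonic clock currently reads
                 * 1,000,000 ns and userspace asks for a guest clock of
                 * 5,000,000 ns, the delta stored below is 4,000,000 ns.
                 */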
2489 now_ns = timespec_to_ns(&now); 2490 delta = user_ns.clock - now_ns; 2491 kvm->arch.kvmclock_offset = delta; 2492 break; 2493 } 2494 case KVM_GET_CLOCK: { 2495 struct timespec now; 2496 struct kvm_clock_data user_ns; 2497 u64 now_ns; 2498 2499 ktime_get_ts(&now); 2500 now_ns = timespec_to_ns(&now); 2501 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 2502 user_ns.flags = 0; 2503 2504 r = -EFAULT; 2505 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 2506 goto out; 2507 r = 0; 2508 break; 2509 } 2510 2511 default: 2512 ; 2513 } 2514 out: 2515 return r; 2516 } 2517 2518 static void kvm_init_msr_list(void) 2519 { 2520 u32 dummy[2]; 2521 unsigned i, j; 2522 2523 /* skip the first msrs in the list. KVM-specific */ 2524 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 2525 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2526 continue; 2527 if (j < i) 2528 msrs_to_save[j] = msrs_to_save[i]; 2529 j++; 2530 } 2531 num_msrs_to_save = j; 2532 } 2533 2534 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 2535 const void *v) 2536 { 2537 if (vcpu->arch.apic && 2538 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 2539 return 0; 2540 2541 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 2542 } 2543 2544 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 2545 { 2546 if (vcpu->arch.apic && 2547 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 2548 return 0; 2549 2550 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 2551 } 2552 2553 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2554 struct kvm_vcpu *vcpu) 2555 { 2556 void *data = val; 2557 int r = X86EMUL_CONTINUE; 2558 2559 while (bytes) { 2560 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2561 unsigned offset = addr & (PAGE_SIZE-1); 2562 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2563 int ret; 2564 2565 if (gpa == UNMAPPED_GVA) { 2566 r = X86EMUL_PROPAGATE_FAULT; 2567 goto out; 2568 } 2569 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2570 if (ret < 0) { 2571 r = X86EMUL_UNHANDLEABLE; 2572 goto out; 2573 } 2574 2575 bytes -= toread; 2576 data += toread; 2577 addr += toread; 2578 } 2579 out: 2580 return r; 2581 } 2582 2583 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2584 struct kvm_vcpu *vcpu) 2585 { 2586 void *data = val; 2587 int r = X86EMUL_CONTINUE; 2588 2589 while (bytes) { 2590 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2591 unsigned offset = addr & (PAGE_SIZE-1); 2592 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2593 int ret; 2594 2595 if (gpa == UNMAPPED_GVA) { 2596 r = X86EMUL_PROPAGATE_FAULT; 2597 goto out; 2598 } 2599 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2600 if (ret < 0) { 2601 r = X86EMUL_UNHANDLEABLE; 2602 goto out; 2603 } 2604 2605 bytes -= towrite; 2606 data += towrite; 2607 addr += towrite; 2608 } 2609 out: 2610 return r; 2611 } 2612 2613 2614 static int emulator_read_emulated(unsigned long addr, 2615 void *val, 2616 unsigned int bytes, 2617 struct kvm_vcpu *vcpu) 2618 { 2619 gpa_t gpa; 2620 2621 if (vcpu->mmio_read_completed) { 2622 memcpy(val, vcpu->mmio_data, bytes); 2623 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 2624 vcpu->mmio_phys_addr, *(u64 *)val); 2625 vcpu->mmio_read_completed = 0; 2626 return X86EMUL_CONTINUE; 2627 } 2628 2629 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2630 2631 /* For APIC access vmexit */ 2632 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2633 goto 
mmio; 2634 2635 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2636 == X86EMUL_CONTINUE) 2637 return X86EMUL_CONTINUE; 2638 if (gpa == UNMAPPED_GVA) 2639 return X86EMUL_PROPAGATE_FAULT; 2640 2641 mmio: 2642 /* 2643 * Is this MMIO handled locally? 2644 */ 2645 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 2646 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 2647 return X86EMUL_CONTINUE; 2648 } 2649 2650 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 2651 2652 vcpu->mmio_needed = 1; 2653 vcpu->mmio_phys_addr = gpa; 2654 vcpu->mmio_size = bytes; 2655 vcpu->mmio_is_write = 0; 2656 2657 return X86EMUL_UNHANDLEABLE; 2658 } 2659 2660 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2661 const void *val, int bytes) 2662 { 2663 int ret; 2664 2665 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2666 if (ret < 0) 2667 return 0; 2668 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2669 return 1; 2670 } 2671 2672 static int emulator_write_emulated_onepage(unsigned long addr, 2673 const void *val, 2674 unsigned int bytes, 2675 struct kvm_vcpu *vcpu) 2676 { 2677 gpa_t gpa; 2678 2679 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2680 2681 if (gpa == UNMAPPED_GVA) { 2682 kvm_inject_page_fault(vcpu, addr, 2); 2683 return X86EMUL_PROPAGATE_FAULT; 2684 } 2685 2686 /* For APIC access vmexit */ 2687 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2688 goto mmio; 2689 2690 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2691 return X86EMUL_CONTINUE; 2692 2693 mmio: 2694 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 2695 /* 2696 * Is this MMIO handled locally? 2697 */ 2698 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 2699 return X86EMUL_CONTINUE; 2700 2701 vcpu->mmio_needed = 1; 2702 vcpu->mmio_phys_addr = gpa; 2703 vcpu->mmio_size = bytes; 2704 vcpu->mmio_is_write = 1; 2705 memcpy(vcpu->mmio_data, val, bytes); 2706 2707 return X86EMUL_CONTINUE; 2708 } 2709 2710 int emulator_write_emulated(unsigned long addr, 2711 const void *val, 2712 unsigned int bytes, 2713 struct kvm_vcpu *vcpu) 2714 { 2715 /* Crossing a page boundary? 
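         * If so, only (-addr & ~PAGE_MASK) bytes still fit in the current page,
         * so the write is split. For example, with 4 KiB pages a 4-byte write
         * at addr == 0x1ffe has (-addr & ~PAGE_MASK) == 2: two bytes are written
         * to the first page here, and the remaining two by the final call below.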
*/ 2716 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2717 int rc, now; 2718 2719 now = -addr & ~PAGE_MASK; 2720 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2721 if (rc != X86EMUL_CONTINUE) 2722 return rc; 2723 addr += now; 2724 val += now; 2725 bytes -= now; 2726 } 2727 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2728 } 2729 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2730 2731 static int emulator_cmpxchg_emulated(unsigned long addr, 2732 const void *old, 2733 const void *new, 2734 unsigned int bytes, 2735 struct kvm_vcpu *vcpu) 2736 { 2737 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 2738 #ifndef CONFIG_X86_64 2739 /* guests cmpxchg8b have to be emulated atomically */ 2740 if (bytes == 8) { 2741 gpa_t gpa; 2742 struct page *page; 2743 char *kaddr; 2744 u64 val; 2745 2746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2747 2748 if (gpa == UNMAPPED_GVA || 2749 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2750 goto emul_write; 2751 2752 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2753 goto emul_write; 2754 2755 val = *(u64 *)new; 2756 2757 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2758 2759 kaddr = kmap_atomic(page, KM_USER0); 2760 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2761 kunmap_atomic(kaddr, KM_USER0); 2762 kvm_release_page_dirty(page); 2763 } 2764 emul_write: 2765 #endif 2766 2767 return emulator_write_emulated(addr, new, bytes, vcpu); 2768 } 2769 2770 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2771 { 2772 return kvm_x86_ops->get_segment_base(vcpu, seg); 2773 } 2774 2775 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2776 { 2777 kvm_mmu_invlpg(vcpu, address); 2778 return X86EMUL_CONTINUE; 2779 } 2780 2781 int emulate_clts(struct kvm_vcpu *vcpu) 2782 { 2783 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2784 return X86EMUL_CONTINUE; 2785 } 2786 2787 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2788 { 2789 struct kvm_vcpu *vcpu = ctxt->vcpu; 2790 2791 switch (dr) { 2792 case 0 ... 3: 2793 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2794 return X86EMUL_CONTINUE; 2795 default: 2796 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2797 return X86EMUL_UNHANDLEABLE; 2798 } 2799 } 2800 2801 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2802 { 2803 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U;
        int exception;

        kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
        if (exception) {
                /* FIXME: better handling */
                return X86EMUL_UNHANDLEABLE;
        }
        return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
        u8 opcodes[4];
        unsigned long rip = kvm_rip_read(vcpu);
        unsigned long rip_linear;

        if (!printk_ratelimit())
                return;

        rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

        kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);

        printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
               context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

static struct x86_emulate_ops emulate_ops = {
        .read_std = kvm_read_guest_virt,
        .read_emulated = emulator_read_emulated,
        .write_emulated = emulator_write_emulated,
        .cmpxchg_emulated = emulator_cmpxchg_emulated,
};

static void cache_all_regs(struct kvm_vcpu *vcpu)
{
        kvm_register_read(vcpu, VCPU_REGS_RAX);
        kvm_register_read(vcpu, VCPU_REGS_RSP);
        kvm_register_read(vcpu, VCPU_REGS_RIP);
        vcpu->arch.regs_dirty = ~0;
}

int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2,
                        u16 error_code,
                        int emulation_type)
{
        int r, shadow_mask;
        struct decode_cache *c;
        struct kvm_run *run = vcpu->run;

        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
        /*
         * TODO: fix emulate.c to use guest_read/write_register
         * instead of direct ->regs accesses; this can save a hundred
         * cycles on Intel for instructions that don't read/change RSP,
         * for example.
         */
        cache_all_regs(vcpu);

        vcpu->mmio_is_write = 0;
        vcpu->arch.pio.string = 0;

        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                int cs_db, cs_l;
                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

                vcpu->arch.emulate_ctxt.vcpu = vcpu;
                vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
                vcpu->arch.emulate_ctxt.mode =
                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
                        ? X86EMUL_MODE_REAL : cs_l
                        ? X86EMUL_MODE_PROT64 : cs_db
                        ?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2880 2881 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2882 2883 /* Only allow emulation of specific instructions on #UD 2884 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 2885 c = &vcpu->arch.emulate_ctxt.decode; 2886 if (emulation_type & EMULTYPE_TRAP_UD) { 2887 if (!c->twobyte) 2888 return EMULATE_FAIL; 2889 switch (c->b) { 2890 case 0x01: /* VMMCALL */ 2891 if (c->modrm_mod != 3 || c->modrm_rm != 1) 2892 return EMULATE_FAIL; 2893 break; 2894 case 0x34: /* sysenter */ 2895 case 0x35: /* sysexit */ 2896 if (c->modrm_mod != 0 || c->modrm_rm != 0) 2897 return EMULATE_FAIL; 2898 break; 2899 case 0x05: /* syscall */ 2900 if (c->modrm_mod != 0 || c->modrm_rm != 0) 2901 return EMULATE_FAIL; 2902 break; 2903 default: 2904 return EMULATE_FAIL; 2905 } 2906 2907 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 2908 return EMULATE_FAIL; 2909 } 2910 2911 ++vcpu->stat.insn_emulation; 2912 if (r) { 2913 ++vcpu->stat.insn_emulation_fail; 2914 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2915 return EMULATE_DONE; 2916 return EMULATE_FAIL; 2917 } 2918 } 2919 2920 if (emulation_type & EMULTYPE_SKIP) { 2921 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 2922 return EMULATE_DONE; 2923 } 2924 2925 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2926 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 2927 2928 if (r == 0) 2929 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 2930 2931 if (vcpu->arch.pio.string) 2932 return EMULATE_DO_MMIO; 2933 2934 if ((r || vcpu->mmio_is_write) && run) { 2935 run->exit_reason = KVM_EXIT_MMIO; 2936 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2937 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2938 run->mmio.len = vcpu->mmio_size; 2939 run->mmio.is_write = vcpu->mmio_is_write; 2940 } 2941 2942 if (r) { 2943 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2944 return EMULATE_DONE; 2945 if (!vcpu->mmio_needed) { 2946 kvm_report_emulation_failure(vcpu, "mmio"); 2947 return EMULATE_FAIL; 2948 } 2949 return EMULATE_DO_MMIO; 2950 } 2951 2952 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2953 2954 if (vcpu->mmio_is_write) { 2955 vcpu->mmio_needed = 0; 2956 return EMULATE_DO_MMIO; 2957 } 2958 2959 return EMULATE_DONE; 2960 } 2961 EXPORT_SYMBOL_GPL(emulate_instruction); 2962 2963 static int pio_copy_data(struct kvm_vcpu *vcpu) 2964 { 2965 void *p = vcpu->arch.pio_data; 2966 gva_t q = vcpu->arch.pio.guest_gva; 2967 unsigned bytes; 2968 int ret; 2969 2970 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2971 if (vcpu->arch.pio.in) 2972 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2973 else 2974 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2975 return ret; 2976 } 2977 2978 int complete_pio(struct kvm_vcpu *vcpu) 2979 { 2980 struct kvm_pio_request *io = &vcpu->arch.pio; 2981 long delta; 2982 int r; 2983 unsigned long val; 2984 2985 if (!io->string) { 2986 if (io->in) { 2987 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2988 memcpy(&val, vcpu->arch.pio_data, io->size); 2989 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2990 } 2991 } else { 2992 if (io->in) { 2993 r = pio_copy_data(vcpu); 2994 if (r) 2995 return r; 2996 } 2997 2998 delta = 1; 2999 if (io->rep) { 3000 delta *= io->cur_count; 3001 /* 3002 * The size of the register should really depend on 3003 * current address size. 
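                         * (With a 16-bit address size, for instance, only CX
                         * should be decremented; this updates the full ECX/RCX
                         * value instead.)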
3004 */ 3005 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 3006 val -= delta; 3007 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 3008 } 3009 if (io->down) 3010 delta = -delta; 3011 delta *= io->size; 3012 if (io->in) { 3013 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 3014 val += delta; 3015 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 3016 } else { 3017 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 3018 val += delta; 3019 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3020 } 3021 } 3022 3023 io->count -= io->cur_count; 3024 io->cur_count = 0; 3025 3026 return 0; 3027 } 3028 3029 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3030 { 3031 /* TODO: String I/O for in kernel device */ 3032 int r; 3033 3034 if (vcpu->arch.pio.in) 3035 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3036 vcpu->arch.pio.size, pd); 3037 else 3038 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3039 vcpu->arch.pio.size, pd); 3040 return r; 3041 } 3042 3043 static int pio_string_write(struct kvm_vcpu *vcpu) 3044 { 3045 struct kvm_pio_request *io = &vcpu->arch.pio; 3046 void *pd = vcpu->arch.pio_data; 3047 int i, r = 0; 3048 3049 for (i = 0; i < io->cur_count; i++) { 3050 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3051 io->port, io->size, pd)) { 3052 r = -EOPNOTSUPP; 3053 break; 3054 } 3055 pd += io->size; 3056 } 3057 return r; 3058 } 3059 3060 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) 3061 { 3062 unsigned long val; 3063 3064 vcpu->run->exit_reason = KVM_EXIT_IO; 3065 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3066 vcpu->run->io.size = vcpu->arch.pio.size = size; 3067 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3068 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 3069 vcpu->run->io.port = vcpu->arch.pio.port = port; 3070 vcpu->arch.pio.in = in; 3071 vcpu->arch.pio.string = 0; 3072 vcpu->arch.pio.down = 0; 3073 vcpu->arch.pio.rep = 0; 3074 3075 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3076 size, 1); 3077 3078 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3079 memcpy(vcpu->arch.pio_data, &val, 4); 3080 3081 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3082 complete_pio(vcpu); 3083 return 1; 3084 } 3085 return 0; 3086 } 3087 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3088 3089 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3090 int size, unsigned long count, int down, 3091 gva_t address, int rep, unsigned port) 3092 { 3093 unsigned now, in_page; 3094 int ret = 0; 3095 3096 vcpu->run->exit_reason = KVM_EXIT_IO; 3097 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3098 vcpu->run->io.size = vcpu->arch.pio.size = size; 3099 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3100 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 3101 vcpu->run->io.port = vcpu->arch.pio.port = port; 3102 vcpu->arch.pio.in = in; 3103 vcpu->arch.pio.string = 1; 3104 vcpu->arch.pio.down = down; 3105 vcpu->arch.pio.rep = rep; 3106 3107 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3108 size, count); 3109 3110 if (!count) { 3111 kvm_x86_ops->skip_emulated_instruction(vcpu); 3112 return 1; 3113 } 3114 3115 if (!down) 3116 in_page = PAGE_SIZE - offset_in_page(address); 3117 else 3118 in_page = offset_in_page(address) + size; 3119 now = min(count, (unsigned long)in_page / size); 3120 if (!now) 3121 now = 1; 3122 if (down) { 3123 /* 3124 * String I/O in reverse. Yuck. Kill the guest, fix later. 
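                 * ("down" means the guest ran the string instruction with
                 * EFLAGS.DF set, e.g. after "std", so the address decrements
                 * on each iteration.)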
                 */
                pr_unimpl(vcpu, "guest string pio down\n");
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
        vcpu->run->io.count = now;
        vcpu->arch.pio.cur_count = now;

        if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
                kvm_x86_ops->skip_emulated_instruction(vcpu);

        vcpu->arch.pio.guest_gva = address;

        if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
                if (ret == X86EMUL_PROPAGATE_FAULT) {
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }
                if (ret == 0 && !pio_string_write(vcpu)) {
                        complete_pio(vcpu);
                        if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
        }
        /* no string PIO read support yet */

        return ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);

static void bounce_off(void *info)
{
        /* nothing */
}

static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                     void *data)
{
        struct cpufreq_freqs *freq = data;
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i, send_ipi = 0;

        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
                return 0;
        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
                return 0;
        per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;

        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != freq->cpu)
                                continue;
                        if (!kvm_request_guest_time_update(vcpu))
                                continue;
                        if (vcpu->cpu != smp_processor_id())
                                send_ipi++;
                }
        }
        spin_unlock(&kvm_lock);

        if (freq->old < freq->new && send_ipi) {
                /*
                 * We are scaling the frequency up. We must make sure the
                 * guest doesn't see old kvmclock values while running with
                 * the new frequency; otherwise we risk the guest seeing
                 * time go backwards.
                 *
                 * If we are updating the frequency for another cpu
                 * (which might be in guest context), send an interrupt
                 * to kick that cpu out of guest context. The next time
                 * guest context is entered, kvmclock will be updated,
                 * so the guest will not see stale values.
3201 */ 3202 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 3203 } 3204 return 0; 3205 } 3206 3207 static struct notifier_block kvmclock_cpufreq_notifier_block = { 3208 .notifier_call = kvmclock_cpufreq_notifier 3209 }; 3210 3211 static void kvm_timer_init(void) 3212 { 3213 int cpu; 3214 3215 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 3216 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 3217 CPUFREQ_TRANSITION_NOTIFIER); 3218 for_each_online_cpu(cpu) { 3219 unsigned long khz = cpufreq_get(cpu); 3220 if (!khz) 3221 khz = tsc_khz; 3222 per_cpu(cpu_tsc_khz, cpu) = khz; 3223 } 3224 } else { 3225 for_each_possible_cpu(cpu) 3226 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 3227 } 3228 } 3229 3230 int kvm_arch_init(void *opaque) 3231 { 3232 int r; 3233 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3234 3235 if (kvm_x86_ops) { 3236 printk(KERN_ERR "kvm: already loaded the other module\n"); 3237 r = -EEXIST; 3238 goto out; 3239 } 3240 3241 if (!ops->cpu_has_kvm_support()) { 3242 printk(KERN_ERR "kvm: no hardware support\n"); 3243 r = -EOPNOTSUPP; 3244 goto out; 3245 } 3246 if (ops->disabled_by_bios()) { 3247 printk(KERN_ERR "kvm: disabled by bios\n"); 3248 r = -EOPNOTSUPP; 3249 goto out; 3250 } 3251 3252 r = kvm_mmu_module_init(); 3253 if (r) 3254 goto out; 3255 3256 kvm_init_msr_list(); 3257 3258 kvm_x86_ops = ops; 3259 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 3260 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 3261 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3262 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3263 3264 kvm_timer_init(); 3265 3266 return 0; 3267 3268 out: 3269 return r; 3270 } 3271 3272 void kvm_arch_exit(void) 3273 { 3274 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 3275 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 3276 CPUFREQ_TRANSITION_NOTIFIER); 3277 kvm_x86_ops = NULL; 3278 kvm_mmu_module_exit(); 3279 } 3280 3281 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3282 { 3283 ++vcpu->stat.halt_exits; 3284 if (irqchip_in_kernel(vcpu->kvm)) { 3285 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3286 return 1; 3287 } else { 3288 vcpu->run->exit_reason = KVM_EXIT_HLT; 3289 return 0; 3290 } 3291 } 3292 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 3293 3294 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 3295 unsigned long a1) 3296 { 3297 if (is_long_mode(vcpu)) 3298 return a0; 3299 else 3300 return a0 | ((gpa_t)a1 << 32); 3301 } 3302 3303 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3304 { 3305 unsigned long nr, a0, a1, a2, a3, ret; 3306 int r = 1; 3307 3308 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3309 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3310 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3311 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3312 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3313 3314 trace_kvm_hypercall(nr, a0, a1, a2, a3); 3315 3316 if (!is_long_mode(vcpu)) { 3317 nr &= 0xFFFFFFFF; 3318 a0 &= 0xFFFFFFFF; 3319 a1 &= 0xFFFFFFFF; 3320 a2 &= 0xFFFFFFFF; 3321 a3 &= 0xFFFFFFFF; 3322 } 3323 3324 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 3325 ret = -KVM_EPERM; 3326 goto out; 3327 } 3328 3329 switch (nr) { 3330 case KVM_HC_VAPIC_POLL_IRQ: 3331 ret = 0; 3332 break; 3333 case KVM_HC_MMU_OP: 3334 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 3335 break; 3336 default: 3337 ret = -KVM_ENOSYS; 3338 break; 3339 } 3340 out: 3341 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3342 ++vcpu->stat.hypercalls; 3343 return r; 3344 } 3345 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 3346 3347 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 
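/*
 * For reference, the guest-visible hypercall ABI handled by
 * kvm_emulate_hypercall() above: the hypercall number is passed in RAX and
 * up to four arguments in RBX, RCX, RDX and RSI; the return value comes back
 * in RAX. A minimal guest-side sketch (illustrative only; it assumes a 64-bit
 * guest and uses VMCALL, the instruction choice that kvm_fix_hypercall()
 * below patches to match the host CPU):
 *
 *      static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
 *      {
 *              long ret;
 *              asm volatile("vmcall" : "=a"(ret) : "a"(nr), "b"(p1) : "memory");
 *              return ret;
 *      }
 */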
3348 { 3349 char instruction[3]; 3350 int ret = 0; 3351 unsigned long rip = kvm_rip_read(vcpu); 3352 3353 3354 /* 3355 * Blow out the MMU to ensure that no other VCPU has an active mapping 3356 * to ensure that the updated hypercall appears atomically across all 3357 * VCPUs. 3358 */ 3359 kvm_mmu_zap_all(vcpu->kvm); 3360 3361 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3362 if (emulator_write_emulated(rip, instruction, 3, vcpu) 3363 != X86EMUL_CONTINUE) 3364 ret = -EFAULT; 3365 3366 return ret; 3367 } 3368 3369 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3370 { 3371 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 3372 } 3373 3374 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3375 { 3376 struct descriptor_table dt = { limit, base }; 3377 3378 kvm_x86_ops->set_gdt(vcpu, &dt); 3379 } 3380 3381 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3382 { 3383 struct descriptor_table dt = { limit, base }; 3384 3385 kvm_x86_ops->set_idt(vcpu, &dt); 3386 } 3387 3388 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 3389 unsigned long *rflags) 3390 { 3391 kvm_lmsw(vcpu, msw); 3392 *rflags = kvm_get_rflags(vcpu); 3393 } 3394 3395 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3396 { 3397 unsigned long value; 3398 3399 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3400 switch (cr) { 3401 case 0: 3402 value = vcpu->arch.cr0; 3403 break; 3404 case 2: 3405 value = vcpu->arch.cr2; 3406 break; 3407 case 3: 3408 value = vcpu->arch.cr3; 3409 break; 3410 case 4: 3411 value = vcpu->arch.cr4; 3412 break; 3413 case 8: 3414 value = kvm_get_cr8(vcpu); 3415 break; 3416 default: 3417 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3418 return 0; 3419 } 3420 3421 return value; 3422 } 3423 3424 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3425 unsigned long *rflags) 3426 { 3427 switch (cr) { 3428 case 0: 3429 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3430 *rflags = kvm_get_rflags(vcpu); 3431 break; 3432 case 2: 3433 vcpu->arch.cr2 = val; 3434 break; 3435 case 3: 3436 kvm_set_cr3(vcpu, val); 3437 break; 3438 case 4: 3439 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 3440 break; 3441 case 8: 3442 kvm_set_cr8(vcpu, val & 0xfUL); 3443 break; 3444 default: 3445 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3446 } 3447 } 3448 3449 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 3450 { 3451 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 3452 int j, nent = vcpu->arch.cpuid_nent; 3453 3454 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 3455 /* when no next entry is found, the current entry[i] is reselected */ 3456 for (j = i + 1; ; j = (j + 1) % nent) { 3457 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 3458 if (ej->function == e->function) { 3459 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 3460 return j; 3461 } 3462 } 3463 return 0; /* silence gcc, even though control never reaches here */ 3464 } 3465 3466 /* find an entry with matching function, matching index (if needed), and that 3467 * should be read next (if it's stateful) */ 3468 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3469 u32 function, u32 index) 3470 { 3471 if (e->function != function) 3472 return 0; 3473 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3474 return 0; 3475 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3476 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3477 return 0; 3478 return 1; 3479 } 3480 3481 struct 
kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3482 u32 function, u32 index) 3483 { 3484 int i; 3485 struct kvm_cpuid_entry2 *best = NULL; 3486 3487 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3488 struct kvm_cpuid_entry2 *e; 3489 3490 e = &vcpu->arch.cpuid_entries[i]; 3491 if (is_matching_cpuid_entry(e, function, index)) { 3492 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 3493 move_to_next_stateful_cpuid_entry(vcpu, i); 3494 best = e; 3495 break; 3496 } 3497 /* 3498 * Both basic or both extended? 3499 */ 3500 if (((e->function ^ function) & 0x80000000) == 0) 3501 if (!best || e->function > best->function) 3502 best = e; 3503 } 3504 return best; 3505 } 3506 3507 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3508 { 3509 struct kvm_cpuid_entry2 *best; 3510 3511 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3512 if (best) 3513 return best->eax & 0xff; 3514 return 36; 3515 } 3516 3517 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3518 { 3519 u32 function, index; 3520 struct kvm_cpuid_entry2 *best; 3521 3522 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3523 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3524 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3525 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3526 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3527 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3528 best = kvm_find_cpuid_entry(vcpu, function, index); 3529 if (best) { 3530 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3531 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3532 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3533 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3534 } 3535 kvm_x86_ops->skip_emulated_instruction(vcpu); 3536 trace_kvm_cpuid(function, 3537 kvm_register_read(vcpu, VCPU_REGS_RAX), 3538 kvm_register_read(vcpu, VCPU_REGS_RBX), 3539 kvm_register_read(vcpu, VCPU_REGS_RCX), 3540 kvm_register_read(vcpu, VCPU_REGS_RDX)); 3541 } 3542 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3543 3544 /* 3545 * Check if userspace requested an interrupt window, and that the 3546 * interrupt window is open. 3547 * 3548 * No need to exit to userspace if we already have an interrupt queued. 
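 * In other words, exit only when the irqchip is emulated in userspace,
 * userspace asked for an interrupt window, and the guest is currently able
 * to accept an interrupt.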
3549 */ 3550 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 3551 { 3552 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3553 vcpu->run->request_interrupt_window && 3554 kvm_arch_interrupt_allowed(vcpu)); 3555 } 3556 3557 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 3558 { 3559 struct kvm_run *kvm_run = vcpu->run; 3560 3561 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3562 kvm_run->cr8 = kvm_get_cr8(vcpu); 3563 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3564 if (irqchip_in_kernel(vcpu->kvm)) 3565 kvm_run->ready_for_interrupt_injection = 1; 3566 else 3567 kvm_run->ready_for_interrupt_injection = 3568 kvm_arch_interrupt_allowed(vcpu) && 3569 !kvm_cpu_has_interrupt(vcpu) && 3570 !kvm_event_needs_reinjection(vcpu); 3571 } 3572 3573 static void vapic_enter(struct kvm_vcpu *vcpu) 3574 { 3575 struct kvm_lapic *apic = vcpu->arch.apic; 3576 struct page *page; 3577 3578 if (!apic || !apic->vapic_addr) 3579 return; 3580 3581 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3582 3583 vcpu->arch.apic->vapic_page = page; 3584 } 3585 3586 static void vapic_exit(struct kvm_vcpu *vcpu) 3587 { 3588 struct kvm_lapic *apic = vcpu->arch.apic; 3589 3590 if (!apic || !apic->vapic_addr) 3591 return; 3592 3593 down_read(&vcpu->kvm->slots_lock); 3594 kvm_release_page_dirty(apic->vapic_page); 3595 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3596 up_read(&vcpu->kvm->slots_lock); 3597 } 3598 3599 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3600 { 3601 int max_irr, tpr; 3602 3603 if (!kvm_x86_ops->update_cr8_intercept) 3604 return; 3605 3606 if (!vcpu->arch.apic) 3607 return; 3608 3609 if (!vcpu->arch.apic->vapic_addr) 3610 max_irr = kvm_lapic_find_highest_irr(vcpu); 3611 else 3612 max_irr = -1; 3613 3614 if (max_irr != -1) 3615 max_irr >>= 4; 3616 3617 tpr = kvm_lapic_get_cr8(vcpu); 3618 3619 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3620 } 3621 3622 static void inject_pending_event(struct kvm_vcpu *vcpu) 3623 { 3624 /* try to reinject previous events if any */ 3625 if (vcpu->arch.exception.pending) { 3626 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 3627 vcpu->arch.exception.has_error_code, 3628 vcpu->arch.exception.error_code); 3629 return; 3630 } 3631 3632 if (vcpu->arch.nmi_injected) { 3633 kvm_x86_ops->set_nmi(vcpu); 3634 return; 3635 } 3636 3637 if (vcpu->arch.interrupt.pending) { 3638 kvm_x86_ops->set_irq(vcpu); 3639 return; 3640 } 3641 3642 /* try to inject new event if pending */ 3643 if (vcpu->arch.nmi_pending) { 3644 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3645 vcpu->arch.nmi_pending = false; 3646 vcpu->arch.nmi_injected = true; 3647 kvm_x86_ops->set_nmi(vcpu); 3648 } 3649 } else if (kvm_cpu_has_interrupt(vcpu)) { 3650 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3651 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3652 false); 3653 kvm_x86_ops->set_irq(vcpu); 3654 } 3655 } 3656 } 3657 3658 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 3659 { 3660 int r; 3661 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3662 vcpu->run->request_interrupt_window; 3663 3664 if (vcpu->requests) 3665 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3666 kvm_mmu_unload(vcpu); 3667 3668 r = kvm_mmu_reload(vcpu); 3669 if (unlikely(r)) 3670 goto out; 3671 3672 if (vcpu->requests) { 3673 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3674 __kvm_migrate_timers(vcpu); 3675 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3676 
kvm_write_guest_time(vcpu); 3677 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3678 kvm_mmu_sync_roots(vcpu); 3679 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3680 kvm_x86_ops->tlb_flush(vcpu); 3681 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3682 &vcpu->requests)) { 3683 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 3684 r = 0; 3685 goto out; 3686 } 3687 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3688 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 3689 r = 0; 3690 goto out; 3691 } 3692 } 3693 3694 preempt_disable(); 3695 3696 kvm_x86_ops->prepare_guest_switch(vcpu); 3697 kvm_load_guest_fpu(vcpu); 3698 3699 local_irq_disable(); 3700 3701 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3702 smp_mb__after_clear_bit(); 3703 3704 if (vcpu->requests || need_resched() || signal_pending(current)) { 3705 set_bit(KVM_REQ_KICK, &vcpu->requests); 3706 local_irq_enable(); 3707 preempt_enable(); 3708 r = 1; 3709 goto out; 3710 } 3711 3712 inject_pending_event(vcpu); 3713 3714 /* enable NMI/IRQ window open exits if needed */ 3715 if (vcpu->arch.nmi_pending) 3716 kvm_x86_ops->enable_nmi_window(vcpu); 3717 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3718 kvm_x86_ops->enable_irq_window(vcpu); 3719 3720 if (kvm_lapic_enabled(vcpu)) { 3721 update_cr8_intercept(vcpu); 3722 kvm_lapic_sync_to_vapic(vcpu); 3723 } 3724 3725 up_read(&vcpu->kvm->slots_lock); 3726 3727 kvm_guest_enter(); 3728 3729 if (unlikely(vcpu->arch.switch_db_regs)) { 3730 set_debugreg(0, 7); 3731 set_debugreg(vcpu->arch.eff_db[0], 0); 3732 set_debugreg(vcpu->arch.eff_db[1], 1); 3733 set_debugreg(vcpu->arch.eff_db[2], 2); 3734 set_debugreg(vcpu->arch.eff_db[3], 3); 3735 } 3736 3737 trace_kvm_entry(vcpu->vcpu_id); 3738 kvm_x86_ops->run(vcpu); 3739 3740 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 3741 set_debugreg(current->thread.debugreg0, 0); 3742 set_debugreg(current->thread.debugreg1, 1); 3743 set_debugreg(current->thread.debugreg2, 2); 3744 set_debugreg(current->thread.debugreg3, 3); 3745 set_debugreg(current->thread.debugreg6, 6); 3746 set_debugreg(current->thread.debugreg7, 7); 3747 } 3748 3749 set_bit(KVM_REQ_KICK, &vcpu->requests); 3750 local_irq_enable(); 3751 3752 ++vcpu->stat.exits; 3753 3754 /* 3755 * We must have an instruction between local_irq_enable() and 3756 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3757 * the interrupt shadow. The stat.exits increment will do nicely. 
3758 * But we need to prevent reordering, hence this barrier(): 3759 */ 3760 barrier(); 3761 3762 kvm_guest_exit(); 3763 3764 preempt_enable(); 3765 3766 down_read(&vcpu->kvm->slots_lock); 3767 3768 /* 3769 * Profile KVM exit RIPs: 3770 */ 3771 if (unlikely(prof_on == KVM_PROFILING)) { 3772 unsigned long rip = kvm_rip_read(vcpu); 3773 profile_hit(KVM_PROFILING, (void *)rip); 3774 } 3775 3776 3777 kvm_lapic_sync_from_vapic(vcpu); 3778 3779 r = kvm_x86_ops->handle_exit(vcpu); 3780 out: 3781 return r; 3782 } 3783 3784 3785 static int __vcpu_run(struct kvm_vcpu *vcpu) 3786 { 3787 int r; 3788 3789 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3790 pr_debug("vcpu %d received sipi with vector # %x\n", 3791 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3792 kvm_lapic_reset(vcpu); 3793 r = kvm_arch_vcpu_reset(vcpu); 3794 if (r) 3795 return r; 3796 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3797 } 3798 3799 down_read(&vcpu->kvm->slots_lock); 3800 vapic_enter(vcpu); 3801 3802 r = 1; 3803 while (r > 0) { 3804 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3805 r = vcpu_enter_guest(vcpu); 3806 else { 3807 up_read(&vcpu->kvm->slots_lock); 3808 kvm_vcpu_block(vcpu); 3809 down_read(&vcpu->kvm->slots_lock); 3810 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3811 { 3812 switch(vcpu->arch.mp_state) { 3813 case KVM_MP_STATE_HALTED: 3814 vcpu->arch.mp_state = 3815 KVM_MP_STATE_RUNNABLE; 3816 case KVM_MP_STATE_RUNNABLE: 3817 break; 3818 case KVM_MP_STATE_SIPI_RECEIVED: 3819 default: 3820 r = -EINTR; 3821 break; 3822 } 3823 } 3824 } 3825 3826 if (r <= 0) 3827 break; 3828 3829 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3830 if (kvm_cpu_has_pending_timer(vcpu)) 3831 kvm_inject_pending_timer_irqs(vcpu); 3832 3833 if (dm_request_for_irq_injection(vcpu)) { 3834 r = -EINTR; 3835 vcpu->run->exit_reason = KVM_EXIT_INTR; 3836 ++vcpu->stat.request_irq_exits; 3837 } 3838 if (signal_pending(current)) { 3839 r = -EINTR; 3840 vcpu->run->exit_reason = KVM_EXIT_INTR; 3841 ++vcpu->stat.signal_exits; 3842 } 3843 if (need_resched()) { 3844 up_read(&vcpu->kvm->slots_lock); 3845 kvm_resched(vcpu); 3846 down_read(&vcpu->kvm->slots_lock); 3847 } 3848 } 3849 3850 up_read(&vcpu->kvm->slots_lock); 3851 post_kvm_run_save(vcpu); 3852 3853 vapic_exit(vcpu); 3854 3855 return r; 3856 } 3857 3858 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3859 { 3860 int r; 3861 sigset_t sigsaved; 3862 3863 vcpu_load(vcpu); 3864 3865 if (vcpu->sigset_active) 3866 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3867 3868 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3869 kvm_vcpu_block(vcpu); 3870 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3871 r = -EAGAIN; 3872 goto out; 3873 } 3874 3875 /* re-sync apic's tpr */ 3876 if (!irqchip_in_kernel(vcpu->kvm)) 3877 kvm_set_cr8(vcpu, kvm_run->cr8); 3878 3879 if (vcpu->arch.pio.cur_count) { 3880 r = complete_pio(vcpu); 3881 if (r) 3882 goto out; 3883 } 3884 if (vcpu->mmio_needed) { 3885 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3886 vcpu->mmio_read_completed = 1; 3887 vcpu->mmio_needed = 0; 3888 3889 down_read(&vcpu->kvm->slots_lock); 3890 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 3891 EMULTYPE_NO_DECODE); 3892 up_read(&vcpu->kvm->slots_lock); 3893 if (r == EMULATE_DO_MMIO) { 3894 /* 3895 * Read-modify-write. Back to userspace. 
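                         * The read half has just been completed from
                         * kvm_run->mmio.data, but emulating the instruction
                         * produced another MMIO access (e.g. an "add" to an
                         * MMIO location still needs its write half), so exit
                         * to userspace again with a new KVM_EXIT_MMIO.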
3896 */ 3897 r = 0; 3898 goto out; 3899 } 3900 } 3901 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3902 kvm_register_write(vcpu, VCPU_REGS_RAX, 3903 kvm_run->hypercall.ret); 3904 3905 r = __vcpu_run(vcpu); 3906 3907 out: 3908 if (vcpu->sigset_active) 3909 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3910 3911 vcpu_put(vcpu); 3912 return r; 3913 } 3914 3915 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3916 { 3917 vcpu_load(vcpu); 3918 3919 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3920 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3921 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3922 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3923 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3924 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3925 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3926 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3927 #ifdef CONFIG_X86_64 3928 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3929 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3930 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3931 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3932 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3933 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3934 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3935 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3936 #endif 3937 3938 regs->rip = kvm_rip_read(vcpu); 3939 regs->rflags = kvm_get_rflags(vcpu); 3940 3941 vcpu_put(vcpu); 3942 3943 return 0; 3944 } 3945 3946 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3947 { 3948 vcpu_load(vcpu); 3949 3950 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3951 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3952 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3953 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3954 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3955 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3956 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3957 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3958 #ifdef CONFIG_X86_64 3959 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3960 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3961 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3962 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3963 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3964 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3965 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3966 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3967 #endif 3968 3969 kvm_rip_write(vcpu, regs->rip); 3970 kvm_set_rflags(vcpu, regs->rflags); 3971 3972 vcpu->arch.exception.pending = false; 3973 3974 vcpu_put(vcpu); 3975 3976 return 0; 3977 } 3978 3979 void kvm_get_segment(struct kvm_vcpu *vcpu, 3980 struct kvm_segment *var, int seg) 3981 { 3982 kvm_x86_ops->get_segment(vcpu, var, seg); 3983 } 3984 3985 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3986 { 3987 struct kvm_segment cs; 3988 3989 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3990 *db = cs.db; 3991 *l = cs.l; 3992 } 3993 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3994 3995 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3996 struct kvm_sregs *sregs) 3997 { 3998 struct descriptor_table dt; 3999 4000 vcpu_load(vcpu); 4001 4002 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4003 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4004 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4005 
kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4006 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4007 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4008 4009 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4010 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4011 4012 kvm_x86_ops->get_idt(vcpu, &dt); 4013 sregs->idt.limit = dt.limit; 4014 sregs->idt.base = dt.base; 4015 kvm_x86_ops->get_gdt(vcpu, &dt); 4016 sregs->gdt.limit = dt.limit; 4017 sregs->gdt.base = dt.base; 4018 4019 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4020 sregs->cr0 = vcpu->arch.cr0; 4021 sregs->cr2 = vcpu->arch.cr2; 4022 sregs->cr3 = vcpu->arch.cr3; 4023 sregs->cr4 = vcpu->arch.cr4; 4024 sregs->cr8 = kvm_get_cr8(vcpu); 4025 sregs->efer = vcpu->arch.shadow_efer; 4026 sregs->apic_base = kvm_get_apic_base(vcpu); 4027 4028 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4029 4030 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 4031 set_bit(vcpu->arch.interrupt.nr, 4032 (unsigned long *)sregs->interrupt_bitmap); 4033 4034 vcpu_put(vcpu); 4035 4036 return 0; 4037 } 4038 4039 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 4040 struct kvm_mp_state *mp_state) 4041 { 4042 vcpu_load(vcpu); 4043 mp_state->mp_state = vcpu->arch.mp_state; 4044 vcpu_put(vcpu); 4045 return 0; 4046 } 4047 4048 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 4049 struct kvm_mp_state *mp_state) 4050 { 4051 vcpu_load(vcpu); 4052 vcpu->arch.mp_state = mp_state->mp_state; 4053 vcpu_put(vcpu); 4054 return 0; 4055 } 4056 4057 static void kvm_set_segment(struct kvm_vcpu *vcpu, 4058 struct kvm_segment *var, int seg) 4059 { 4060 kvm_x86_ops->set_segment(vcpu, var, seg); 4061 } 4062 4063 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 4064 struct kvm_segment *kvm_desct) 4065 { 4066 kvm_desct->base = get_desc_base(seg_desc); 4067 kvm_desct->limit = get_desc_limit(seg_desc); 4068 if (seg_desc->g) { 4069 kvm_desct->limit <<= 12; 4070 kvm_desct->limit |= 0xfff; 4071 } 4072 kvm_desct->selector = selector; 4073 kvm_desct->type = seg_desc->type; 4074 kvm_desct->present = seg_desc->p; 4075 kvm_desct->dpl = seg_desc->dpl; 4076 kvm_desct->db = seg_desc->d; 4077 kvm_desct->s = seg_desc->s; 4078 kvm_desct->l = seg_desc->l; 4079 kvm_desct->g = seg_desc->g; 4080 kvm_desct->avl = seg_desc->avl; 4081 if (!selector) 4082 kvm_desct->unusable = 1; 4083 else 4084 kvm_desct->unusable = 0; 4085 kvm_desct->padding = 0; 4086 } 4087 4088 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 4089 u16 selector, 4090 struct descriptor_table *dtable) 4091 { 4092 if (selector & 1 << 2) { 4093 struct kvm_segment kvm_seg; 4094 4095 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 4096 4097 if (kvm_seg.unusable) 4098 dtable->limit = 0; 4099 else 4100 dtable->limit = kvm_seg.limit; 4101 dtable->base = kvm_seg.base; 4102 } 4103 else 4104 kvm_x86_ops->get_gdt(vcpu, dtable); 4105 } 4106 4107 /* allowed just for 8 bytes segments */ 4108 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4109 struct desc_struct *seg_desc) 4110 { 4111 struct descriptor_table dtable; 4112 u16 index = selector >> 3; 4113 4114 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4115 4116 if (dtable.limit < index * 8 + 7) { 4117 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4118 return 1; 4119 } 4120 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4121 } 4122 4123 /* allowed just for 8 bytes segments */ 4124 static int 
save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4125 struct desc_struct *seg_desc) 4126 { 4127 struct descriptor_table dtable; 4128 u16 index = selector >> 3; 4129 4130 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4131 4132 if (dtable.limit < index * 8 + 7) 4133 return 1; 4134 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4135 } 4136 4137 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4138 struct desc_struct *seg_desc) 4139 { 4140 u32 base_addr = get_desc_base(seg_desc); 4141 4142 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4143 } 4144 4145 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4146 { 4147 struct kvm_segment kvm_seg; 4148 4149 kvm_get_segment(vcpu, &kvm_seg, seg); 4150 return kvm_seg.selector; 4151 } 4152 4153 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 4154 u16 selector, 4155 struct kvm_segment *kvm_seg) 4156 { 4157 struct desc_struct seg_desc; 4158 4159 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 4160 return 1; 4161 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 4162 return 0; 4163 } 4164 4165 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4166 { 4167 struct kvm_segment segvar = { 4168 .base = selector << 4, 4169 .limit = 0xffff, 4170 .selector = selector, 4171 .type = 3, 4172 .present = 1, 4173 .dpl = 3, 4174 .db = 0, 4175 .s = 1, 4176 .l = 0, 4177 .g = 0, 4178 .avl = 0, 4179 .unusable = 0, 4180 }; 4181 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4182 return 0; 4183 } 4184 4185 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4186 { 4187 return (seg != VCPU_SREG_LDTR) && 4188 (seg != VCPU_SREG_TR) && 4189 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); 4190 } 4191 4192 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4193 int type_bits, int seg) 4194 { 4195 struct kvm_segment kvm_seg; 4196 4197 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4198 return kvm_load_realmode_segment(vcpu, selector, seg); 4199 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4200 return 1; 4201 kvm_seg.type |= type_bits; 4202 4203 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4204 seg != VCPU_SREG_LDTR) 4205 if (!kvm_seg.s) 4206 kvm_seg.unusable = 1; 4207 4208 kvm_set_segment(vcpu, &kvm_seg, seg); 4209 return 0; 4210 } 4211 4212 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4213 struct tss_segment_32 *tss) 4214 { 4215 tss->cr3 = vcpu->arch.cr3; 4216 tss->eip = kvm_rip_read(vcpu); 4217 tss->eflags = kvm_get_rflags(vcpu); 4218 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4219 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4220 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4221 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4222 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4223 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4224 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4225 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4226 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4227 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4228 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4229 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4230 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 4231 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 4232 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4233 } 4234 4235 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4236 
static void save_state_to_tss32(struct kvm_vcpu *vcpu,
				struct tss_segment_32 *tss)
{
	tss->cr3 = vcpu->arch.cr3;
	tss->eip = kvm_rip_read(vcpu);
	tss->eflags = kvm_get_rflags(vcpu);
	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
}

static int load_state_from_tss32(struct kvm_vcpu *vcpu,
				 struct tss_segment_32 *tss)
{
	kvm_set_cr3(vcpu, tss->cr3);

	kvm_rip_write(vcpu, tss->eip);
	kvm_set_rflags(vcpu, tss->eflags | 2);

	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);

	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
		return 1;
	return 0;
}

static void save_state_to_tss16(struct kvm_vcpu *vcpu,
				struct tss_segment_16 *tss)
{
	tss->ip = kvm_rip_read(vcpu);
	tss->flag = kvm_get_rflags(vcpu);
	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);

	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
}

static int load_state_from_tss16(struct kvm_vcpu *vcpu,
				 struct tss_segment_16 *tss)
{
	kvm_rip_write(vcpu, tss->ip);
	kvm_set_rflags(vcpu, tss->flag | 2);
	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);

	if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
		return 1;

	if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
		return 1;
	return 0;
}
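/*
 * kvm_task_switch_16/32 implement the common task-switch sequence for the
 * two TSS formats: write the outgoing state into the old TSS, read the new
 * TSS, chain the old selector into prev_task_link when the caller asks for
 * it (old_tss_sel != 0xffff, i.e. CALL and gate switches), and finally load
 * the incoming state.
 */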
static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
			      u16 old_tss_sel, u32 old_tss_base,
			      struct desc_struct *nseg_desc)
{
	struct tss_segment_16 tss_segment_16;
	int ret = 0;

	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
			   sizeof tss_segment_16))
		goto out;

	save_state_to_tss16(vcpu, &tss_segment_16);

	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
			    sizeof tss_segment_16))
		goto out;

	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
			   &tss_segment_16, sizeof tss_segment_16))
		goto out;

	if (old_tss_sel != 0xffff) {
		tss_segment_16.prev_task_link = old_tss_sel;

		if (kvm_write_guest(vcpu->kvm,
				    get_tss_base_addr(vcpu, nseg_desc),
				    &tss_segment_16.prev_task_link,
				    sizeof tss_segment_16.prev_task_link))
			goto out;
	}

	if (load_state_from_tss16(vcpu, &tss_segment_16))
		goto out;

	ret = 1;
out:
	return ret;
}

static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
			      u16 old_tss_sel, u32 old_tss_base,
			      struct desc_struct *nseg_desc)
{
	struct tss_segment_32 tss_segment_32;
	int ret = 0;

	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
			   sizeof tss_segment_32))
		goto out;

	save_state_to_tss32(vcpu, &tss_segment_32);

	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
			    sizeof tss_segment_32))
		goto out;

	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
			   &tss_segment_32, sizeof tss_segment_32))
		goto out;

	if (old_tss_sel != 0xffff) {
		tss_segment_32.prev_task_link = old_tss_sel;

		if (kvm_write_guest(vcpu->kvm,
				    get_tss_base_addr(vcpu, nseg_desc),
				    &tss_segment_32.prev_task_link,
				    sizeof tss_segment_32.prev_task_link))
			goto out;
	}

	if (load_state_from_tss32(vcpu, &tss_segment_32))
		goto out;

	ret = 1;
out:
	return ret;
}
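/*
 * Software emulation of a hardware task switch.  'reason' is one of the
 * TASK_SWITCH_* values (IRET, JMP, CALL or interrupt/exception gate);
 * privilege and TSS-limit checks are performed here, while the actual
 * state transfer is delegated to kvm_task_switch_16/32 above.
 */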
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
{
	struct kvm_segment tr_seg;
	struct desc_struct cseg_desc;
	struct desc_struct nseg_desc;
	int ret = 0;
	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);

	old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);

	/* FIXME: Handle errors. Failure to read either TSS or their
	 * descriptors should generate a pagefault.
	 */
	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
		goto out;

	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
		goto out;

	if (reason != TASK_SWITCH_IRET) {
		int cpl;

		cpl = kvm_x86_ops->get_cpl(vcpu);
		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
	}

	if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
		return 1;
	}

	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
		cseg_desc.type &= ~(1 << 1); /* clear the B flag */
		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
	}

	if (reason == TASK_SWITCH_IRET) {
		u32 eflags = kvm_get_rflags(vcpu);
		kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
	}

	/* set back link to prev task only if NT bit is set in eflags;
	   note that old_tss_sel is not used after this point */
	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
		old_tss_sel = 0xffff;

	if (nseg_desc.type & 8)
		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
					 old_tss_base, &nseg_desc);
	else
		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
					 old_tss_base, &nseg_desc);

	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
		u32 eflags = kvm_get_rflags(vcpu);
		kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
	}

	if (reason != TASK_SWITCH_IRET) {
		nseg_desc.type |= (1 << 1);
		save_guest_segment_descriptor(vcpu, tss_selector,
					      &nseg_desc);
	}

	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
	tr_seg.type = 11;
	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
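/*
 * KVM_SET_SREGS: install the segment, descriptor-table and control
 * register state supplied by userspace.  mmu_reset_needed tracks whether
 * cr0/cr3/cr4/efer changed in a way that invalidates the cached MMU
 * context, in which case kvm_mmu_reset_context() is called.
 */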
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int mmu_reset_needed = 0;
	int pending_vec, max_bits;
	struct descriptor_table dt;

	vcpu_load(vcpu);

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_x86_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_x86_ops->set_gdt(vcpu, &dt);

	vcpu->arch.cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
	vcpu->arch.cr3 = sregs->cr3;

	kvm_set_cr8(vcpu, sregs->cr8);

	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
	kvm_x86_ops->set_efer(vcpu, sregs->efer);
	kvm_set_apic_base(vcpu, sregs->apic_base);

	kvm_x86_ops->decache_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
	vcpu->arch.cr0 = sregs->cr0;

	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->arch.cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
	pending_vec = find_first_bit(
		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
	if (pending_vec < max_bits) {
		kvm_queue_interrupt(vcpu, pending_vec, false);
		pr_debug("Set back pending irq %d\n", pending_vec);
		if (irqchip_in_kernel(vcpu->kvm))
			kvm_pic_clear_isr_ack(vcpu->kvm);
	}

	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	update_cr8_intercept(vcpu);

	/* Older userspace won't unhalt the vcpu on reset. */
	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
	    !(vcpu->arch.cr0 & X86_CR0_PE))
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	vcpu_put(vcpu);

	return 0;
}
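/*
 * KVM_SET_GUEST_DEBUG: hardware breakpoints requested by userspace are
 * copied into eff_db[], while single-stepping is implemented by filtering
 * TF/RF in kvm_get_rflags() and re-injecting them in kvm_set_rflags()
 * (see the end of this file).
 */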
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
					struct kvm_guest_debug *dbg)
{
	unsigned long rflags;
	int i;

	vcpu_load(vcpu);

	/*
	 * Read rflags as long as potentially injected trace flags are still
	 * filtered out.
	 */
	rflags = kvm_get_rflags(vcpu);

	vcpu->guest_debug = dbg->control;
	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
		vcpu->guest_debug = 0;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
		for (i = 0; i < KVM_NR_DB_REGS; ++i)
			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
		vcpu->arch.switch_db_regs =
			(dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
	} else {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
	}

	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
		vcpu->arch.singlestep_cs =
			get_segment_selector(vcpu, VCPU_SREG_CS);
		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
	}

	/*
	 * Trigger an rflags update that will inject or remove the trace
	 * flags.
	 */
	kvm_set_rflags(vcpu, rflags);

	kvm_x86_ops->set_guest_debug(vcpu, dbg);

	if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_DB)
		kvm_queue_exception(vcpu, DB_VECTOR);
	else if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_BP)
		kvm_queue_exception(vcpu, BP_VECTOR);

	vcpu_put(vcpu);

	return 0;
}

/*
 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
 * we have asm/x86/processor.h
 */
struct fxsave {
	u16	cwd;
	u16	swd;
	u16	twd;
	u16	fop;
	u64	rip;
	u64	rdp;
	u32	mxcsr;
	u32	mxcsr_mask;
	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
#ifdef CONFIG_X86_64
	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
#else
	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
#endif
};

/*
 * Translate a guest virtual address to a guest physical address.
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				  struct kvm_translation *tr)
{
	unsigned long vaddr = tr->linear_address;
	gpa_t gpa;

	vcpu_load(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
	up_read(&vcpu->kvm->slots_lock);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	tr->writeable = 1;
	tr->usermode = 0;
	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fpu->fpr, fxsave->st_space, 128);
	fpu->fcw = fxsave->cwd;
	fpu->fsw = fxsave->swd;
	fpu->ftwx = fxsave->twd;
	fpu->last_opcode = fxsave->fop;
	fpu->last_ip = fxsave->rip;
	fpu->last_dp = fxsave->rdp;
	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fxsave->st_space, fpu->fpr, 128);
	fxsave->cwd = fpu->fcw;
	fxsave->swd = fpu->fsw;
	fxsave->twd = fpu->ftwx;
	fxsave->fop = fpu->last_opcode;
	fxsave->rip = fpu->last_ip;
	fxsave->rdp = fpu->last_dp;
	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}
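/*
 * fx_init() below builds the initial guest FPU image: it saves the host
 * state, runs finit, snapshots that clean state into guest_fx_image and
 * then restores the host state, all with preemption disabled so the two
 * images cannot be clobbered in between.
 */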
void fx_init(struct kvm_vcpu *vcpu)
{
	unsigned after_mxcsr_mask;

	/*
	 * Touch the fpu the first time in a non-atomic context: if this is
	 * the first fpu instruction, the exception handler will fire before
	 * the instruction returns and will have to allocate ram with
	 * GFP_KERNEL.
	 */
	if (!used_math())
		kvm_fx_save(&vcpu->arch.host_fx_image);

	/* Initialize guest FPU by resetting ours and saving into guest's */
	preempt_disable();
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_finit();
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	preempt_enable();

	vcpu->arch.cr0 |= X86_CR0_ET;
	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);

void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 1;
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_restore(&vcpu->arch.guest_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 0;
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	++vcpu->stat.fpu_reload;
}
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
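/*
 * The hooks below cover the vcpu lifecycle: creation is delegated to the
 * vendor module through kvm_x86_ops, kvm_arch_vcpu_setup() performs the
 * initial reset and MMU setup, and the free/destroy paths drop the
 * per-vcpu pages and vendor state again.
 */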
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.time_page) {
		kvm_release_page_dirty(vcpu->arch.time_page);
		vcpu->arch.time_page = NULL;
	}

	kvm_x86_ops->vcpu_free(vcpu);
}

struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
				      unsigned int id)
{
	return kvm_x86_ops->vcpu_create(kvm, id);
}

int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
	int r;

	/* We do fxsave: this must be aligned. */
	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);

	vcpu->arch.mtrr_state.have_fixed = 1;
	vcpu_load(vcpu);
	r = kvm_arch_vcpu_reset(vcpu);
	if (r == 0)
		r = kvm_mmu_setup(vcpu);
	vcpu_put(vcpu);
	if (r < 0)
		goto free_vcpu;

	return 0;
free_vcpu:
	kvm_x86_ops->vcpu_free(vcpu);
	return r;
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);

	kvm_x86_ops->vcpu_free(vcpu);
}

int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = false;
	vcpu->arch.nmi_injected = false;

	vcpu->arch.switch_db_regs = 0;
	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
	vcpu->arch.dr6 = DR6_FIXED_1;
	vcpu->arch.dr7 = DR7_FIXED_1;

	return kvm_x86_ops->vcpu_reset(vcpu);
}

int kvm_arch_hardware_enable(void *garbage)
{
	/*
	 * Since this may be called from a hotplug notification,
	 * we can't get the CPU frequency directly.
	 */
	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
		int cpu = raw_smp_processor_id();
		per_cpu(cpu_tsc_khz, cpu) = 0;
	}
	return kvm_x86_ops->hardware_enable(garbage);
}

void kvm_arch_hardware_disable(void *garbage)
{
	kvm_x86_ops->hardware_disable(garbage);
}

int kvm_arch_hardware_setup(void)
{
	return kvm_x86_ops->hardware_setup();
}

void kvm_arch_hardware_unsetup(void)
{
	kvm_x86_ops->hardware_unsetup();
}

void kvm_arch_check_processor_compat(void *rtn)
{
	kvm_x86_ops->check_processor_compatibility(rtn);
}
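/*
 * kvm_arch_vcpu_init() allocates the per-vcpu pieces in order (pio_data
 * page, MMU, in-kernel lapic when the irqchip is emulated in the kernel,
 * MCE bank array); the fail_* labels unwind the earlier allocations on
 * error.
 */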
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
	struct page *page;
	struct kvm *kvm;
	int r;

	BUG_ON(vcpu->kvm == NULL);
	kvm = vcpu->kvm;

	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	else
		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->arch.pio_data = page_address(page);

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto fail_free_pio_data;

	if (irqchip_in_kernel(kvm)) {
		r = kvm_create_lapic(vcpu);
		if (r < 0)
			goto fail_mmu_destroy;
	}

	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
				       GFP_KERNEL);
	if (!vcpu->arch.mce_banks) {
		r = -ENOMEM;
		goto fail_mmu_destroy;
	}
	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;

	return 0;

fail_mmu_destroy:
	kvm_mmu_destroy(vcpu);
fail_free_pio_data:
	free_page((unsigned long)vcpu->arch.pio_data);
fail:
	return r;
}

void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_free_lapic(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	kvm_mmu_destroy(vcpu);
	up_read(&vcpu->kvm->slots_lock);
	free_page((unsigned long)vcpu->arch.pio_data);
}

struct kvm *kvm_arch_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

	rdtscll(kvm->arch.vm_init_tsc);

	return kvm;
}

static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}

static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;
	struct kvm_vcpu *vcpu;

	/*
	 * Unpin any mmu pages first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_unload_vcpu_mmu(vcpu);
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_arch_vcpu_free(vcpu);

	mutex_lock(&kvm->lock);
	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
		kvm->vcpus[i] = NULL;

	atomic_set(&kvm->online_vcpus, 0);
	mutex_unlock(&kvm->lock);
}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kvm_free_pit(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
	kfree(kvm);
}

int kvm_arch_set_memory_region(struct kvm *kvm,
			       struct kvm_userspace_memory_region *mem,
			       struct kvm_memory_slot old,
			       int user_alloc)
{
	int npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

	/*
	 * To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	spin_lock(&kvm->mmu_lock);
	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	spin_unlock(&kvm->mmu_lock);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending ||
		(kvm_arch_interrupt_allowed(vcpu) &&
		 kvm_cpu_has_interrupt(vcpu));
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
			smp_send_reschedule(cpu);
	put_cpu();
}

int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	return kvm_x86_ops->interrupt_allowed(vcpu);
}
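/*
 * rflags accessors: while KVM_GUESTDBG_SINGLESTEP is active, TF/RF are
 * managed by the host-side debugger, so kvm_get_rflags() hides them from
 * readers and kvm_set_rflags() re-injects them as long as the vcpu is
 * still sitting on the cs:rip recorded when single-stepping was enabled.
 */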
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
	unsigned long rflags;

	rflags = kvm_x86_ops->get_rflags(vcpu);
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
	return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);

void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
	    vcpu->arch.singlestep_cs ==
			get_segment_selector(vcpu, VCPU_SREG_CS) &&
	    vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
	kvm_x86_ops->set_rflags(vcpu, rflags);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);

EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);