/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE \
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)

#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P

/* EFER defaults:
 * - enable syscall by default because it is emulated by KVM
 * - enable LME and LMA by default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif
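
/*
 * Worked example (editorial, not in the original source): on 64-bit
 * the complement of 0xfffffffffffffafe is 0x501, i.e. bits 0, 8 and
 * 10, so only EFER.SCE (bit 0), EFER.LME (bit 8) and EFER.LMA (bit
 * 10) are writable by default.  On 32-bit only EFER.SCE is.  Further
 * bits (e.g. EFER.NX) can be opened up at runtime via
 * kvm_enable_efer_bits() below when the host supports them.
 */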
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	kvm_get_gdt(&gdt);
	table_base = gdt.base;

	if (selector & 4) {	/* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	/* The in-kernel and userspace irqchip cases both read the same
	 * cached value. */
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;

	if (vcpu->arch.exception.pending) {
		switch (vcpu->arch.exception.nr) {
		case DF_VECTOR:
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
			return;
		case PF_VECTOR:
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
			return;
		default:
			/* replace the previous exception with the new one in
			   the hope that instruction re-execution will
			   regenerate the lost exception */
			vcpu->arch.exception.pending = false;
			break;
		}
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
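
/*
 * Illustrative scenario (editorial, not in the original source) for
 * the page-fault injection path above: if a #PF is already pending
 * and delivering it faults again, #PF + #PF escalates to #DF, and a
 * fault while a #DF is pending becomes a triple fault, which KVM
 * turns into a shutdown request.  Any other pending exception is
 * simply dropped on the assumption that re-executing the faulting
 * instruction will raise it again.
 */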
/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
				  offset * sizeof(u64), sizeof(pdpte));
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
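
/*
 * Worked example (editorial, not in the original source): in PAE mode
 * cr3 points at a 32-byte-aligned page-directory-pointer table.  For
 * cr3 = 0x12345a0, pdpt_gfn is 0x1234 and the entry offset is
 * ((0x5a0 >> 5) << 2) = 180 u64s, so offset * sizeof(u64) = 0x5a0 --
 * the read above starts at the 32-byte-aligned byte offset of the
 * PDPT within the page and fetches the four 8-byte pdptes.
 */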
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
	bool changed = true;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;

	kvm_mmu_reset_context(vcpu);
	return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = vcpu->arch.cr4;
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
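
/*
 * Note (editorial, not in the original source): with pdptr_bits =
 * PGE | PSE | PAE above, a PAE guest that toggles any of CR4.PGE,
 * CR4.PSE or CR4.PAE while paging is enabled forces the PDPTRs to be
 * revalidated from the current CR3; a failed load_pdptrs() turns the
 * CR4 write into a #GP.
 */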
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
		vcpu->arch.cr3 = cr3;
		vcpu->arch.mmu.new_cr3(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
		return;
	}
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static inline u32 bit(int bitno)
{
	return 1 << (bitno & 31);
}

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu.
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
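
/*
 * Usage sketch (editorial, not part of this file): a vendor module
 * that has verified host support can widen the default-writable EFER
 * set at init time, for example:
 *
 *	if (boot_cpu_has(X86_FEATURE_NX))
 *		kvm_enable_efer_bits(EFER_NX);
 *
 * The exact call sites and conditions live in the vendor modules
 * (svm.c/vmx.c) and may differ from this sketch.
 */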
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	static int version;
	struct pvclock_wall_clock wc;
	struct timespec now, sys, boot;

	if (!wall_clock)
		return;

	version++;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_write_guest_time below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
	 */
	now = current_kernel_time();
	ktime_get_ts(&sys);
	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));

	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}

static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	uint32_t quotient, remainder;

	/* Don't try to replace with do_div(), this one calculates
	 * "(dividend << 32) / divisor" */
	__asm__ ( "divl %4"
		  : "=a" (quotient), "=d" (remainder)
		  : "0" (0), "1" (dividend), "r" (divisor) );
	return quotient;
}

static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
	uint64_t nsecs = 1000000000LL;
	int32_t  shift = 0;
	uint64_t tps64;
	uint32_t tps32;

	tps64 = tsc_khz * 1000LL;
	while (tps64 > nsecs * 2) {
		tps64 >>= 1;
		shift--;
	}

	tps32 = (uint32_t)tps64;
	while (tps32 <= (uint32_t)nsecs) {
		tps32 <<= 1;
		shift++;
	}

	hv_clock->tsc_shift = shift;
	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);

	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
		 __func__, tsc_khz, hv_clock->tsc_shift,
		 hv_clock->tsc_to_system_mul);
}

static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);

static void kvm_write_guest_time(struct kvm_vcpu *v)
{
	struct timespec ts;
	unsigned long flags;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;
	unsigned long this_tsc_khz;

	if (!vcpu->time_page)
		return;

	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
		vcpu->hv_clock_tsc_khz = this_tsc_khz;
	}
	put_cpu_var(cpu_tsc_khz);

	/* Keep irqs disabled to prevent changes to the clock */
	local_irq_save(flags);
	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
	ktime_get_ts(&ts);
	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	vcpu->hv_clock.system_time = ts.tv_nsec +
				     (NSEC_PER_SEC * (u64)ts.tv_sec);
	/*
	 * The interface expects us to write an even number signaling that the
	 * update is finished.  Since the guest won't see the intermediate
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;

	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
	       sizeof(vcpu->hv_clock));

	kunmap_atomic(shared_kaddr, KM_USER0);

	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}

static int kvm_request_guest_time_update(struct kvm_vcpu *v)
{
	struct kvm_vcpu_arch *vcpu = &v->arch;

	if (!vcpu->time_page)
		return 0;
	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
	return 1;
}
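
/*
 * Worked example (editorial, not in the original source): for a 1 GHz
 * TSC, tsc_khz = 1000000 and tps64 = 10^9 ticks per second.  The
 * loops in kvm_set_time_scale() leave shift = 1, tps32 = 2 * 10^9,
 * and tsc_to_system_mul = (10^9 << 32) / (2 * 10^9) = 0x80000000,
 * i.e. 0.5 in 32.32 fixed point.  The guest side then computes
 *
 *	delta = tsc - tsc_timestamp;
 *	delta <<= tsc_shift;	(or >>= -tsc_shift when negative)
 *	ns = (delta * tsc_to_system_mul) >> 32;
 *
 * which here yields ns == tsc delta, as expected at 1 tick/ns.  A
 * guest read is valid only if `version` is even and unchanged across
 * the read, matching the += 2 publication above.
 */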
static bool msr_mtrr_valid(unsigned msr)
{
	switch (msr) {
	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
	case MSR_MTRRfix64K_00000:
	case MSR_MTRRfix16K_80000:
	case MSR_MTRRfix16K_A0000:
	case MSR_MTRRfix4K_C0000:
	case MSR_MTRRfix4K_C8000:
	case MSR_MTRRfix4K_D0000:
	case MSR_MTRRfix4K_D8000:
	case MSR_MTRRfix4K_E0000:
	case MSR_MTRRfix4K_E8000:
	case MSR_MTRRfix4K_F0000:
	case MSR_MTRRfix4K_F8000:
	case MSR_MTRRdefType:
	case MSR_IA32_CR_PAT:
		return true;
	case 0x2f8:
		return true;
	}
	return false;
}

static bool valid_pat_type(unsigned t)
{
	return t < 8 && (1 << t) & 0xf3;	/* 0, 1, 4, 5, 6, 7 */
}

static bool valid_mtrr_type(unsigned t)
{
	return t < 8 && (1 << t) & 0x73;	/* 0, 1, 4, 5, 6 */
}

static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	int i;

	if (!msr_mtrr_valid(msr))
		return false;

	if (msr == MSR_IA32_CR_PAT) {
		for (i = 0; i < 8; i++)
			if (!valid_pat_type((data >> (i * 8)) & 0xff))
				return false;
		return true;
	} else if (msr == MSR_MTRRdefType) {
		if (data & ~0xcff)
			return false;
		return valid_mtrr_type(data & 0xff);
	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
		for (i = 0; i < 8 ; i++)
			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
				return false;
		return true;
	}

	/* variable MTRRs */
	return valid_mtrr_type(data & 0xff);
}

static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;

	if (!mtrr_valid(vcpu, msr, data))
		return 1;

	if (msr == MSR_MTRRdefType) {
		vcpu->arch.mtrr_state.def_type = data;
		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
	} else if (msr == MSR_MTRRfix64K_00000)
		p[0] = data;
	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
		p[1 + msr - MSR_MTRRfix16K_80000] = data;
	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
	else if (msr == MSR_IA32_CR_PAT)
		vcpu->arch.pat = data;
	else {	/* Variable MTRRs */
		int idx, is_mtrr_mask;
		u64 *pt;

		idx = (msr - 0x200) / 2;
		is_mtrr_mask = msr - 0x200 - 2 * idx;
		if (!is_mtrr_mask)
			pt = (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
		else
			pt = (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
		*pt = data;
	}

	kvm_mmu_reset_context(vcpu);
	return 0;
}
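
/*
 * Worked example (editorial, not in the original source): the
 * variable-range MTRRs live at 0x200 + 2n (MTRRphysBasen) and
 * 0x201 + 2n (MTRRphysMaskn).  A write to MSR 0x203 therefore decodes
 * above as idx = (0x203 - 0x200) / 2 = 1 and is_mtrr_mask =
 * 3 - 2 * 1 = 1, i.e. MTRRphysMask1.
 */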
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	u64 mcg_cap = vcpu->arch.mcg_cap;
	unsigned bank_num = mcg_cap & 0xff;

	switch (msr) {
	case MSR_IA32_MCG_STATUS:
		vcpu->arch.mcg_status = data;
		break;
	case MSR_IA32_MCG_CTL:
		if (!(mcg_cap & MCG_CTL_P))
			return 1;
		if (data != 0 && data != ~(u64)0)
			return -1;
		vcpu->arch.mcg_ctl = data;
		break;
	default:
		if (msr >= MSR_IA32_MC0_CTL &&
		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
			u32 offset = msr - MSR_IA32_MC0_CTL;
			/* only 0 or all 1s can be written to IA32_MCi_CTL */
			if ((offset & 0x3) == 0 &&
			    data != 0 && data != ~(u64)0)
				return -1;
			vcpu->arch.mce_banks[offset] = data;
			break;
		}
		return 1;
	}
	return 0;
}

int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
	case MSR_EFER:
		set_efer(vcpu, data);
		break;
	case MSR_K7_HWCR:
		data &= ~(u64)0x40;	/* ignore flush filter disable */
		if (data != 0) {
			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				  data);
			return 1;
		}
		break;
	case MSR_FAM10H_MMIO_CONF_BASE:
		if (data != 0) {
			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
				  "0x%llx\n", data);
			return 1;
		}
		break;
	case MSR_AMD64_NB_CFG:
		break;
	case MSR_IA32_DEBUGCTLMSR:
		if (!data) {
			/* We support the non-activated case already */
			break;
		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
			/* Values other than LBR and BTF are vendor-specific,
			   thus reserved and should throw a #GP */
			return 1;
		}
		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
			  __func__, data);
		break;
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case MSR_VM_HSAVE_PA:
	case MSR_AMD64_PATCH_LOADER:
		break;
	case 0x200 ... 0x2ff:
		return set_msr_mtrr(vcpu, msr, data);
	case MSR_IA32_APICBASE:
		kvm_set_apic_base(vcpu, data);
		break;
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
		return kvm_x2apic_msr_write(vcpu, msr, data);
	case MSR_IA32_MISC_ENABLE:
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
	case MSR_KVM_WALL_CLOCK:
		vcpu->kvm->arch.wall_clock = data;
		kvm_write_wall_clock(vcpu->kvm, data);
		break;
	case MSR_KVM_SYSTEM_TIME: {
		if (vcpu->arch.time_page) {
			kvm_release_page_dirty(vcpu->arch.time_page);
			vcpu->arch.time_page = NULL;
		}

		vcpu->arch.time = data;

		/* we verify if the enable bit is set... */
		if (!(data & 1))
			break;

		/* ...but clean it before doing the actual write */
		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);

		vcpu->arch.time_page =
				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);

		if (is_error_page(vcpu->arch.time_page)) {
			kvm_release_page_clean(vcpu->arch.time_page);
			vcpu->arch.time_page = NULL;
		}

		kvm_request_guest_time_update(vcpu);
		break;
	}
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
		return set_msr_mce(vcpu, msr, data);

	/* Performance counters are not protected by a CPUID bit,
	 * so we should check all of them in the generic path for the sake of
	 * cross vendor migration.
	 * Writing a zero into the event select MSRs disables them,
	 * which we perfectly emulate ;-).  Any other value should be at least
	 * reported, some guests depend on them.
	 */
	case MSR_P6_EVNTSEL0:
	case MSR_P6_EVNTSEL1:
	case MSR_K7_EVNTSEL0:
	case MSR_K7_EVNTSEL1:
	case MSR_K7_EVNTSEL2:
	case MSR_K7_EVNTSEL3:
		if (data != 0)
			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
				  "0x%x data 0x%llx\n", msr, data);
		break;
	/* at least RHEL 4 unconditionally writes to the perfctr registers,
	 * so we ignore writes to make it happy.
	 */
	case MSR_P6_PERFCTR0:
	case MSR_P6_PERFCTR1:
	case MSR_K7_PERFCTR0:
	case MSR_K7_PERFCTR1:
	case MSR_K7_PERFCTR2:
	case MSR_K7_PERFCTR3:
		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
			  "0x%x data 0x%llx\n", msr, data);
		break;
	default:
		if (!ignore_msrs) {
			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
				  msr, data);
			return 1;
		} else {
			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
				  msr, data);
			break;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
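
/*
 * Worked example (editorial, not in the original source): an
 * MSR_KVM_SYSTEM_TIME write of data = 0x12345679 decodes above as
 * enable = data & 1 = 1, gfn = data >> PAGE_SHIFT = 0x12345, and
 * time_offset = data & ~(PAGE_MASK | 1) = 0x678 -- the byte offset of
 * the pvclock structure within that guest page, with the enable bit
 * masked off.
 */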
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}

static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;

	if (!msr_mtrr_valid(msr))
		return 1;

	if (msr == MSR_MTRRdefType)
		*pdata = vcpu->arch.mtrr_state.def_type +
			 (vcpu->arch.mtrr_state.enabled << 10);
	else if (msr == MSR_MTRRfix64K_00000)
		*pdata = p[0];
	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
	else if (msr == MSR_IA32_CR_PAT)
		*pdata = vcpu->arch.pat;
	else {	/* Variable MTRRs */
		int idx, is_mtrr_mask;
		u64 *pt;

		idx = (msr - 0x200) / 2;
		is_mtrr_mask = msr - 0x200 - 2 * idx;
		if (!is_mtrr_mask)
			pt = (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
		else
			pt = (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
		*pdata = *pt;
	}

	return 0;
}

static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;
	u64 mcg_cap = vcpu->arch.mcg_cap;
	unsigned bank_num = mcg_cap & 0xff;

	switch (msr) {
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
		data = 0;
		break;
	case MSR_IA32_MCG_CAP:
		data = vcpu->arch.mcg_cap;
		break;
	case MSR_IA32_MCG_CTL:
		if (!(mcg_cap & MCG_CTL_P))
			return 1;
		data = vcpu->arch.mcg_ctl;
		break;
	case MSR_IA32_MCG_STATUS:
		data = vcpu->arch.mcg_status;
		break;
	default:
		if (msr >= MSR_IA32_MC0_CTL &&
		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
			u32 offset = msr - MSR_IA32_MC0_CTL;
			data = vcpu->arch.mce_banks[offset];
			break;
		}
		return 1;
	}
	*pdata = data;
	return 0;
}
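
/*
 * Worked example (editorial, not in the original source): each MCE
 * bank occupies four MSRs, IA32_MCi_{CTL,STATUS,ADDR,MISC}, starting
 * at MSR_IA32_MC0_CTL (0x400).  Reading MSR 0x409 gives
 * offset = 0x409 - 0x400 = 9, i.e. mce_banks[9], which is bank 2's
 * STATUS register (9 = 2 * 4 + 1).
 */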
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;

	switch (msr) {
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_EBL_CR_POWERON:
	case MSR_IA32_DEBUGCTLMSR:
	case MSR_IA32_LASTBRANCHFROMIP:
	case MSR_IA32_LASTBRANCHTOIP:
	case MSR_IA32_LASTINTFROMIP:
	case MSR_IA32_LASTINTTOIP:
	case MSR_K8_SYSCFG:
	case MSR_K7_HWCR:
	case MSR_VM_HSAVE_PA:
	case MSR_P6_PERFCTR0:
	case MSR_P6_PERFCTR1:
	case MSR_P6_EVNTSEL0:
	case MSR_P6_EVNTSEL1:
	case MSR_K7_EVNTSEL0:
	case MSR_K7_PERFCTR0:
	case MSR_K8_INT_PENDING_MSG:
	case MSR_AMD64_NB_CFG:
	case MSR_FAM10H_MMIO_CONF_BASE:
		data = 0;
		break;
	case MSR_MTRRcap:
		data = 0x500 | KVM_NR_VAR_MTRR;
		break;
	case 0x200 ... 0x2ff:
		return get_msr_mtrr(vcpu, msr, pdata);
	case 0xcd: /* fsb frequency */
		data = 3;
		break;
	case MSR_IA32_APICBASE:
		data = kvm_get_apic_base(vcpu);
		break;
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
		return kvm_x2apic_msr_read(vcpu, msr, pdata);
	case MSR_IA32_MISC_ENABLE:
		data = vcpu->arch.ia32_misc_enable_msr;
		break;
	case MSR_IA32_PERF_STATUS:
		/* TSC increment by tick */
		data = 1000ULL;
		/* CPU multiplier */
		data |= (((uint64_t)4ULL) << 40);
		break;
	case MSR_EFER:
		data = vcpu->arch.shadow_efer;
		break;
	case MSR_KVM_WALL_CLOCK:
		data = vcpu->kvm->arch.wall_clock;
		break;
	case MSR_KVM_SYSTEM_TIME:
		data = vcpu->arch.time;
		break;
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
		return get_msr_mce(vcpu, msr, pdata);
	default:
		if (!ignore_msrs) {
			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
			return 1;
		} else {
			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
			data = 0;
		}
		break;
	}
	*pdata = data;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);

/*
 * Read or write a bunch of msrs.  All parameters are kernel addresses.
 *
 * @return number of msrs set successfully.
 */
static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
		    struct kvm_msr_entry *entries,
		    int (*do_msr)(struct kvm_vcpu *vcpu,
				  unsigned index, u64 *data))
{
	int i;

	vcpu_load(vcpu);

	down_read(&vcpu->kvm->slots_lock);
	for (i = 0; i < msrs->nmsrs; ++i)
		if (do_msr(vcpu, entries[i].index, &entries[i].data))
			break;
	up_read(&vcpu->kvm->slots_lock);

	vcpu_put(vcpu);

	return i;
}
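
/*
 * Usage sketch (editorial, illustrative userspace code, not part of
 * this file): exercising the MSR batch interface backed by __msr_io()
 * above.  `vcpu_fd` is a hypothetical vcpu file descriptor.
 */
#if 0
	struct kvm_msrs *req;

	req = calloc(1, sizeof(*req) + sizeof(struct kvm_msr_entry));
	req->nmsrs = 1;
	req->entries[0].index = 0xc0000080;	/* MSR_EFER */

	/* the ioctl returns the number of MSRs processed; 1 means success */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, req) == 1)
		printf("EFER = 0x%llx\n",
		       (unsigned long long)req->entries[0].data);
	free(req);
#endif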
/*
 * Read or write a bunch of msrs.  Parameters are user addresses.
 *
 * @return number of msrs set successfully.
 */
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
		  int (*do_msr)(struct kvm_vcpu *vcpu,
				unsigned index, u64 *data),
		  int writeback)
{
	struct kvm_msrs msrs;
	struct kvm_msr_entry *entries;
	int r, n;
	unsigned size;

	r = -EFAULT;
	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
		goto out;

	r = -E2BIG;
	if (msrs.nmsrs >= MAX_IO_MSRS)
		goto out;

	r = -ENOMEM;
	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
	entries = vmalloc(size);
	if (!entries)
		goto out;

	r = -EFAULT;
	if (copy_from_user(entries, user_msrs->entries, size))
		goto out_free;

	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
	if (r < 0)
		goto out_free;

	r = -EFAULT;
	if (writeback && copy_to_user(user_msrs->entries, entries, size))
		goto out_free;

	r = n;

out_free:
	vfree(entries);
out:
	return r;
}

int kvm_dev_ioctl_check_extension(long ext)
{
	int r;

	switch (ext) {
	case KVM_CAP_IRQCHIP:
	case KVM_CAP_HLT:
	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
	case KVM_CAP_SET_TSS_ADDR:
	case KVM_CAP_EXT_CPUID:
	case KVM_CAP_CLOCKSOURCE:
	case KVM_CAP_PIT:
	case KVM_CAP_NOP_IO_DELAY:
	case KVM_CAP_MP_STATE:
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_REINJECT_CONTROL:
	case KVM_CAP_IRQ_INJECT_STATUS:
	case KVM_CAP_ASSIGN_DEV_IRQ:
	case KVM_CAP_IRQFD:
	case KVM_CAP_IOEVENTFD:
	case KVM_CAP_PIT2:
	case KVM_CAP_PIT_STATE2:
	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
		r = 1;
		break;
	case KVM_CAP_COALESCED_MMIO:
		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
		break;
	case KVM_CAP_VAPIC:
		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
		break;
	case KVM_CAP_NR_VCPUS:
		r = KVM_MAX_VCPUS;
		break;
	case KVM_CAP_NR_MEMSLOTS:
		r = KVM_MEMORY_SLOTS;
		break;
	case KVM_CAP_PV_MMU:
		r = !tdp_enabled;
		break;
	case KVM_CAP_IOMMU:
		r = iommu_found();
		break;
	case KVM_CAP_MCE:
		r = KVM_MAX_MCE_BANKS;
		break;
	default:
		r = 0;
		break;
	}
	return r;
}
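
/*
 * Usage sketch (editorial, illustrative userspace code, not part of
 * this file): capabilities are probed on the /dev/kvm fd before they
 * are relied on.  `setup_mce_banks` is a hypothetical helper.
 */
#if 0
	int kvm_fd = open("/dev/kvm", O_RDWR);

	/* returns 0 when unsupported; KVM_CAP_MCE reports
	 * KVM_MAX_MCE_BANKS here */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MCE) > 0)
		setup_mce_banks();	/* hypothetical helper */
#endif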
long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	long r;

	switch (ioctl) {
	case KVM_GET_MSR_INDEX_LIST: {
		struct kvm_msr_list __user *user_msr_list = argp;
		struct kvm_msr_list msr_list;
		unsigned n;

		r = -EFAULT;
		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
			goto out;
		n = msr_list.nmsrs;
		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
			goto out;
		r = -E2BIG;
		if (n < msr_list.nmsrs)
			goto out;
		r = -EFAULT;
		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				 num_msrs_to_save * sizeof(u32)))
			goto out;
		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				 &emulated_msrs,
				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SUPPORTED_CPUID: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
						      cpuid_arg->entries);
		if (r)
			goto out;

		r = -EFAULT;
		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
			goto out;
		r = 0;
		break;
	}
	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
		u64 mce_cap;

		mce_cap = KVM_MCE_CAP_SUPPORTED;
		r = -EFAULT;
		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
			goto out;
		r = 0;
		break;
	}
	default:
		r = -EINVAL;
	}
out:
	return r;
}

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_x86_ops->vcpu_load(vcpu, cpu);
	kvm_request_guest_time_update(vcpu);
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->vcpu_put(vcpu);
	kvm_put_guest_fpu(vcpu);
}

static int is_efer_nx(void)
{
	unsigned long long efer = 0;

	rdmsrl_safe(MSR_EFER, &efer);
	return efer & EFER_NX;
}

static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_cpuid_entry2 *e, *entry;

	entry = NULL;
	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];
		if (e->function == 0x80000001) {
			entry = e;
			break;
		}
	}
	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
		entry->edx &= ~(1 << 20);
		printk(KERN_INFO "kvm: guest NX capability removed\n");
	}
}
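
/*
 * Note (editorial, not in the original source): bit 20 of CPUID
 * 0x80000001.EDX is the NX feature flag, so the fixup above hides NX
 * from the guest whenever the host runs with EFER.NX clear (e.g. a
 * 32-bit host kernel without PAE), keeping guest page-table
 * expectations consistent with the host MMU.
 */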
/*
 * Legacy interface: an old userspace passes cpuid entries in the
 * original KVM_SET_CPUID format to a newer kernel module.
 */
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *cpuid_entries;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -ENOMEM;
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
	if (!cpuid_entries)
		goto out;
	r = -EFAULT;
	if (copy_from_user(cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
		goto out_free;
	for (i = 0; i < cpuid->nent; i++) {
		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
		vcpu->arch.cpuid_entries[i].index = 0;
		vcpu->arch.cpuid_entries[i].flags = 0;
		vcpu->arch.cpuid_entries[i].padding[0] = 0;
		vcpu->arch.cpuid_entries[i].padding[1] = 0;
		vcpu->arch.cpuid_entries[i].padding[2] = 0;
	}
	vcpu->arch.cpuid_nent = cpuid->nent;
	cpuid_fix_nx_cap(vcpu);
	r = 0;
	kvm_apic_set_version(vcpu);

out_free:
	vfree(cpuid_entries);
out:
	return r;
}

static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	vcpu->arch.cpuid_nent = cpuid->nent;
	kvm_apic_set_version(vcpu);
	return 0;

out:
	return r;
}

static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}

static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			   u32 index)
{
	entry->function = function;
	entry->index = index;
	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
	entry->flags = 0;
}

#define F(x) bit(X86_FEATURE_##x)
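
/*
 * Note (editorial, not in the original source): F(x) expands to
 * bit(X86_FEATURE_x).  Kernel feature numbers encode
 * (word * 32 + bit), and bit() masks with & 31, so e.g.
 * X86_FEATURE_NX (1 * 32 + 20) becomes 1 << 20 -- the NX bit within
 * its own 32-bit cpuid word.  The masks below are therefore per-word.
 */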
static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			 u32 index, int *nent, int maxnent)
{
	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
	unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
#ifdef CONFIG_X86_64
	unsigned f_lm = F(LM);
#else
	unsigned f_lm = 0;
#endif

	/* cpuid 1.edx */
	const u32 kvm_supported_word0_x86_features =
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
		0 /* Reserved, DS, ACPI */ | F(MMX) |
		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
		0 /* HTT, TM, Reserved, PBE */;
	/* cpuid 0x80000001.edx */
	const u32 kvm_supported_word1_x86_features =
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* Reserved */ |
		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
		F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
	/* cpuid 1.ecx */
	const u32 kvm_supported_word4_x86_features =
		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
		0 /* DS-CPL, VMX, SMX, EST */ |
		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
		0 /* Reserved, DCA */ | F(XMM4_1) |
		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
		0 /* Reserved, XSAVE, OSXSAVE */;
	/* cpuid 0x80000001.ecx */
	const u32 kvm_supported_word6_x86_features =
		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
		0 /* SKINIT */ | 0 /* WDT */;

	/* all calls to cpuid_count() should be made on the same cpu */
	get_cpu();
	do_cpuid_1_ent(entry, function, index);
	++*nent;

	switch (function) {
	case 0:
		entry->eax = min(entry->eax, (u32)0xb);
		break;
	case 1:
		entry->edx &= kvm_supported_word0_x86_features;
		entry->ecx &= kvm_supported_word4_x86_features;
		/* we support x2apic emulation even if host does not support
		 * it since we emulate x2apic in software */
		entry->ecx |= F(X2APIC);
		break;
	/* function 2 entries are STATEFUL.  That is, repeated cpuid commands
	 * may return different values.  This forces us to get_cpu() before
	 * issuing the first command, and also to emulate this annoying
	 * behavior in kvm_emulate_cpuid() using
	 * KVM_CPUID_FLAG_STATE_READ_NEXT */
	case 2: {
		int t, times = entry->eax & 0xff;

		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
		for (t = 1; t < times && *nent < maxnent; ++t) {
			do_cpuid_1_ent(&entry[t], function, 0);
			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
			++*nent;
		}
		break;
	}
	/* function 4 and 0xb have additional index. */
	case 4: {
		int i, cache_type;

		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		/* read more entries until cache_type is zero */
		for (i = 1; *nent < maxnent; ++i) {
			cache_type = entry[i - 1].eax & 0x1f;
			if (!cache_type)
				break;
			do_cpuid_1_ent(&entry[i], function, i);
			entry[i].flags |=
			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
			++*nent;
		}
		break;
	}
	case 0xb: {
		int i, level_type;

		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		/* read more entries until level_type is zero */
		for (i = 1; *nent < maxnent; ++i) {
			level_type = entry[i - 1].ecx & 0xff00;
			if (!level_type)
				break;
			do_cpuid_1_ent(&entry[i], function, i);
			entry[i].flags |=
			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
			++*nent;
		}
		break;
	}
	case 0x80000000:
		entry->eax = min(entry->eax, 0x8000001a);
		break;
	case 0x80000001:
		entry->edx &= kvm_supported_word1_x86_features;
		entry->ecx &= kvm_supported_word6_x86_features;
		break;
	}
	put_cpu();
}

#undef F

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
					     struct kvm_cpuid_entry2 __user *entries)
{
	struct kvm_cpuid_entry2 *cpuid_entries;
	int limit, nent = 0, r = -E2BIG;
	u32 func;

	if (cpuid->nent < 1)
		goto out;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
	r = -ENOMEM;
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
	if (!cpuid_entries)
		goto out;

	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
	limit = cpuid_entries[0].eax;
	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
		do_cpuid_ent(&cpuid_entries[nent], func, 0,
			     &nent, cpuid->nent);
	r = -E2BIG;
	if (nent >= cpuid->nent)
		goto out_free;

	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
	limit = cpuid_entries[nent - 1].eax;
	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
		do_cpuid_ent(&cpuid_entries[nent], func, 0,
			     &nent, cpuid->nent);
	r = -E2BIG;
	if (nent >= cpuid->nent)
		goto out_free;

	r = -EFAULT;
	if (copy_to_user(entries, cpuid_entries,
			 nent * sizeof(struct kvm_cpuid_entry2)))
		goto out_free;
	cpuid->nent = nent;
	r = 0;

out_free:
	vfree(cpuid_entries);
out:
	return r;
}
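
/*
 * Usage sketch (editorial, illustrative userspace code, not part of
 * this file): since the function above fails with -E2BIG when the
 * buffer is too small, callers typically retry with a bigger nent.
 * `kvm_fd` is a hypothetical /dev/kvm file descriptor.
 */
#if 0
	struct kvm_cpuid2 *cpuid;
	int nent = 64;

	for (;;) {
		cpuid = calloc(1, sizeof(*cpuid) +
				  nent * sizeof(struct kvm_cpuid_entry2));
		cpuid->nent = nent;
		if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
			break;	/* cpuid->nent now holds the count */
		free(cpuid);
		if (errno != E2BIG)
			exit(1);
		nent *= 2;
	}
#endif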
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
				    struct kvm_lapic_state *s)
{
	vcpu_load(vcpu);
	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
				    struct kvm_lapic_state *s)
{
	vcpu_load(vcpu);
	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
	kvm_apic_post_state_restore(vcpu);
	update_cr8_intercept(vcpu);
	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
				    struct kvm_interrupt *irq)
{
	if (irq->irq < 0 || irq->irq >= 256)
		return -EINVAL;
	if (irqchip_in_kernel(vcpu->kvm))
		return -ENXIO;
	vcpu_load(vcpu);

	kvm_queue_interrupt(vcpu, irq->irq, false);

	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_inject_nmi(vcpu);
	vcpu_put(vcpu);

	return 0;
}

static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
					   struct kvm_tpr_access_ctl *tac)
{
	if (tac->flags)
		return -EINVAL;
	vcpu->arch.tpr_access_reporting = !!tac->enabled;
	return 0;
}

static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
					u64 mcg_cap)
{
	int r;
	unsigned bank_num = mcg_cap & 0xff, bank;

	r = -EINVAL;
	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
		goto out;
	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
		goto out;
	r = 0;
	vcpu->arch.mcg_cap = mcg_cap;
	/* Init IA32_MCG_CTL to all 1s */
	if (mcg_cap & MCG_CTL_P)
		vcpu->arch.mcg_ctl = ~(u64)0;
	/* Init IA32_MCi_CTL to all 1s */
	for (bank = 0; bank < bank_num; bank++)
		vcpu->arch.mce_banks[bank * 4] = ~(u64)0;
out:
	return r;
}

static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
				      struct kvm_x86_mce *mce)
{
	u64 mcg_cap = vcpu->arch.mcg_cap;
	unsigned bank_num = mcg_cap & 0xff;
	u64 *banks = vcpu->arch.mce_banks;

	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
		return -EINVAL;
	/*
	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
	 * reporting is disabled
	 */
	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
	    vcpu->arch.mcg_ctl != ~(u64)0)
		return 0;
	banks += 4 * mce->bank;
	/*
	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
	 * reporting is disabled for the bank
	 */
	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
		return 0;
	if (mce->status & MCI_STATUS_UC) {
		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
			printk(KERN_DEBUG "kvm: set_mce: "
			       "injects mce exception while "
			       "previous one is in progress!\n");
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
			return 0;
		}
		if (banks[1] & MCI_STATUS_VAL)
			mce->status |= MCI_STATUS_OVER;
		banks[2] = mce->addr;
		banks[3] = mce->misc;
		vcpu->arch.mcg_status = mce->mcg_status;
		banks[1] = mce->status;
		kvm_queue_exception(vcpu, MC_VECTOR);
	} else if (!(banks[1] & MCI_STATUS_VAL)
		   || !(banks[1] & MCI_STATUS_UC)) {
		if (banks[1] & MCI_STATUS_VAL)
			mce->status |= MCI_STATUS_OVER;
		banks[2] = mce->addr;
		banks[3] = mce->misc;
		banks[1] = mce->status;
	} else
		banks[1] |= MCI_STATUS_OVER;
	return 0;
}
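
/*
 * Worked example (editorial, not in the original source): in
 * kvm_vcpu_ioctl_x86_setup_mce() above, mcg_cap = 0x10a asks for 10
 * banks (low byte 0x0a) with MCG_CTL_P (bit 8) set and is accepted,
 * because only the bank count, MCG_CTL_P and bits 16-23 may be set.
 * A request for 0 banks or >= KVM_MAX_MCE_BANKS is rejected.
 */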
long kvm_arch_vcpu_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_lapic_state *lapic = NULL;

	switch (ioctl) {
	case KVM_GET_LAPIC: {
		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);

		r = -ENOMEM;
		if (!lapic)
			goto out;
		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_LAPIC: {
		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
		r = -ENOMEM;
		if (!lapic)
			goto out;
		r = -EFAULT;
		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
			goto out;
		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_INTERRUPT: {
		struct kvm_interrupt irq;

		r = -EFAULT;
		if (copy_from_user(&irq, argp, sizeof irq))
			goto out;
		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_NMI: {
		r = kvm_vcpu_ioctl_nmi(vcpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_CPUID: {
		struct kvm_cpuid __user *cpuid_arg = argp;
		struct kvm_cpuid cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_CPUID2: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
					      cpuid_arg->entries);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_CPUID2: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
					      cpuid_arg->entries);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MSRS:
		r = msr_io(vcpu, argp, kvm_get_msr, 1);
		break;
	case KVM_SET_MSRS:
		r = msr_io(vcpu, argp, do_set_msr, 0);
		break;
	case KVM_TPR_ACCESS_REPORTING: {
		struct kvm_tpr_access_ctl tac;

		r = -EFAULT;
		if (copy_from_user(&tac, argp, sizeof tac))
			goto out;
		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tac, sizeof tac))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_VAPIC_ADDR: {
		struct kvm_vapic_addr va;

		r = -EINVAL;
		if (!irqchip_in_kernel(vcpu->kvm))
			goto out;
		r = -EFAULT;
		if (copy_from_user(&va, argp, sizeof va))
			goto out;
		r = 0;
		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
		break;
	}
	case KVM_X86_SETUP_MCE: {
		u64 mcg_cap;

		r = -EFAULT;
		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
			goto out;
		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
		break;
	}
	case KVM_X86_SET_MCE: {
		struct kvm_x86_mce mce;

		r = -EFAULT;
		if (copy_from_user(&mce, argp, sizeof mce))
			goto out;
		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
		break;
	}
	default:
		r = -EINVAL;
	}
out:
	kfree(lapic);
	return r;
}

static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
	int ret;

	if (addr > (unsigned int)(-3 * PAGE_SIZE))
		return -1;
	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
	return ret;
}

static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
					      u64 ident_addr)
{
	kvm->arch.ept_identity_map_addr = ident_addr;
	return 0;
}

static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
					 u32 kvm_nr_mmu_pages)
{
	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
		return -EINVAL;

	down_write(&kvm->slots_lock);
	spin_lock(&kvm->mmu_lock);

	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;

	spin_unlock(&kvm->mmu_lock);
	up_write(&kvm->slots_lock);
	return 0;
}

static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
	return kvm->arch.n_alloc_mmu_pages;
}

gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_mem_alias *alias;

	for (i = 0; i < kvm->arch.naliases; ++i) {
		alias = &kvm->arch.aliases[i];
		if (gfn >= alias->base_gfn
		    && gfn < alias->base_gfn + alias->npages)
			return alias->target_gfn + gfn - alias->base_gfn;
	}
	return gfn;
}

/*
 * Set a new alias region.  Aliases map a portion of physical memory into
 * another portion.  This is useful for memory windows, for example the PC
 * VGA region.
 */
static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
					 struct kvm_memory_alias *alias)
{
	int r, n;
	struct kvm_mem_alias *p;

	r = -EINVAL;
	/* General sanity checks */
	if (alias->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (alias->slot >= KVM_ALIAS_SLOTS)
		goto out;
	if (alias->guest_phys_addr + alias->memory_size
	    < alias->guest_phys_addr)
		goto out;
	if (alias->target_phys_addr + alias->memory_size
	    < alias->target_phys_addr)
		goto out;

	down_write(&kvm->slots_lock);
	spin_lock(&kvm->mmu_lock);

	p = &kvm->arch.aliases[alias->slot];
	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
	p->npages = alias->memory_size >> PAGE_SHIFT;
	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;

	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
		if (kvm->arch.aliases[n - 1].npages)
			break;
	kvm->arch.naliases = n;

	spin_unlock(&kvm->mmu_lock);
	kvm_mmu_zap_all(kvm);

	up_write(&kvm->slots_lock);

	return 0;

out:
	return r;
}
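
/*
 * Worked example (editorial, not in the original source): an alias
 * with guest_phys_addr = 0xa0000, memory_size = 0x10000 and
 * target_phys_addr = 0xc0000000 makes unalias_gfn() map gfns
 * 0xa0-0xaf onto 0xc0000-0xc000f, so guest accesses to the VGA window
 * are redirected to the backing pages.
 */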
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
	int r;

	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_PIC_MASTER:
		memcpy(&chip->chip.pic,
		       &pic_irqchip(kvm)->pics[0],
		       sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		memcpy(&chip->chip.pic,
		       &pic_irqchip(kvm)->pics[1],
		       sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_IOAPIC:
		memcpy(&chip->chip.ioapic,
		       ioapic_irqchip(kvm),
		       sizeof(struct kvm_ioapic_state));
		break;
	default:
		r = -EINVAL;
		break;
	}
	return r;
}

static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
	int r;

	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_PIC_MASTER:
		spin_lock(&pic_irqchip(kvm)->lock);
		memcpy(&pic_irqchip(kvm)->pics[0],
		       &chip->chip.pic,
		       sizeof(struct kvm_pic_state));
		spin_unlock(&pic_irqchip(kvm)->lock);
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		spin_lock(&pic_irqchip(kvm)->lock);
		memcpy(&pic_irqchip(kvm)->pics[1],
		       &chip->chip.pic,
		       sizeof(struct kvm_pic_state));
		spin_unlock(&pic_irqchip(kvm)->lock);
		break;
	case KVM_IRQCHIP_IOAPIC:
		mutex_lock(&kvm->irq_lock);
		memcpy(ioapic_irqchip(kvm),
		       &chip->chip.ioapic,
		       sizeof(struct kvm_ioapic_state));
		mutex_unlock(&kvm->irq_lock);
		break;
	default:
		r = -EINVAL;
		break;
	}
	kvm_pic_update_irq(pic_irqchip(kvm));
	return r;
}

static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
	int r = 0;

	mutex_lock(&kvm->arch.vpit->pit_state.lock);
	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
	return r;
}

static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
	int r = 0;

	mutex_lock(&kvm->arch.vpit->pit_state.lock);
	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
	kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
	return r;
}

static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
	int r = 0;

	mutex_lock(&kvm->arch.vpit->pit_state.lock);
	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
	       sizeof(ps->channels));
	ps->flags = kvm->arch.vpit->pit_state.flags;
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
	return r;
}

static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
	int r = 0, start = 0;
	u32 prev_legacy, cur_legacy;

	mutex_lock(&kvm->arch.vpit->pit_state.lock);
	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
	if (!prev_legacy && cur_legacy)
		start = 1;
	memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
	       sizeof(kvm->arch.vpit->pit_state.channels));
	kvm->arch.vpit->pit_state.flags = ps->flags;
	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
	return r;
}
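
/*
 * Note (editorial, not in the original source): the start flag above
 * is 1 only on a 0 -> 1 transition of KVM_PIT_FLAGS_HPET_LEGACY, i.e.
 * when userspace switches the PIT into HPET legacy-replacement mode;
 * restoring state that is already in (or out of) that mode reloads
 * channel 0's count without that transition being signalled.
 */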
*/ 2166 if (is_dirty) { 2167 spin_lock(&kvm->mmu_lock); 2168 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2169 spin_unlock(&kvm->mmu_lock); 2170 memslot = &kvm->memslots[log->slot]; 2171 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2172 memset(memslot->dirty_bitmap, 0, n); 2173 } 2174 r = 0; 2175 out: 2176 up_write(&kvm->slots_lock); 2177 return r; 2178 } 2179 2180 long kvm_arch_vm_ioctl(struct file *filp, 2181 unsigned int ioctl, unsigned long arg) 2182 { 2183 struct kvm *kvm = filp->private_data; 2184 void __user *argp = (void __user *)arg; 2185 int r = -EINVAL; 2186 /* 2187 * This union makes it completely explicit to gcc-3.x 2188 * that these two variables' stack usage should be 2189 * combined, not added together. 2190 */ 2191 union { 2192 struct kvm_pit_state ps; 2193 struct kvm_pit_state2 ps2; 2194 struct kvm_memory_alias alias; 2195 struct kvm_pit_config pit_config; 2196 } u; 2197 2198 switch (ioctl) { 2199 case KVM_SET_TSS_ADDR: 2200 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 2201 if (r < 0) 2202 goto out; 2203 break; 2204 case KVM_SET_IDENTITY_MAP_ADDR: { 2205 u64 ident_addr; 2206 2207 r = -EFAULT; 2208 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 2209 goto out; 2210 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 2211 if (r < 0) 2212 goto out; 2213 break; 2214 } 2215 case KVM_SET_MEMORY_REGION: { 2216 struct kvm_memory_region kvm_mem; 2217 struct kvm_userspace_memory_region kvm_userspace_mem; 2218 2219 r = -EFAULT; 2220 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2221 goto out; 2222 kvm_userspace_mem.slot = kvm_mem.slot; 2223 kvm_userspace_mem.flags = kvm_mem.flags; 2224 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 2225 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 2226 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 2227 if (r) 2228 goto out; 2229 break; 2230 } 2231 case KVM_SET_NR_MMU_PAGES: 2232 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2233 if (r) 2234 goto out; 2235 break; 2236 case KVM_GET_NR_MMU_PAGES: 2237 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2238 break; 2239 case KVM_SET_MEMORY_ALIAS: 2240 r = -EFAULT; 2241 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 2242 goto out; 2243 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 2244 if (r) 2245 goto out; 2246 break; 2247 case KVM_CREATE_IRQCHIP: 2248 r = -ENOMEM; 2249 kvm->arch.vpic = kvm_create_pic(kvm); 2250 if (kvm->arch.vpic) { 2251 r = kvm_ioapic_init(kvm); 2252 if (r) { 2253 kfree(kvm->arch.vpic); 2254 kvm->arch.vpic = NULL; 2255 goto out; 2256 } 2257 } else 2258 goto out; 2259 r = kvm_setup_default_irq_routing(kvm); 2260 if (r) { 2261 kfree(kvm->arch.vpic); 2262 kfree(kvm->arch.vioapic); 2263 goto out; 2264 } 2265 break; 2266 case KVM_CREATE_PIT: 2267 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2268 goto create_pit; 2269 case KVM_CREATE_PIT2: 2270 r = -EFAULT; 2271 if (copy_from_user(&u.pit_config, argp, 2272 sizeof(struct kvm_pit_config))) 2273 goto out; 2274 create_pit: 2275 down_write(&kvm->slots_lock); 2276 r = -EEXIST; 2277 if (kvm->arch.vpit) 2278 goto create_pit_unlock; 2279 r = -ENOMEM; 2280 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 2281 if (kvm->arch.vpit) 2282 r = 0; 2283 create_pit_unlock: 2284 up_write(&kvm->slots_lock); 2285 break; 2286 case KVM_IRQ_LINE_STATUS: 2287 case KVM_IRQ_LINE: { 2288 struct kvm_irq_level irq_event; 2289 2290 r = -EFAULT; 2291 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2292 goto out; 2293 if (irqchip_in_kernel(kvm)) { 2294 __s32 status; 2295 
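/* irq_lock serializes this injection against in-kernel irqchip updates such as KVM_SET_IRQCHIP above. */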
mutex_lock(&kvm->irq_lock); 2296 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2297 irq_event.irq, irq_event.level); 2298 mutex_unlock(&kvm->irq_lock); 2299 if (ioctl == KVM_IRQ_LINE_STATUS) { 2300 irq_event.status = status; 2301 if (copy_to_user(argp, &irq_event, 2302 sizeof irq_event)) 2303 goto out; 2304 } 2305 r = 0; 2306 } 2307 break; 2308 } 2309 case KVM_GET_IRQCHIP: { 2310 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2311 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2312 2313 r = -ENOMEM; 2314 if (!chip) 2315 goto out; 2316 r = -EFAULT; 2317 if (copy_from_user(chip, argp, sizeof *chip)) 2318 goto get_irqchip_out; 2319 r = -ENXIO; 2320 if (!irqchip_in_kernel(kvm)) 2321 goto get_irqchip_out; 2322 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 2323 if (r) 2324 goto get_irqchip_out; 2325 r = -EFAULT; 2326 if (copy_to_user(argp, chip, sizeof *chip)) 2327 goto get_irqchip_out; 2328 r = 0; 2329 get_irqchip_out: 2330 kfree(chip); 2331 if (r) 2332 goto out; 2333 break; 2334 } 2335 case KVM_SET_IRQCHIP: { 2336 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2337 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2338 2339 r = -ENOMEM; 2340 if (!chip) 2341 goto out; 2342 r = -EFAULT; 2343 if (copy_from_user(chip, argp, sizeof *chip)) 2344 goto set_irqchip_out; 2345 r = -ENXIO; 2346 if (!irqchip_in_kernel(kvm)) 2347 goto set_irqchip_out; 2348 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 2349 if (r) 2350 goto set_irqchip_out; 2351 r = 0; 2352 set_irqchip_out: 2353 kfree(chip); 2354 if (r) 2355 goto out; 2356 break; 2357 } 2358 case KVM_GET_PIT: { 2359 r = -EFAULT; 2360 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 2361 goto out; 2362 r = -ENXIO; 2363 if (!kvm->arch.vpit) 2364 goto out; 2365 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 2366 if (r) 2367 goto out; 2368 r = -EFAULT; 2369 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 2370 goto out; 2371 r = 0; 2372 break; 2373 } 2374 case KVM_SET_PIT: { 2375 r = -EFAULT; 2376 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 2377 goto out; 2378 r = -ENXIO; 2379 if (!kvm->arch.vpit) 2380 goto out; 2381 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2382 if (r) 2383 goto out; 2384 r = 0; 2385 break; 2386 } 2387 case KVM_GET_PIT2: { 2388 r = -ENXIO; 2389 if (!kvm->arch.vpit) 2390 goto out; 2391 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 2392 if (r) 2393 goto out; 2394 r = -EFAULT; 2395 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 2396 goto out; 2397 r = 0; 2398 break; 2399 } 2400 case KVM_SET_PIT2: { 2401 r = -EFAULT; 2402 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 2403 goto out; 2404 r = -ENXIO; 2405 if (!kvm->arch.vpit) 2406 goto out; 2407 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 2408 if (r) 2409 goto out; 2410 r = 0; 2411 break; 2412 } 2413 case KVM_REINJECT_CONTROL: { 2414 struct kvm_reinject_control control; 2415 r = -EFAULT; 2416 if (copy_from_user(&control, argp, sizeof(control))) 2417 goto out; 2418 r = kvm_vm_ioctl_reinject(kvm, &control); 2419 if (r) 2420 goto out; 2421 r = 0; 2422 break; 2423 } 2424 default: 2425 ; 2426 } 2427 out: 2428 return r; 2429 } 2430 2431 static void kvm_init_msr_list(void) 2432 { 2433 u32 dummy[2]; 2434 unsigned i, j; 2435 2436 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2437 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2438 continue; 2439 if (j < i) 2440 msrs_to_save[j] = msrs_to_save[i]; 2441 j++; 2442 } 2443 num_msrs_to_save = j; 2444 } 2445 2446 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 2447 const void *v) 2448 { 2449 if 
(vcpu->arch.apic && 2450 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 2451 return 0; 2452 2453 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 2454 } 2455 2456 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 2457 { 2458 if (vcpu->arch.apic && 2459 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 2460 return 0; 2461 2462 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 2463 } 2464 2465 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2466 struct kvm_vcpu *vcpu) 2467 { 2468 void *data = val; 2469 int r = X86EMUL_CONTINUE; 2470 2471 while (bytes) { 2472 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2473 unsigned offset = addr & (PAGE_SIZE-1); 2474 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2475 int ret; 2476 2477 if (gpa == UNMAPPED_GVA) { 2478 r = X86EMUL_PROPAGATE_FAULT; 2479 goto out; 2480 } 2481 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2482 if (ret < 0) { 2483 r = X86EMUL_UNHANDLEABLE; 2484 goto out; 2485 } 2486 2487 bytes -= toread; 2488 data += toread; 2489 addr += toread; 2490 } 2491 out: 2492 return r; 2493 } 2494 2495 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2496 struct kvm_vcpu *vcpu) 2497 { 2498 void *data = val; 2499 int r = X86EMUL_CONTINUE; 2500 2501 while (bytes) { 2502 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2503 unsigned offset = addr & (PAGE_SIZE-1); 2504 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2505 int ret; 2506 2507 if (gpa == UNMAPPED_GVA) { 2508 r = X86EMUL_PROPAGATE_FAULT; 2509 goto out; 2510 } 2511 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2512 if (ret < 0) { 2513 r = X86EMUL_UNHANDLEABLE; 2514 goto out; 2515 } 2516 2517 bytes -= towrite; 2518 data += towrite; 2519 addr += towrite; 2520 } 2521 out: 2522 return r; 2523 } 2524 2525 2526 static int emulator_read_emulated(unsigned long addr, 2527 void *val, 2528 unsigned int bytes, 2529 struct kvm_vcpu *vcpu) 2530 { 2531 gpa_t gpa; 2532 2533 if (vcpu->mmio_read_completed) { 2534 memcpy(val, vcpu->mmio_data, bytes); 2535 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 2536 vcpu->mmio_phys_addr, *(u64 *)val); 2537 vcpu->mmio_read_completed = 0; 2538 return X86EMUL_CONTINUE; 2539 } 2540 2541 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2542 2543 /* For APIC access vmexit */ 2544 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2545 goto mmio; 2546 2547 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2548 == X86EMUL_CONTINUE) 2549 return X86EMUL_CONTINUE; 2550 if (gpa == UNMAPPED_GVA) 2551 return X86EMUL_PROPAGATE_FAULT; 2552 2553 mmio: 2554 /* 2555 * Is this MMIO handled locally? 
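* The in-kernel APIC and mmio bus devices get first chance; an access nobody claims falls through to a userspace MMIO exit.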
2556 */ 2557 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 2558 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 2559 return X86EMUL_CONTINUE; 2560 } 2561 2562 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 2563 2564 vcpu->mmio_needed = 1; 2565 vcpu->mmio_phys_addr = gpa; 2566 vcpu->mmio_size = bytes; 2567 vcpu->mmio_is_write = 0; 2568 2569 return X86EMUL_UNHANDLEABLE; 2570 } 2571 2572 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2573 const void *val, int bytes) 2574 { 2575 int ret; 2576 2577 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2578 if (ret < 0) 2579 return 0; 2580 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2581 return 1; 2582 } 2583 2584 static int emulator_write_emulated_onepage(unsigned long addr, 2585 const void *val, 2586 unsigned int bytes, 2587 struct kvm_vcpu *vcpu) 2588 { 2589 gpa_t gpa; 2590 2591 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2592 2593 if (gpa == UNMAPPED_GVA) { 2594 kvm_inject_page_fault(vcpu, addr, 2); 2595 return X86EMUL_PROPAGATE_FAULT; 2596 } 2597 2598 /* For APIC access vmexit */ 2599 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2600 goto mmio; 2601 2602 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2603 return X86EMUL_CONTINUE; 2604 2605 mmio: 2606 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 2607 /* 2608 * Is this MMIO handled locally? 2609 */ 2610 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 2611 return X86EMUL_CONTINUE; 2612 2613 vcpu->mmio_needed = 1; 2614 vcpu->mmio_phys_addr = gpa; 2615 vcpu->mmio_size = bytes; 2616 vcpu->mmio_is_write = 1; 2617 memcpy(vcpu->mmio_data, val, bytes); 2618 2619 return X86EMUL_CONTINUE; 2620 } 2621 2622 int emulator_write_emulated(unsigned long addr, 2623 const void *val, 2624 unsigned int bytes, 2625 struct kvm_vcpu *vcpu) 2626 { 2627 /* Crossing a page boundary? 
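If so, emulate the chunk that fits in the first page, then retry with the remainder.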
*/ 2628 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2629 int rc, now; 2630 2631 now = -addr & ~PAGE_MASK; 2632 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2633 if (rc != X86EMUL_CONTINUE) 2634 return rc; 2635 addr += now; 2636 val += now; 2637 bytes -= now; 2638 } 2639 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2640 } 2641 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2642 2643 static int emulator_cmpxchg_emulated(unsigned long addr, 2644 const void *old, 2645 const void *new, 2646 unsigned int bytes, 2647 struct kvm_vcpu *vcpu) 2648 { 2649 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 2650 #ifndef CONFIG_X86_64 2651 /* a guest's cmpxchg8b has to be emulated atomically */ 2652 if (bytes == 8) { 2653 gpa_t gpa; 2654 struct page *page; 2655 char *kaddr; 2656 u64 val; 2657 2658 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2659 2660 if (gpa == UNMAPPED_GVA || 2661 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2662 goto emul_write; 2663 2664 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2665 goto emul_write; 2666 2667 val = *(u64 *)new; 2668 2669 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2670 2671 kaddr = kmap_atomic(page, KM_USER0); 2672 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2673 kunmap_atomic(kaddr, KM_USER0); 2674 kvm_release_page_dirty(page); 2675 } 2676 emul_write: 2677 #endif 2678 2679 return emulator_write_emulated(addr, new, bytes, vcpu); 2680 } 2681 2682 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2683 { 2684 return kvm_x86_ops->get_segment_base(vcpu, seg); 2685 } 2686 2687 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2688 { 2689 kvm_mmu_invlpg(vcpu, address); 2690 return X86EMUL_CONTINUE; 2691 } 2692 2693 int emulate_clts(struct kvm_vcpu *vcpu) 2694 { 2695 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2696 return X86EMUL_CONTINUE; 2697 } 2698 2699 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2700 { 2701 struct kvm_vcpu *vcpu = ctxt->vcpu; 2702 2703 switch (dr) { 2704 case 0 ... 3: 2705 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2706 return X86EMUL_CONTINUE; 2707 default: 2708 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2709 return X86EMUL_UNHANDLEABLE; 2710 } 2711 } 2712 2713 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2714 { 2715 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ?
~0ULL : ~0U; 2716 int exception; 2717 2718 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 2719 if (exception) { 2720 /* FIXME: better handling */ 2721 return X86EMUL_UNHANDLEABLE; 2722 } 2723 return X86EMUL_CONTINUE; 2724 } 2725 2726 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2727 { 2728 u8 opcodes[4]; 2729 unsigned long rip = kvm_rip_read(vcpu); 2730 unsigned long rip_linear; 2731 2732 if (!printk_ratelimit()) 2733 return; 2734 2735 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2736 2737 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 2738 2739 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2740 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2741 } 2742 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2743 2744 static struct x86_emulate_ops emulate_ops = { 2745 .read_std = kvm_read_guest_virt, 2746 .read_emulated = emulator_read_emulated, 2747 .write_emulated = emulator_write_emulated, 2748 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2749 }; 2750 /* Reading RAX, RSP and RIP pulls every hardware-cached register into vcpu->arch.regs; marking them all dirty forces a write-back, since the emulator accesses ->regs directly (see the TODO in emulate_instruction()). */ 2751 static void cache_all_regs(struct kvm_vcpu *vcpu) 2752 { 2753 kvm_register_read(vcpu, VCPU_REGS_RAX); 2754 kvm_register_read(vcpu, VCPU_REGS_RSP); 2755 kvm_register_read(vcpu, VCPU_REGS_RIP); 2756 vcpu->arch.regs_dirty = ~0; 2757 } 2758 2759 int emulate_instruction(struct kvm_vcpu *vcpu, 2760 struct kvm_run *run, 2761 unsigned long cr2, 2762 u16 error_code, 2763 int emulation_type) 2764 { 2765 int r, shadow_mask; 2766 struct decode_cache *c; 2767 2768 kvm_clear_exception_queue(vcpu); 2769 vcpu->arch.mmio_fault_cr2 = cr2; 2770 /* 2771 * TODO: fix emulate.c to use guest_read/write_register 2772 * instead of direct ->regs accesses; can save hundreds of cycles 2773 * on Intel for instructions that don't read/change RSP, 2774 * for example. 2775 */ 2776 cache_all_regs(vcpu); 2777 2778 vcpu->mmio_is_write = 0; 2779 vcpu->arch.pio.string = 0; 2780 2781 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 2782 int cs_db, cs_l; 2783 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 2784 2785 vcpu->arch.emulate_ctxt.vcpu = vcpu; 2786 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 2787 vcpu->arch.emulate_ctxt.mode = 2788 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 2789 ? X86EMUL_MODE_REAL : cs_l 2790 ? X86EMUL_MODE_PROT64 : cs_db 2791 ?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2792 2793 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2794 2795 /* Only allow emulation of specific instructions on #UD 2796 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 2797 c = &vcpu->arch.emulate_ctxt.decode; 2798 if (emulation_type & EMULTYPE_TRAP_UD) { 2799 if (!c->twobyte) 2800 return EMULATE_FAIL; 2801 switch (c->b) { 2802 case 0x01: /* VMMCALL */ 2803 if (c->modrm_mod != 3 || c->modrm_rm != 1) 2804 return EMULATE_FAIL; 2805 break; 2806 case 0x34: /* sysenter */ 2807 case 0x35: /* sysexit */ 2808 if (c->modrm_mod != 0 || c->modrm_rm != 0) 2809 return EMULATE_FAIL; 2810 break; 2811 case 0x05: /* syscall */ 2812 if (c->modrm_mod != 0 || c->modrm_rm != 0) 2813 return EMULATE_FAIL; 2814 break; 2815 default: 2816 return EMULATE_FAIL; 2817 } 2818 2819 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 2820 return EMULATE_FAIL; 2821 } 2822 2823 ++vcpu->stat.insn_emulation; 2824 if (r) { 2825 ++vcpu->stat.insn_emulation_fail; 2826 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2827 return EMULATE_DONE; 2828 return EMULATE_FAIL; 2829 } 2830 } 2831 2832 if (emulation_type & EMULTYPE_SKIP) { 2833 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 2834 return EMULATE_DONE; 2835 } 2836 2837 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2838 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 2839 2840 if (r == 0) 2841 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 2842 2843 if (vcpu->arch.pio.string) 2844 return EMULATE_DO_MMIO; 2845 2846 if ((r || vcpu->mmio_is_write) && run) { 2847 run->exit_reason = KVM_EXIT_MMIO; 2848 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2849 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2850 run->mmio.len = vcpu->mmio_size; 2851 run->mmio.is_write = vcpu->mmio_is_write; 2852 } 2853 2854 if (r) { 2855 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2856 return EMULATE_DONE; 2857 if (!vcpu->mmio_needed) { 2858 kvm_report_emulation_failure(vcpu, "mmio"); 2859 return EMULATE_FAIL; 2860 } 2861 return EMULATE_DO_MMIO; 2862 } 2863 2864 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2865 2866 if (vcpu->mmio_is_write) { 2867 vcpu->mmio_needed = 0; 2868 return EMULATE_DO_MMIO; 2869 } 2870 2871 return EMULATE_DONE; 2872 } 2873 EXPORT_SYMBOL_GPL(emulate_instruction); 2874 2875 static int pio_copy_data(struct kvm_vcpu *vcpu) 2876 { 2877 void *p = vcpu->arch.pio_data; 2878 gva_t q = vcpu->arch.pio.guest_gva; 2879 unsigned bytes; 2880 int ret; 2881 2882 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2883 if (vcpu->arch.pio.in) 2884 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2885 else 2886 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2887 return ret; 2888 } 2889 2890 int complete_pio(struct kvm_vcpu *vcpu) 2891 { 2892 struct kvm_pio_request *io = &vcpu->arch.pio; 2893 long delta; 2894 int r; 2895 unsigned long val; 2896 2897 if (!io->string) { 2898 if (io->in) { 2899 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2900 memcpy(&val, vcpu->arch.pio_data, io->size); 2901 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2902 } 2903 } else { 2904 if (io->in) { 2905 r = pio_copy_data(vcpu); 2906 if (r) 2907 return r; 2908 } 2909 2910 delta = 1; 2911 if (io->rep) { 2912 delta *= io->cur_count; 2913 /* 2914 * The size of the register should really depend on 2915 * current address size. 
2916 */ 2917 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2918 val -= delta; 2919 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2920 } 2921 if (io->down) 2922 delta = -delta; 2923 delta *= io->size; 2924 if (io->in) { 2925 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2926 val += delta; 2927 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2928 } else { 2929 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2930 val += delta; 2931 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2932 } 2933 } 2934 2935 io->count -= io->cur_count; 2936 io->cur_count = 0; 2937 2938 return 0; 2939 } 2940 2941 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 2942 { 2943 /* TODO: String I/O for in kernel device */ 2944 int r; 2945 2946 if (vcpu->arch.pio.in) 2947 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 2948 vcpu->arch.pio.size, pd); 2949 else 2950 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 2951 vcpu->arch.pio.size, pd); 2952 return r; 2953 } 2954 2955 static int pio_string_write(struct kvm_vcpu *vcpu) 2956 { 2957 struct kvm_pio_request *io = &vcpu->arch.pio; 2958 void *pd = vcpu->arch.pio_data; 2959 int i, r = 0; 2960 2961 for (i = 0; i < io->cur_count; i++) { 2962 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 2963 io->port, io->size, pd)) { 2964 r = -EOPNOTSUPP; 2965 break; 2966 } 2967 pd += io->size; 2968 } 2969 return r; 2970 } 2971 2972 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2973 int size, unsigned port) 2974 { 2975 unsigned long val; 2976 2977 vcpu->run->exit_reason = KVM_EXIT_IO; 2978 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2979 vcpu->run->io.size = vcpu->arch.pio.size = size; 2980 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2981 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2982 vcpu->run->io.port = vcpu->arch.pio.port = port; 2983 vcpu->arch.pio.in = in; 2984 vcpu->arch.pio.string = 0; 2985 vcpu->arch.pio.down = 0; 2986 vcpu->arch.pio.rep = 0; 2987 2988 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 2989 size, 1); 2990 2991 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2992 memcpy(vcpu->arch.pio_data, &val, 4); 2993 2994 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 2995 complete_pio(vcpu); 2996 return 1; 2997 } 2998 return 0; 2999 } 3000 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3001 3002 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3003 int size, unsigned long count, int down, 3004 gva_t address, int rep, unsigned port) 3005 { 3006 unsigned now, in_page; 3007 int ret = 0; 3008 3009 vcpu->run->exit_reason = KVM_EXIT_IO; 3010 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3011 vcpu->run->io.size = vcpu->arch.pio.size = size; 3012 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3013 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 3014 vcpu->run->io.port = vcpu->arch.pio.port = port; 3015 vcpu->arch.pio.in = in; 3016 vcpu->arch.pio.string = 1; 3017 vcpu->arch.pio.down = down; 3018 vcpu->arch.pio.rep = rep; 3019 3020 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3021 size, count); 3022 3023 if (!count) { 3024 kvm_x86_ops->skip_emulated_instruction(vcpu); 3025 return 1; 3026 } 3027 3028 if (!down) 3029 in_page = PAGE_SIZE - offset_in_page(address); 3030 else 3031 in_page = offset_in_page(address) + size; 3032 now = min(count, (unsigned long)in_page / size); 3033 if (!now) 3034 now = 1; 3035 if (down) { 3036 /* 3037 * String I/O in reverse. Yuck. 
Kill the guest, fix later. 3038 */ 3039 pr_unimpl(vcpu, "guest string pio down\n"); 3040 kvm_inject_gp(vcpu, 0); 3041 return 1; 3042 } 3043 vcpu->run->io.count = now; 3044 vcpu->arch.pio.cur_count = now; 3045 3046 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 3047 kvm_x86_ops->skip_emulated_instruction(vcpu); 3048 3049 vcpu->arch.pio.guest_gva = address; 3050 3051 if (!vcpu->arch.pio.in) { 3052 /* string PIO write */ 3053 ret = pio_copy_data(vcpu); 3054 if (ret == X86EMUL_PROPAGATE_FAULT) { 3055 kvm_inject_gp(vcpu, 0); 3056 return 1; 3057 } 3058 if (ret == 0 && !pio_string_write(vcpu)) { 3059 complete_pio(vcpu); 3060 if (vcpu->arch.pio.count == 0) 3061 ret = 1; 3062 } 3063 } 3064 /* no string PIO read support yet */ 3065 3066 return ret; 3067 } 3068 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3069 3070 static void bounce_off(void *info) 3071 { 3072 /* nothing */ 3073 } 3074 3075 static unsigned int ref_freq; 3076 static unsigned long tsc_khz_ref; 3077 3078 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3079 void *data) 3080 { 3081 struct cpufreq_freqs *freq = data; 3082 struct kvm *kvm; 3083 struct kvm_vcpu *vcpu; 3084 int i, send_ipi = 0; 3085 3086 if (!ref_freq) 3087 ref_freq = freq->old; 3088 3089 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3090 return 0; 3091 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3092 return 0; 3093 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3094 3095 spin_lock(&kvm_lock); 3096 list_for_each_entry(kvm, &vm_list, vm_list) { 3097 kvm_for_each_vcpu(i, vcpu, kvm) { 3098 if (vcpu->cpu != freq->cpu) 3099 continue; 3100 if (!kvm_request_guest_time_update(vcpu)) 3101 continue; 3102 if (vcpu->cpu != smp_processor_id()) 3103 send_ipi++; 3104 } 3105 } 3106 spin_unlock(&kvm_lock); 3107 3108 if (freq->old < freq->new && send_ipi) { 3109 /* 3110 * We are scaling the frequency up. We must make sure the guest 3111 * doesn't see old kvmclock values while running with 3112 * the new frequency; otherwise we risk the guest seeing 3113 * time go backwards. 3114 * 3115 * If we are updating the frequency for another cpu 3116 * (which might be in guest context), send an interrupt 3117 * to kick that cpu out of guest context. Next time 3118 * guest context is entered kvmclock will be updated, 3119 * so the guest will not see stale values.
3120 */ 3121 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 3122 } 3123 return 0; 3124 } 3125 3126 static struct notifier_block kvmclock_cpufreq_notifier_block = { 3127 .notifier_call = kvmclock_cpufreq_notifier 3128 }; 3129 3130 int kvm_arch_init(void *opaque) 3131 { 3132 int r, cpu; 3133 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3134 3135 if (kvm_x86_ops) { 3136 printk(KERN_ERR "kvm: already loaded the other module\n"); 3137 r = -EEXIST; 3138 goto out; 3139 } 3140 3141 if (!ops->cpu_has_kvm_support()) { 3142 printk(KERN_ERR "kvm: no hardware support\n"); 3143 r = -EOPNOTSUPP; 3144 goto out; 3145 } 3146 if (ops->disabled_by_bios()) { 3147 printk(KERN_ERR "kvm: disabled by bios\n"); 3148 r = -EOPNOTSUPP; 3149 goto out; 3150 } 3151 3152 r = kvm_mmu_module_init(); 3153 if (r) 3154 goto out; 3155 3156 kvm_init_msr_list(); 3157 3158 kvm_x86_ops = ops; 3159 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 3160 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 3161 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3162 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3163 3164 for_each_possible_cpu(cpu) 3165 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 3166 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 3167 tsc_khz_ref = tsc_khz; 3168 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 3169 CPUFREQ_TRANSITION_NOTIFIER); 3170 } 3171 3172 return 0; 3173 3174 out: 3175 return r; 3176 } 3177 3178 void kvm_arch_exit(void) 3179 { 3180 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 3181 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 3182 CPUFREQ_TRANSITION_NOTIFIER); 3183 kvm_x86_ops = NULL; 3184 kvm_mmu_module_exit(); 3185 } 3186 3187 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3188 { 3189 ++vcpu->stat.halt_exits; 3190 if (irqchip_in_kernel(vcpu->kvm)) { 3191 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3192 return 1; 3193 } else { 3194 vcpu->run->exit_reason = KVM_EXIT_HLT; 3195 return 0; 3196 } 3197 } 3198 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 3199 3200 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 3201 unsigned long a1) 3202 { 3203 if (is_long_mode(vcpu)) 3204 return a0; 3205 else 3206 return a0 | ((gpa_t)a1 << 32); 3207 } 3208 3209 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3210 { 3211 unsigned long nr, a0, a1, a2, a3, ret; 3212 int r = 1; 3213 3214 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3215 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3216 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3217 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3218 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3219 3220 trace_kvm_hypercall(nr, a0, a1, a2, a3); 3221 3222 if (!is_long_mode(vcpu)) { 3223 nr &= 0xFFFFFFFF; 3224 a0 &= 0xFFFFFFFF; 3225 a1 &= 0xFFFFFFFF; 3226 a2 &= 0xFFFFFFFF; 3227 a3 &= 0xFFFFFFFF; 3228 } 3229 3230 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 3231 ret = -KVM_EPERM; 3232 goto out; 3233 } 3234 3235 switch (nr) { 3236 case KVM_HC_VAPIC_POLL_IRQ: 3237 ret = 0; 3238 break; 3239 case KVM_HC_MMU_OP: 3240 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 3241 break; 3242 default: 3243 ret = -KVM_ENOSYS; 3244 break; 3245 } 3246 out: 3247 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3248 ++vcpu->stat.hypercalls; 3249 return r; 3250 } 3251 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 3252 3253 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3254 { 3255 char instruction[3]; 3256 int ret = 0; 3257 unsigned long rip = kvm_rip_read(vcpu); 3258 3259 3260 /* 3261 * Blow out the MMU to ensure that no other VCPU has an active mapping 3262 * to ensure that the updated hypercall 
appears atomically across all 3263 * VCPUs. 3264 */ 3265 kvm_mmu_zap_all(vcpu->kvm); 3266 3267 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3268 if (emulator_write_emulated(rip, instruction, 3, vcpu) 3269 != X86EMUL_CONTINUE) 3270 ret = -EFAULT; 3271 3272 return ret; 3273 } 3274 3275 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3276 { 3277 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 3278 } 3279 3280 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3281 { 3282 struct descriptor_table dt = { limit, base }; 3283 3284 kvm_x86_ops->set_gdt(vcpu, &dt); 3285 } 3286 3287 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3288 { 3289 struct descriptor_table dt = { limit, base }; 3290 3291 kvm_x86_ops->set_idt(vcpu, &dt); 3292 } 3293 3294 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 3295 unsigned long *rflags) 3296 { 3297 kvm_lmsw(vcpu, msw); 3298 *rflags = kvm_x86_ops->get_rflags(vcpu); 3299 } 3300 3301 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3302 { 3303 unsigned long value; 3304 3305 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3306 switch (cr) { 3307 case 0: 3308 value = vcpu->arch.cr0; 3309 break; 3310 case 2: 3311 value = vcpu->arch.cr2; 3312 break; 3313 case 3: 3314 value = vcpu->arch.cr3; 3315 break; 3316 case 4: 3317 value = vcpu->arch.cr4; 3318 break; 3319 case 8: 3320 value = kvm_get_cr8(vcpu); 3321 break; 3322 default: 3323 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3324 return 0; 3325 } 3326 3327 return value; 3328 } 3329 3330 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3331 unsigned long *rflags) 3332 { 3333 switch (cr) { 3334 case 0: 3335 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3336 *rflags = kvm_x86_ops->get_rflags(vcpu); 3337 break; 3338 case 2: 3339 vcpu->arch.cr2 = val; 3340 break; 3341 case 3: 3342 kvm_set_cr3(vcpu, val); 3343 break; 3344 case 4: 3345 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 3346 break; 3347 case 8: 3348 kvm_set_cr8(vcpu, val & 0xfUL); 3349 break; 3350 default: 3351 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3352 } 3353 } 3354 3355 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 3356 { 3357 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 3358 int j, nent = vcpu->arch.cpuid_nent; 3359 3360 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 3361 /* when no next entry is found, the current entry[i] is reselected */ 3362 for (j = i + 1; ; j = (j + 1) % nent) { 3363 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 3364 if (ej->function == e->function) { 3365 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 3366 return j; 3367 } 3368 } 3369 return 0; /* silence gcc, even though control never reaches here */ 3370 } 3371 3372 /* find an entry with matching function, matching index (if needed), and that 3373 * should be read next (if it's stateful) */ 3374 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3375 u32 function, u32 index) 3376 { 3377 if (e->function != function) 3378 return 0; 3379 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3380 return 0; 3381 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3382 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3383 return 0; 3384 return 1; 3385 } 3386 3387 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3388 u32 function, u32 index) 3389 { 3390 int i; 3391 struct kvm_cpuid_entry2 *best = NULL; 3392 3393 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3394 struct 
kvm_cpuid_entry2 *e; 3395 3396 e = &vcpu->arch.cpuid_entries[i]; 3397 if (is_matching_cpuid_entry(e, function, index)) { 3398 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 3399 move_to_next_stateful_cpuid_entry(vcpu, i); 3400 best = e; 3401 break; 3402 } 3403 /* 3404 * Both basic or both extended? 3405 */ 3406 if (((e->function ^ function) & 0x80000000) == 0) 3407 if (!best || e->function > best->function) 3408 best = e; 3409 } 3410 return best; 3411 } 3412 3413 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3414 { 3415 struct kvm_cpuid_entry2 *best; 3416 3417 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3418 if (best) 3419 return best->eax & 0xff; 3420 return 36; 3421 } 3422 3423 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3424 { 3425 u32 function, index; 3426 struct kvm_cpuid_entry2 *best; 3427 3428 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3429 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3430 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3431 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3432 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3433 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3434 best = kvm_find_cpuid_entry(vcpu, function, index); 3435 if (best) { 3436 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3437 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3438 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3439 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3440 } 3441 kvm_x86_ops->skip_emulated_instruction(vcpu); 3442 trace_kvm_cpuid(function, 3443 kvm_register_read(vcpu, VCPU_REGS_RAX), 3444 kvm_register_read(vcpu, VCPU_REGS_RBX), 3445 kvm_register_read(vcpu, VCPU_REGS_RCX), 3446 kvm_register_read(vcpu, VCPU_REGS_RDX)); 3447 } 3448 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3449 3450 /* 3451 * Check if userspace requested an interrupt window, and that the 3452 * interrupt window is open. 3453 * 3454 * No need to exit to userspace if we already have an interrupt queued. 
3455 */ 3456 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3457 struct kvm_run *kvm_run) 3458 { 3459 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3460 kvm_run->request_interrupt_window && 3461 kvm_arch_interrupt_allowed(vcpu)); 3462 } 3463 3464 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3465 struct kvm_run *kvm_run) 3466 { 3467 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3468 kvm_run->cr8 = kvm_get_cr8(vcpu); 3469 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3470 if (irqchip_in_kernel(vcpu->kvm)) 3471 kvm_run->ready_for_interrupt_injection = 1; 3472 else 3473 kvm_run->ready_for_interrupt_injection = 3474 kvm_arch_interrupt_allowed(vcpu) && 3475 !kvm_cpu_has_interrupt(vcpu) && 3476 !kvm_event_needs_reinjection(vcpu); 3477 } 3478 3479 static void vapic_enter(struct kvm_vcpu *vcpu) 3480 { 3481 struct kvm_lapic *apic = vcpu->arch.apic; 3482 struct page *page; 3483 3484 if (!apic || !apic->vapic_addr) 3485 return; 3486 3487 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3488 3489 vcpu->arch.apic->vapic_page = page; 3490 } 3491 3492 static void vapic_exit(struct kvm_vcpu *vcpu) 3493 { 3494 struct kvm_lapic *apic = vcpu->arch.apic; 3495 3496 if (!apic || !apic->vapic_addr) 3497 return; 3498 3499 down_read(&vcpu->kvm->slots_lock); 3500 kvm_release_page_dirty(apic->vapic_page); 3501 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3502 up_read(&vcpu->kvm->slots_lock); 3503 } 3504 3505 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3506 { 3507 int max_irr, tpr; 3508 3509 if (!kvm_x86_ops->update_cr8_intercept) 3510 return; 3511 3512 if (!vcpu->arch.apic) 3513 return; 3514 3515 if (!vcpu->arch.apic->vapic_addr) 3516 max_irr = kvm_lapic_find_highest_irr(vcpu); 3517 else 3518 max_irr = -1; 3519 3520 if (max_irr != -1) 3521 max_irr >>= 4; 3522 3523 tpr = kvm_lapic_get_cr8(vcpu); 3524 3525 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3526 } 3527 3528 static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3529 { 3530 /* try to reinject previous events if any */ 3531 if (vcpu->arch.exception.pending) { 3532 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 3533 vcpu->arch.exception.has_error_code, 3534 vcpu->arch.exception.error_code); 3535 return; 3536 } 3537 3538 if (vcpu->arch.nmi_injected) { 3539 kvm_x86_ops->set_nmi(vcpu); 3540 return; 3541 } 3542 3543 if (vcpu->arch.interrupt.pending) { 3544 kvm_x86_ops->set_irq(vcpu); 3545 return; 3546 } 3547 3548 /* try to inject new event if pending */ 3549 if (vcpu->arch.nmi_pending) { 3550 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3551 vcpu->arch.nmi_pending = false; 3552 vcpu->arch.nmi_injected = true; 3553 kvm_x86_ops->set_nmi(vcpu); 3554 } 3555 } else if (kvm_cpu_has_interrupt(vcpu)) { 3556 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3557 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3558 false); 3559 kvm_x86_ops->set_irq(vcpu); 3560 } 3561 } 3562 } 3563 3564 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3565 { 3566 int r; 3567 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3568 kvm_run->request_interrupt_window; 3569 3570 if (vcpu->requests) 3571 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3572 kvm_mmu_unload(vcpu); 3573 3574 r = kvm_mmu_reload(vcpu); 3575 if (unlikely(r)) 3576 goto out; 3577 3578 if (vcpu->requests) { 3579 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3580 __kvm_migrate_timers(vcpu); 3581 if 
(test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3582 kvm_write_guest_time(vcpu); 3583 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3584 kvm_mmu_sync_roots(vcpu); 3585 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3586 kvm_x86_ops->tlb_flush(vcpu); 3587 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3588 &vcpu->requests)) { 3589 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3590 r = 0; 3591 goto out; 3592 } 3593 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3594 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3595 r = 0; 3596 goto out; 3597 } 3598 } 3599 3600 preempt_disable(); 3601 3602 kvm_x86_ops->prepare_guest_switch(vcpu); 3603 kvm_load_guest_fpu(vcpu); 3604 3605 local_irq_disable(); 3606 3607 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3608 smp_mb__after_clear_bit(); 3609 3610 if (vcpu->requests || need_resched() || signal_pending(current)) { 3611 set_bit(KVM_REQ_KICK, &vcpu->requests); 3612 local_irq_enable(); 3613 preempt_enable(); 3614 r = 1; 3615 goto out; 3616 } 3617 3618 inject_pending_event(vcpu, kvm_run); 3619 3620 /* enable NMI/IRQ window open exits if needed */ 3621 if (vcpu->arch.nmi_pending) 3622 kvm_x86_ops->enable_nmi_window(vcpu); 3623 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3624 kvm_x86_ops->enable_irq_window(vcpu); 3625 3626 if (kvm_lapic_enabled(vcpu)) { 3627 update_cr8_intercept(vcpu); 3628 kvm_lapic_sync_to_vapic(vcpu); 3629 } 3630 3631 up_read(&vcpu->kvm->slots_lock); 3632 3633 kvm_guest_enter(); 3634 3635 if (unlikely(vcpu->arch.switch_db_regs)) { 3636 set_debugreg(0, 7); 3637 set_debugreg(vcpu->arch.eff_db[0], 0); 3638 set_debugreg(vcpu->arch.eff_db[1], 1); 3639 set_debugreg(vcpu->arch.eff_db[2], 2); 3640 set_debugreg(vcpu->arch.eff_db[3], 3); 3641 } 3642 3643 trace_kvm_entry(vcpu->vcpu_id); 3644 kvm_x86_ops->run(vcpu, kvm_run); 3645 3646 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 3647 set_debugreg(current->thread.debugreg0, 0); 3648 set_debugreg(current->thread.debugreg1, 1); 3649 set_debugreg(current->thread.debugreg2, 2); 3650 set_debugreg(current->thread.debugreg3, 3); 3651 set_debugreg(current->thread.debugreg6, 6); 3652 set_debugreg(current->thread.debugreg7, 7); 3653 } 3654 3655 set_bit(KVM_REQ_KICK, &vcpu->requests); 3656 local_irq_enable(); 3657 3658 ++vcpu->stat.exits; 3659 3660 /* 3661 * We must have an instruction between local_irq_enable() and 3662 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3663 * the interrupt shadow. The stat.exits increment will do nicely. 
3664 * But we need to prevent reordering, hence this barrier(): 3665 */ 3666 barrier(); 3667 3668 kvm_guest_exit(); 3669 3670 preempt_enable(); 3671 3672 down_read(&vcpu->kvm->slots_lock); 3673 3674 /* 3675 * Profile KVM exit RIPs: 3676 */ 3677 if (unlikely(prof_on == KVM_PROFILING)) { 3678 unsigned long rip = kvm_rip_read(vcpu); 3679 profile_hit(KVM_PROFILING, (void *)rip); 3680 } 3681 3682 3683 kvm_lapic_sync_from_vapic(vcpu); 3684 3685 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3686 out: 3687 return r; 3688 } 3689 3690 3691 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3692 { 3693 int r; 3694 3695 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3696 pr_debug("vcpu %d received sipi with vector # %x\n", 3697 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3698 kvm_lapic_reset(vcpu); 3699 r = kvm_arch_vcpu_reset(vcpu); 3700 if (r) 3701 return r; 3702 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3703 } 3704 3705 down_read(&vcpu->kvm->slots_lock); 3706 vapic_enter(vcpu); 3707 3708 r = 1; 3709 while (r > 0) { 3710 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3711 r = vcpu_enter_guest(vcpu, kvm_run); 3712 else { 3713 up_read(&vcpu->kvm->slots_lock); 3714 kvm_vcpu_block(vcpu); 3715 down_read(&vcpu->kvm->slots_lock); 3716 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3717 { 3718 switch(vcpu->arch.mp_state) { 3719 case KVM_MP_STATE_HALTED: 3720 vcpu->arch.mp_state = 3721 KVM_MP_STATE_RUNNABLE; 3722 case KVM_MP_STATE_RUNNABLE: 3723 break; 3724 case KVM_MP_STATE_SIPI_RECEIVED: 3725 default: 3726 r = -EINTR; 3727 break; 3728 } 3729 } 3730 } 3731 3732 if (r <= 0) 3733 break; 3734 3735 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3736 if (kvm_cpu_has_pending_timer(vcpu)) 3737 kvm_inject_pending_timer_irqs(vcpu); 3738 3739 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3740 r = -EINTR; 3741 kvm_run->exit_reason = KVM_EXIT_INTR; 3742 ++vcpu->stat.request_irq_exits; 3743 } 3744 if (signal_pending(current)) { 3745 r = -EINTR; 3746 kvm_run->exit_reason = KVM_EXIT_INTR; 3747 ++vcpu->stat.signal_exits; 3748 } 3749 if (need_resched()) { 3750 up_read(&vcpu->kvm->slots_lock); 3751 kvm_resched(vcpu); 3752 down_read(&vcpu->kvm->slots_lock); 3753 } 3754 } 3755 3756 up_read(&vcpu->kvm->slots_lock); 3757 post_kvm_run_save(vcpu, kvm_run); 3758 3759 vapic_exit(vcpu); 3760 3761 return r; 3762 } 3763 3764 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3765 { 3766 int r; 3767 sigset_t sigsaved; 3768 3769 vcpu_load(vcpu); 3770 3771 if (vcpu->sigset_active) 3772 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3773 3774 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3775 kvm_vcpu_block(vcpu); 3776 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3777 r = -EAGAIN; 3778 goto out; 3779 } 3780 3781 /* re-sync apic's tpr */ 3782 if (!irqchip_in_kernel(vcpu->kvm)) 3783 kvm_set_cr8(vcpu, kvm_run->cr8); 3784 3785 if (vcpu->arch.pio.cur_count) { 3786 r = complete_pio(vcpu); 3787 if (r) 3788 goto out; 3789 } 3790 #if CONFIG_HAS_IOMEM 3791 if (vcpu->mmio_needed) { 3792 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3793 vcpu->mmio_read_completed = 1; 3794 vcpu->mmio_needed = 0; 3795 3796 down_read(&vcpu->kvm->slots_lock); 3797 r = emulate_instruction(vcpu, kvm_run, 3798 vcpu->arch.mmio_fault_cr2, 0, 3799 EMULTYPE_NO_DECODE); 3800 up_read(&vcpu->kvm->slots_lock); 3801 if (r == EMULATE_DO_MMIO) { 3802 /* 3803 * Read-modify-write. Back to userspace. 
3804 */ 3805 r = 0; 3806 goto out; 3807 } 3808 } 3809 #endif 3810 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3811 kvm_register_write(vcpu, VCPU_REGS_RAX, 3812 kvm_run->hypercall.ret); 3813 3814 r = __vcpu_run(vcpu, kvm_run); 3815 3816 out: 3817 if (vcpu->sigset_active) 3818 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3819 3820 vcpu_put(vcpu); 3821 return r; 3822 } 3823 3824 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3825 { 3826 vcpu_load(vcpu); 3827 3828 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3829 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3830 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3831 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3832 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3833 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3834 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3835 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3836 #ifdef CONFIG_X86_64 3837 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3838 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3839 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3840 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3841 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3842 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3843 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3844 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3845 #endif 3846 3847 regs->rip = kvm_rip_read(vcpu); 3848 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3849 3850 /* 3851 * Don't leak debug flags in case they were set for guest debugging 3852 */ 3853 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3854 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3855 3856 vcpu_put(vcpu); 3857 3858 return 0; 3859 } 3860 3861 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3862 { 3863 vcpu_load(vcpu); 3864 3865 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3866 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3867 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3868 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3869 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3870 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3871 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3872 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3873 #ifdef CONFIG_X86_64 3874 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3875 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3876 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3877 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3878 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3879 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3880 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3881 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3882 3883 #endif 3884 3885 kvm_rip_write(vcpu, regs->rip); 3886 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3887 3888 3889 vcpu->arch.exception.pending = false; 3890 3891 vcpu_put(vcpu); 3892 3893 return 0; 3894 } 3895 3896 void kvm_get_segment(struct kvm_vcpu *vcpu, 3897 struct kvm_segment *var, int seg) 3898 { 3899 kvm_x86_ops->get_segment(vcpu, var, seg); 3900 } 3901 3902 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3903 { 3904 struct kvm_segment cs; 3905 3906 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3907 *db = cs.db; 3908 *l = cs.l; 3909 } 3910 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3911 3912 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3913 struct kvm_sregs 
*sregs) 3914 { 3915 struct descriptor_table dt; 3916 3917 vcpu_load(vcpu); 3918 3919 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3920 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3921 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3922 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3923 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3924 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3925 3926 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3927 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3928 3929 kvm_x86_ops->get_idt(vcpu, &dt); 3930 sregs->idt.limit = dt.limit; 3931 sregs->idt.base = dt.base; 3932 kvm_x86_ops->get_gdt(vcpu, &dt); 3933 sregs->gdt.limit = dt.limit; 3934 sregs->gdt.base = dt.base; 3935 3936 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3937 sregs->cr0 = vcpu->arch.cr0; 3938 sregs->cr2 = vcpu->arch.cr2; 3939 sregs->cr3 = vcpu->arch.cr3; 3940 sregs->cr4 = vcpu->arch.cr4; 3941 sregs->cr8 = kvm_get_cr8(vcpu); 3942 sregs->efer = vcpu->arch.shadow_efer; 3943 sregs->apic_base = kvm_get_apic_base(vcpu); 3944 3945 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 3946 3947 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 3948 set_bit(vcpu->arch.interrupt.nr, 3949 (unsigned long *)sregs->interrupt_bitmap); 3950 3951 vcpu_put(vcpu); 3952 3953 return 0; 3954 } 3955 3956 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3957 struct kvm_mp_state *mp_state) 3958 { 3959 vcpu_load(vcpu); 3960 mp_state->mp_state = vcpu->arch.mp_state; 3961 vcpu_put(vcpu); 3962 return 0; 3963 } 3964 3965 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3966 struct kvm_mp_state *mp_state) 3967 { 3968 vcpu_load(vcpu); 3969 vcpu->arch.mp_state = mp_state->mp_state; 3970 vcpu_put(vcpu); 3971 return 0; 3972 } 3973 3974 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3975 struct kvm_segment *var, int seg) 3976 { 3977 kvm_x86_ops->set_segment(vcpu, var, seg); 3978 } 3979 3980 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3981 struct kvm_segment *kvm_desct) 3982 { 3983 kvm_desct->base = get_desc_base(seg_desc); 3984 kvm_desct->limit = get_desc_limit(seg_desc); 3985 if (seg_desc->g) { 3986 kvm_desct->limit <<= 12; 3987 kvm_desct->limit |= 0xfff; 3988 } 3989 kvm_desct->selector = selector; 3990 kvm_desct->type = seg_desc->type; 3991 kvm_desct->present = seg_desc->p; 3992 kvm_desct->dpl = seg_desc->dpl; 3993 kvm_desct->db = seg_desc->d; 3994 kvm_desct->s = seg_desc->s; 3995 kvm_desct->l = seg_desc->l; 3996 kvm_desct->g = seg_desc->g; 3997 kvm_desct->avl = seg_desc->avl; 3998 if (!selector) 3999 kvm_desct->unusable = 1; 4000 else 4001 kvm_desct->unusable = 0; 4002 kvm_desct->padding = 0; 4003 } 4004 4005 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 4006 u16 selector, 4007 struct descriptor_table *dtable) 4008 { 4009 if (selector & 1 << 2) { 4010 struct kvm_segment kvm_seg; 4011 4012 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 4013 4014 if (kvm_seg.unusable) 4015 dtable->limit = 0; 4016 else 4017 dtable->limit = kvm_seg.limit; 4018 dtable->base = kvm_seg.base; 4019 } 4020 else 4021 kvm_x86_ops->get_gdt(vcpu, dtable); 4022 } 4023 4024 /* allowed just for 8 bytes segments */ 4025 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4026 struct desc_struct *seg_desc) 4027 { 4028 struct descriptor_table dtable; 4029 u16 index = selector >> 3; 4030 4031 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4032 4033 if (dtable.limit < index * 8 + 7) { 4034 
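/* The selector indexes past the descriptor table limit: raise #GP with the faulting selector. */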
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4035 return 1; 4036 } 4037 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4038 } 4039 4040 /* allowed just for 8 bytes segments */ 4041 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4042 struct desc_struct *seg_desc) 4043 { 4044 struct descriptor_table dtable; 4045 u16 index = selector >> 3; 4046 4047 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4048 4049 if (dtable.limit < index * 8 + 7) 4050 return 1; 4051 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4052 } 4053 4054 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4055 struct desc_struct *seg_desc) 4056 { 4057 u32 base_addr = get_desc_base(seg_desc); 4058 4059 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4060 } 4061 4062 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4063 { 4064 struct kvm_segment kvm_seg; 4065 4066 kvm_get_segment(vcpu, &kvm_seg, seg); 4067 return kvm_seg.selector; 4068 } 4069 4070 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 4071 u16 selector, 4072 struct kvm_segment *kvm_seg) 4073 { 4074 struct desc_struct seg_desc; 4075 4076 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 4077 return 1; 4078 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 4079 return 0; 4080 } 4081 4082 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4083 { 4084 struct kvm_segment segvar = { 4085 .base = selector << 4, 4086 .limit = 0xffff, 4087 .selector = selector, 4088 .type = 3, 4089 .present = 1, 4090 .dpl = 3, 4091 .db = 0, 4092 .s = 1, 4093 .l = 0, 4094 .g = 0, 4095 .avl = 0, 4096 .unusable = 0, 4097 }; 4098 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4099 return 0; 4100 } 4101 4102 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4103 { 4104 return (seg != VCPU_SREG_LDTR) && 4105 (seg != VCPU_SREG_TR) && 4106 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4107 } 4108 4109 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4110 int type_bits, int seg) 4111 { 4112 struct kvm_segment kvm_seg; 4113 4114 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4115 return kvm_load_realmode_segment(vcpu, selector, seg); 4116 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4117 return 1; 4118 kvm_seg.type |= type_bits; 4119 4120 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4121 seg != VCPU_SREG_LDTR) 4122 if (!kvm_seg.s) 4123 kvm_seg.unusable = 1; 4124 4125 kvm_set_segment(vcpu, &kvm_seg, seg); 4126 return 0; 4127 } 4128 4129 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4130 struct tss_segment_32 *tss) 4131 { 4132 tss->cr3 = vcpu->arch.cr3; 4133 tss->eip = kvm_rip_read(vcpu); 4134 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4135 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4136 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4137 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4138 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4139 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4140 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4141 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4142 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4143 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4144 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4145 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4146 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 
4147 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 4148 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 4149 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4150 } 4151 4152 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4153 struct tss_segment_32 *tss) 4154 { 4155 kvm_set_cr3(vcpu, tss->cr3); 4156 4157 kvm_rip_write(vcpu, tss->eip); 4158 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4159 4160 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4161 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4162 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 4163 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 4164 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 4165 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 4166 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4167 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4168 4169 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4170 return 1; 4171 4172 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4173 return 1; 4174 4175 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4176 return 1; 4177 4178 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4179 return 1; 4180 4181 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4182 return 1; 4183 4184 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4185 return 1; 4186 4187 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4188 return 1; 4189 return 0; 4190 } 4191 4192 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 4193 struct tss_segment_16 *tss) 4194 { 4195 tss->ip = kvm_rip_read(vcpu); 4196 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4197 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4198 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4199 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4200 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4201 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4202 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4203 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 4204 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 4205 4206 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4207 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4208 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4209 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4210 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4211 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 4212 } 4213 4214 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4215 struct tss_segment_16 *tss) 4216 { 4217 kvm_rip_write(vcpu, tss->ip); 4218 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4219 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4220 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4221 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4222 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 4223 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 4224 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 4225 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 4226 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 4227 4228 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 4229 return 1; 4230 4231 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4232 return 1; 4233 4234 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4235 return 1; 4236 4237 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4238 return 1; 4239 4240 if 
static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
			      u16 old_tss_sel, u32 old_tss_base,
			      struct desc_struct *nseg_desc)
{
	struct tss_segment_16 tss_segment_16;
	int ret = 0;

	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
			   sizeof tss_segment_16))
		goto out;

	save_state_to_tss16(vcpu, &tss_segment_16);

	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
			    sizeof tss_segment_16))
		goto out;

	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
			   &tss_segment_16, sizeof tss_segment_16))
		goto out;

	if (old_tss_sel != 0xffff) {
		tss_segment_16.prev_task_link = old_tss_sel;

		if (kvm_write_guest(vcpu->kvm,
				    get_tss_base_addr(vcpu, nseg_desc),
				    &tss_segment_16.prev_task_link,
				    sizeof tss_segment_16.prev_task_link))
			goto out;
	}

	if (load_state_from_tss16(vcpu, &tss_segment_16))
		goto out;

	ret = 1;
out:
	return ret;
}

static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
			      u16 old_tss_sel, u32 old_tss_base,
			      struct desc_struct *nseg_desc)
{
	struct tss_segment_32 tss_segment_32;
	int ret = 0;

	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
			   sizeof tss_segment_32))
		goto out;

	save_state_to_tss32(vcpu, &tss_segment_32);

	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
			    sizeof tss_segment_32))
		goto out;

	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
			   &tss_segment_32, sizeof tss_segment_32))
		goto out;

	if (old_tss_sel != 0xffff) {
		tss_segment_32.prev_task_link = old_tss_sel;

		if (kvm_write_guest(vcpu->kvm,
				    get_tss_base_addr(vcpu, nseg_desc),
				    &tss_segment_32.prev_task_link,
				    sizeof tss_segment_32.prev_task_link))
			goto out;
	}

	if (load_state_from_tss32(vcpu, &tss_segment_32))
		goto out;

	ret = 1;
out:
	return ret;
}
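/*
 * Both helpers above follow the same four-step sequence: (1) read the
 * outgoing task's TSS from guest memory, (2) refresh it with current
 * register state and write it back, (3) read the incoming TSS and, for
 * CALL/gate switches, stamp its prev_task_link with the old TR
 * selector so a later IRET can find its way back, and (4) load vcpu
 * state from the incoming image.  old_tss_sel == 0xffff serves as the
 * "no back link" sentinel, as set up in kvm_task_switch() below.
 */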
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
{
	struct kvm_segment tr_seg;
	struct desc_struct cseg_desc;
	struct desc_struct nseg_desc;
	int ret = 0;
	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);

	old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);

	/* FIXME: Handle errors. Failure to read either TSS or their
	 * descriptors should generate a pagefault.
	 */
	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
		goto out;

	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
		goto out;

	if (reason != TASK_SWITCH_IRET) {
		int cpl;

		cpl = kvm_x86_ops->get_cpl(vcpu);
		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
	}

	if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
		return 1;
	}

	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
		cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
	}

	if (reason == TASK_SWITCH_IRET) {
		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
	}

	/* Set the back link to the previous task only for CALL and gate
	 * switches; note that old_tss_sel is not used after this point.
	 */
	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
		old_tss_sel = 0xffff;

	if (nseg_desc.type & 8)
		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
					 old_tss_base, &nseg_desc);
	else
		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
					 old_tss_base, &nseg_desc);

	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
		kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
	}

	if (reason != TASK_SWITCH_IRET) {
		nseg_desc.type |= (1 << 1); /* mark the new TSS busy */
		save_guest_segment_descriptor(vcpu, tss_selector,
					      &nseg_desc);
	}

	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
	tr_seg.type = 11;
	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
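/*
 * The descriptor-type arithmetic used above, spelled out (architectural
 * background, no new logic): bit 3 of a TSS descriptor's type field
 * distinguishes 32-bit from 16-bit TSSs (types 9/11 vs 1/3), which is
 * why "nseg_desc.type & 8" selects the switch flavour; bit 1 is the
 * busy bit, so type 9 (available 32-bit TSS) becomes 11 (busy) once
 * the task is running -- the value forced into tr_seg.type before TR
 * is reloaded.  The 0x67 limit check corresponds to the 104-byte
 * minimum size of a 32-bit TSS.
 */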
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int mmu_reset_needed = 0;
	int pending_vec, max_bits;
	struct descriptor_table dt;

	vcpu_load(vcpu);

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_x86_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_x86_ops->set_gdt(vcpu, &dt);

	vcpu->arch.cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
	vcpu->arch.cr3 = sregs->cr3;

	kvm_set_cr8(vcpu, sregs->cr8);

	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
	kvm_x86_ops->set_efer(vcpu, sregs->efer);
	kvm_set_apic_base(vcpu, sregs->apic_base);

	kvm_x86_ops->decache_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
	vcpu->arch.cr0 = sregs->cr0;

	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->arch.cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
	pending_vec = find_first_bit(
		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
	if (pending_vec < max_bits) {
		kvm_queue_interrupt(vcpu, pending_vec, false);
		pr_debug("Set back pending irq %d\n", pending_vec);
		if (irqchip_in_kernel(vcpu->kvm))
			kvm_pic_clear_isr_ack(vcpu->kvm);
	}

	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	update_cr8_intercept(vcpu);

	/* Older userspace won't unhalt the vcpu on reset. */
	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
	    !(vcpu->arch.cr0 & X86_CR0_PE))
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
					struct kvm_guest_debug *dbg)
{
	int i, r;

	vcpu_load(vcpu);

	if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; ++i)
			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
		vcpu->arch.switch_db_regs =
			(dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
	} else {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
	}

	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);

	if (dbg->control & KVM_GUESTDBG_INJECT_DB)
		kvm_queue_exception(vcpu, DB_VECTOR);
	else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
		kvm_queue_exception(vcpu, BP_VECTOR);

	vcpu_put(vcpu);

	return r;
}

/*
 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
 * we have asm/x86/processor.h
 */
struct fxsave {
	u16	cwd;
	u16	swd;
	u16	twd;
	u16	fop;
	u64	rip;
	u64	rdp;
	u32	mxcsr;
	u32	mxcsr_mask;
	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
#ifdef CONFIG_X86_64
	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
#else
	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
#endif
};
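/*
 * Layout sanity check for the struct above (plain FXSAVE arithmetic,
 * shown for illustration): each of the eight x87 registers occupies a
 * 16-byte slot even though the value itself is only 80 bits wide, so
 * st_space needs 8 * 16 = 128 bytes (32 u32s); each XMM register
 * likewise takes a full 16-byte slot, giving 256 bytes for 16
 * registers on x86_64 and 128 bytes for 8 registers on 32-bit.  The
 * kvm_fpu ABI copied by the accessors below mirrors these sizes.
 */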
/*
 * Translate a guest virtual address to a guest physical address.
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				  struct kvm_translation *tr)
{
	unsigned long vaddr = tr->linear_address;
	gpa_t gpa;

	vcpu_load(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
	up_read(&vcpu->kvm->slots_lock);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	tr->writeable = 1;
	tr->usermode = 0;
	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fpu->fpr, fxsave->st_space, 128);
	fpu->fcw = fxsave->cwd;
	fpu->fsw = fxsave->swd;
	fpu->ftwx = fxsave->twd;
	fpu->last_opcode = fxsave->fop;
	fpu->last_ip = fxsave->rip;
	fpu->last_dp = fxsave->rdp;
	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fxsave->st_space, fpu->fpr, 128);
	fxsave->cwd = fpu->fcw;
	fxsave->swd = fpu->fsw;
	fxsave->twd = fpu->ftwx;
	fxsave->fop = fpu->last_opcode;
	fxsave->rip = fpu->last_ip;
	fxsave->rdp = fpu->last_dp;
	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}
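/*
 * For reference, kvm_arch_vcpu_ioctl_translate() above backs the
 * KVM_TRANSLATE vcpu ioctl.  A minimal userspace sketch (hypothetical
 * variable names, error handling elided):
 *
 *	struct kvm_translation tr = { .linear_address = gva };
 *
 *	if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
 *		use(tr.physical_address);	// guest-physical address
 *
 * An unmapped gva comes back with tr.valid == 0 rather than an errno.
 */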
void fx_init(struct kvm_vcpu *vcpu)
{
	unsigned after_mxcsr_mask;

	/*
	 * Touch the fpu the first time in non-atomic context: if this
	 * is the first fpu instruction, the exception handler will
	 * fire before the instruction returns, and it will have to
	 * allocate ram with GFP_KERNEL.
	 */
	if (!used_math())
		kvm_fx_save(&vcpu->arch.host_fx_image);

	/* Initialize guest FPU by resetting ours and saving into guest's */
	preempt_disable();
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_finit();
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	preempt_enable();

	vcpu->arch.cr0 |= X86_CR0_ET;
	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
	vcpu->arch.guest_fx_image.mxcsr = 0x1f80; /* reset value: all SSE exceptions masked */
	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);

void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 1;
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_restore(&vcpu->arch.guest_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 0;
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	++vcpu->stat.fpu_reload;
}
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
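/*
 * Pairing rule for the two helpers above (summarizing the behaviour as
 * implemented): kvm_load_guest_fpu() is called on the way into guest
 * execution and swaps the host's fxsave state for the guest's;
 * kvm_put_guest_fpu() undoes the swap on the way out and bumps the
 * fpu_reload counter visible in debugfs.  Both are no-ops when the
 * guest FPU is inactive or the swap is already in the desired state,
 * so callers need not track it themselves.
 */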
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.time_page) {
		kvm_release_page_dirty(vcpu->arch.time_page);
		vcpu->arch.time_page = NULL;
	}

	kvm_x86_ops->vcpu_free(vcpu);
}

struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
				      unsigned int id)
{
	return kvm_x86_ops->vcpu_create(kvm, id);
}

int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
	int r;

	/* We do fxsave: this must be aligned on a 16-byte boundary. */
	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);

	vcpu->arch.mtrr_state.have_fixed = 1;
	vcpu_load(vcpu);
	r = kvm_arch_vcpu_reset(vcpu);
	if (r == 0)
		r = kvm_mmu_setup(vcpu);
	vcpu_put(vcpu);
	if (r < 0)
		goto free_vcpu;

	return 0;
free_vcpu:
	kvm_x86_ops->vcpu_free(vcpu);
	return r;
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);

	kvm_x86_ops->vcpu_free(vcpu);
}

int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = false;
	vcpu->arch.nmi_injected = false;

	vcpu->arch.switch_db_regs = 0;
	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
	vcpu->arch.dr6 = DR6_FIXED_1;
	vcpu->arch.dr7 = DR7_FIXED_1;

	return kvm_x86_ops->vcpu_reset(vcpu);
}

void kvm_arch_hardware_enable(void *garbage)
{
	kvm_x86_ops->hardware_enable(garbage);
}

void kvm_arch_hardware_disable(void *garbage)
{
	kvm_x86_ops->hardware_disable(garbage);
}

int kvm_arch_hardware_setup(void)
{
	return kvm_x86_ops->hardware_setup();
}

void kvm_arch_hardware_unsetup(void)
{
	kvm_x86_ops->hardware_unsetup();
}

void kvm_arch_check_processor_compat(void *rtn)
{
	kvm_x86_ops->check_processor_compatibility(rtn);
}

int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
	struct page *page;
	struct kvm *kvm;
	int r;

	BUG_ON(vcpu->kvm == NULL);
	kvm = vcpu->kvm;

	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	else
		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->arch.pio_data = page_address(page);

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto fail_free_pio_data;

	if (irqchip_in_kernel(kvm)) {
		r = kvm_create_lapic(vcpu);
		if (r < 0)
			goto fail_mmu_destroy;
	}

	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
				       GFP_KERNEL);
	if (!vcpu->arch.mce_banks) {
		r = -ENOMEM;
		goto fail_mmu_destroy;
	}
	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;

	return 0;

fail_mmu_destroy:
	kvm_mmu_destroy(vcpu);
fail_free_pio_data:
	free_page((unsigned long)vcpu->arch.pio_data);
fail:
	return r;
}

void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_free_lapic(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	kvm_mmu_destroy(vcpu);
	up_read(&vcpu->kvm->slots_lock);
	free_page((unsigned long)vcpu->arch.pio_data);
}

struct kvm *kvm_arch_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

	rdtscll(kvm->arch.vm_init_tsc);

	return kvm;
}

static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
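/*
 * Sizing note for the mce_banks allocation in kvm_arch_vcpu_init()
 * above (arithmetic only, no behavioural change): each machine-check
 * bank is modelled by four u64 MSRs (MCi_CTL, MCi_STATUS, MCi_ADDR,
 * MCi_MISC), so the array holds KVM_MAX_MCE_BANKS * 4 u64s -- with 32
 * banks that is 32 * 4 * 8 = 1024 bytes.
 */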
static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;
	struct kvm_vcpu *vcpu;

	/*
	 * Unpin any mmu pages first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_unload_vcpu_mmu(vcpu);
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_arch_vcpu_free(vcpu);

	mutex_lock(&kvm->lock);
	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
		kvm->vcpus[i] = NULL;

	atomic_set(&kvm->online_vcpus, 0);
	mutex_unlock(&kvm->lock);
}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kvm_free_pit(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
	kfree(kvm);
}

int kvm_arch_set_memory_region(struct kvm *kvm,
			       struct kvm_userspace_memory_region *mem,
			       struct kvm_memory_slot old,
			       int user_alloc)
{
	int npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

	/* To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	spin_lock(&kvm->mmu_lock);
	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	spin_unlock(&kvm->mmu_lock);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending
		|| (kvm_arch_interrupt_allowed(vcpu) &&
		    kvm_cpu_has_interrupt(vcpu));
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
			smp_send_reschedule(cpu);
	put_cpu();
}
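/*
 * A short walk through kvm_vcpu_kick() above (describing the code as
 * written): a halted vcpu sleeps on vcpu->wq, so a wake_up suffices
 * for it, while a vcpu currently executing guest code on another
 * physical CPU needs an IPI (smp_send_reschedule) to force a vmexit.
 * The test_and_set_bit on KVM_REQ_KICK keeps at most one IPI in
 * flight per exit, and kicking ourselves would be pointless, hence
 * the "cpu != me" check under get_cpu()/put_cpu().
 */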
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	return kvm_x86_ops->interrupt_allowed(vcpu);
}

EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);