1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * derived from drivers/kvm/kvm_main.c 5 * 6 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc. 8 * Copyright IBM Corporation, 2008 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Amit Shah <amit.shah@qumranet.com> 14 * Ben-Ami Yassour <benami@il.ibm.com> 15 * 16 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * the COPYING file in the top-level directory. 18 * 19 */ 20 21 #include <linux/kvm_host.h> 22 #include "irq.h" 23 #include "mmu.h" 24 #include "i8254.h" 25 #include "tss.h" 26 #include "kvm_cache_regs.h" 27 #include "x86.h" 28 29 #include <linux/clocksource.h> 30 #include <linux/interrupt.h> 31 #include <linux/kvm.h> 32 #include <linux/fs.h> 33 #include <linux/vmalloc.h> 34 #include <linux/module.h> 35 #include <linux/mman.h> 36 #include <linux/highmem.h> 37 #include <linux/iommu.h> 38 #include <linux/intel-iommu.h> 39 #include <linux/cpufreq.h> 40 #include <trace/events/kvm.h> 41 #undef TRACE_INCLUDE_FILE 42 #define CREATE_TRACE_POINTS 43 #include "trace.h" 44 45 #include <asm/uaccess.h> 46 #include <asm/msr.h> 47 #include <asm/desc.h> 48 #include <asm/mtrr.h> 49 #include <asm/mce.h> 50 51 #define MAX_IO_MSRS 256 52 #define CR0_RESERVED_BITS \ 53 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 54 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 55 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 56 #define CR4_RESERVED_BITS \ 57 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 58 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 59 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 60 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 61 62 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 63 64 #define KVM_MAX_MCE_BANKS 32 65 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 66 67 /* EFER defaults: 68 * - enable syscall per default because its emulated by KVM 69 * - enable LME and LMA per default on 64 bit KVM 70 */ 71 #ifdef CONFIG_X86_64 72 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 73 #else 74 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 75 #endif 76 77 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 78 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 79 80 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 81 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 82 struct kvm_cpuid_entry2 __user *entries); 83 84 struct kvm_x86_ops *kvm_x86_ops; 85 EXPORT_SYMBOL_GPL(kvm_x86_ops); 86 87 int ignore_msrs = 0; 88 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 89 90 struct kvm_stats_debugfs_item debugfs_entries[] = { 91 { "pf_fixed", VCPU_STAT(pf_fixed) }, 92 { "pf_guest", VCPU_STAT(pf_guest) }, 93 { "tlb_flush", VCPU_STAT(tlb_flush) }, 94 { "invlpg", VCPU_STAT(invlpg) }, 95 { "exits", VCPU_STAT(exits) }, 96 { "io_exits", VCPU_STAT(io_exits) }, 97 { "mmio_exits", VCPU_STAT(mmio_exits) }, 98 { "signal_exits", VCPU_STAT(signal_exits) }, 99 { "irq_window", VCPU_STAT(irq_window_exits) }, 100 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 101 { "halt_exits", VCPU_STAT(halt_exits) }, 102 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 103 { "hypercalls", VCPU_STAT(hypercalls) }, 104 { "request_irq", VCPU_STAT(request_irq_exits) }, 105 { "irq_exits", VCPU_STAT(irq_exits) }, 106 { "host_state_reload", VCPU_STAT(host_state_reload) }, 107 { "efer_reload", VCPU_STAT(efer_reload) }, 108 { "fpu_reload", 
VCPU_STAT(fpu_reload) }, 109 { "insn_emulation", VCPU_STAT(insn_emulation) }, 110 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 111 { "irq_injections", VCPU_STAT(irq_injections) }, 112 { "nmi_injections", VCPU_STAT(nmi_injections) }, 113 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 114 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 115 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 116 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 117 { "mmu_flooded", VM_STAT(mmu_flooded) }, 118 { "mmu_recycled", VM_STAT(mmu_recycled) }, 119 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 120 { "mmu_unsync", VM_STAT(mmu_unsync) }, 121 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 122 { "largepages", VM_STAT(lpages) }, 123 { NULL } 124 }; 125 126 unsigned long segment_base(u16 selector) 127 { 128 struct descriptor_table gdt; 129 struct desc_struct *d; 130 unsigned long table_base; 131 unsigned long v; 132 133 if (selector == 0) 134 return 0; 135 136 kvm_get_gdt(&gdt); 137 table_base = gdt.base; 138 139 if (selector & 4) { /* from ldt */ 140 u16 ldt_selector = kvm_read_ldt(); 141 142 table_base = segment_base(ldt_selector); 143 } 144 d = (struct desc_struct *)(table_base + (selector & ~7)); 145 v = get_desc_base(d); 146 #ifdef CONFIG_X86_64 147 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 148 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 149 #endif 150 return v; 151 } 152 EXPORT_SYMBOL_GPL(segment_base); 153 154 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 155 { 156 if (irqchip_in_kernel(vcpu->kvm)) 157 return vcpu->arch.apic_base; 158 else 159 return vcpu->arch.apic_base; 160 } 161 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 162 163 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 164 { 165 /* TODO: reserve bits check */ 166 if (irqchip_in_kernel(vcpu->kvm)) 167 kvm_lapic_set_base(vcpu, data); 168 else 169 vcpu->arch.apic_base = data; 170 } 171 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 172 173 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 174 { 175 WARN_ON(vcpu->arch.exception.pending); 176 vcpu->arch.exception.pending = true; 177 vcpu->arch.exception.has_error_code = false; 178 vcpu->arch.exception.nr = nr; 179 } 180 EXPORT_SYMBOL_GPL(kvm_queue_exception); 181 182 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 183 u32 error_code) 184 { 185 ++vcpu->stat.pf_guest; 186 187 if (vcpu->arch.exception.pending) { 188 switch(vcpu->arch.exception.nr) { 189 case DF_VECTOR: 190 /* triple fault -> shutdown */ 191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 192 return; 193 case PF_VECTOR: 194 vcpu->arch.exception.nr = DF_VECTOR; 195 vcpu->arch.exception.error_code = 0; 196 return; 197 default: 198 /* replace previous exception with a new one in a hope 199 that instruction re-execution will regenerate lost 200 exception */ 201 vcpu->arch.exception.pending = false; 202 break; 203 } 204 } 205 vcpu->arch.cr2 = addr; 206 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 207 } 208 209 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 210 { 211 vcpu->arch.nmi_pending = 1; 212 } 213 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 214 215 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 216 { 217 WARN_ON(vcpu->arch.exception.pending); 218 vcpu->arch.exception.pending = true; 219 vcpu->arch.exception.has_error_code = true; 220 vcpu->arch.exception.nr = nr; 221 vcpu->arch.exception.error_code = error_code; 222 } 223 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 224 225 /* 226 * Checks if cpl <= required_cpl; if true, 
return true. Otherwise queue 227 * a #GP and return false. 228 */ 229 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 230 { 231 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) 232 return true; 233 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 234 return false; 235 } 236 EXPORT_SYMBOL_GPL(kvm_require_cpl); 237 238 /* 239 * Load the pae pdptrs. Return true is they are all valid. 240 */ 241 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 242 { 243 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 244 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 245 int i; 246 int ret; 247 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 248 249 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 250 offset * sizeof(u64), sizeof(pdpte)); 251 if (ret < 0) { 252 ret = 0; 253 goto out; 254 } 255 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 256 if (is_present_gpte(pdpte[i]) && 257 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 258 ret = 0; 259 goto out; 260 } 261 } 262 ret = 1; 263 264 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 265 __set_bit(VCPU_EXREG_PDPTR, 266 (unsigned long *)&vcpu->arch.regs_avail); 267 __set_bit(VCPU_EXREG_PDPTR, 268 (unsigned long *)&vcpu->arch.regs_dirty); 269 out: 270 271 return ret; 272 } 273 EXPORT_SYMBOL_GPL(load_pdptrs); 274 275 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 276 { 277 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 278 bool changed = true; 279 int r; 280 281 if (is_long_mode(vcpu) || !is_pae(vcpu)) 282 return false; 283 284 if (!test_bit(VCPU_EXREG_PDPTR, 285 (unsigned long *)&vcpu->arch.regs_avail)) 286 return true; 287 288 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 289 if (r < 0) 290 goto out; 291 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 292 out: 293 294 return changed; 295 } 296 297 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 298 { 299 if (cr0 & CR0_RESERVED_BITS) { 300 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 301 cr0, vcpu->arch.cr0); 302 kvm_inject_gp(vcpu, 0); 303 return; 304 } 305 306 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 307 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 308 kvm_inject_gp(vcpu, 0); 309 return; 310 } 311 312 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 313 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 314 "and a clear PE flag\n"); 315 kvm_inject_gp(vcpu, 0); 316 return; 317 } 318 319 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 320 #ifdef CONFIG_X86_64 321 if ((vcpu->arch.shadow_efer & EFER_LME)) { 322 int cs_db, cs_l; 323 324 if (!is_pae(vcpu)) { 325 printk(KERN_DEBUG "set_cr0: #GP, start paging " 326 "in long mode while PAE is disabled\n"); 327 kvm_inject_gp(vcpu, 0); 328 return; 329 } 330 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 331 if (cs_l) { 332 printk(KERN_DEBUG "set_cr0: #GP, start paging " 333 "in long mode while CS.L == 1\n"); 334 kvm_inject_gp(vcpu, 0); 335 return; 336 337 } 338 } else 339 #endif 340 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 341 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 342 "reserved bits\n"); 343 kvm_inject_gp(vcpu, 0); 344 return; 345 } 346 347 } 348 349 kvm_x86_ops->set_cr0(vcpu, cr0); 350 vcpu->arch.cr0 = cr0; 351 352 kvm_mmu_reset_context(vcpu); 353 return; 354 } 355 EXPORT_SYMBOL_GPL(kvm_set_cr0); 356 357 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 358 { 359 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 360 } 361 EXPORT_SYMBOL_GPL(kvm_lmsw); 362 363 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 364 { 
365 unsigned long old_cr4 = vcpu->arch.cr4; 366 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 367 368 if (cr4 & CR4_RESERVED_BITS) { 369 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 370 kvm_inject_gp(vcpu, 0); 371 return; 372 } 373 374 if (is_long_mode(vcpu)) { 375 if (!(cr4 & X86_CR4_PAE)) { 376 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 377 "in long mode\n"); 378 kvm_inject_gp(vcpu, 0); 379 return; 380 } 381 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 382 && ((cr4 ^ old_cr4) & pdptr_bits) 383 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 384 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 385 kvm_inject_gp(vcpu, 0); 386 return; 387 } 388 389 if (cr4 & X86_CR4_VMXE) { 390 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 391 kvm_inject_gp(vcpu, 0); 392 return; 393 } 394 kvm_x86_ops->set_cr4(vcpu, cr4); 395 vcpu->arch.cr4 = cr4; 396 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 397 kvm_mmu_reset_context(vcpu); 398 } 399 EXPORT_SYMBOL_GPL(kvm_set_cr4); 400 401 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 402 { 403 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 404 kvm_mmu_sync_roots(vcpu); 405 kvm_mmu_flush_tlb(vcpu); 406 return; 407 } 408 409 if (is_long_mode(vcpu)) { 410 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 411 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 412 kvm_inject_gp(vcpu, 0); 413 return; 414 } 415 } else { 416 if (is_pae(vcpu)) { 417 if (cr3 & CR3_PAE_RESERVED_BITS) { 418 printk(KERN_DEBUG 419 "set_cr3: #GP, reserved bits\n"); 420 kvm_inject_gp(vcpu, 0); 421 return; 422 } 423 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 424 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 425 "reserved bits\n"); 426 kvm_inject_gp(vcpu, 0); 427 return; 428 } 429 } 430 /* 431 * We don't check reserved bits in nonpae mode, because 432 * this isn't enforced, and VMware depends on this. 433 */ 434 } 435 436 /* 437 * Does the new cr3 value map to physical memory? (Note, we 438 * catch an invalid cr3 even in real-mode, because it would 439 * cause trouble later on when we turn on paging anyway.) 440 * 441 * A real CPU would silently accept an invalid cr3 and would 442 * attempt to use it - with largely undefined (and often hard 443 * to debug) behavior on the guest side. 444 */ 445 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 446 kvm_inject_gp(vcpu, 0); 447 else { 448 vcpu->arch.cr3 = cr3; 449 vcpu->arch.mmu.new_cr3(vcpu); 450 } 451 } 452 EXPORT_SYMBOL_GPL(kvm_set_cr3); 453 454 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 455 { 456 if (cr8 & CR8_RESERVED_BITS) { 457 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 458 kvm_inject_gp(vcpu, 0); 459 return; 460 } 461 if (irqchip_in_kernel(vcpu->kvm)) 462 kvm_lapic_set_tpr(vcpu, cr8); 463 else 464 vcpu->arch.cr8 = cr8; 465 } 466 EXPORT_SYMBOL_GPL(kvm_set_cr8); 467 468 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 469 { 470 if (irqchip_in_kernel(vcpu->kvm)) 471 return kvm_lapic_get_cr8(vcpu); 472 else 473 return vcpu->arch.cr8; 474 } 475 EXPORT_SYMBOL_GPL(kvm_get_cr8); 476 477 static inline u32 bit(int bitno) 478 { 479 return 1 << (bitno & 31); 480 } 481 482 /* 483 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 484 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 485 * 486 * This list is modified at module load time to reflect the 487 * capabilities of the host cpu. 
488 */ 489 static u32 msrs_to_save[] = { 490 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 491 MSR_K6_STAR, 492 #ifdef CONFIG_X86_64 493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 494 #endif 495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 497 }; 498 499 static unsigned num_msrs_to_save; 500 501 static u32 emulated_msrs[] = { 502 MSR_IA32_MISC_ENABLE, 503 }; 504 505 static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 506 { 507 if (efer & efer_reserved_bits) { 508 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 509 efer); 510 kvm_inject_gp(vcpu, 0); 511 return; 512 } 513 514 if (is_paging(vcpu) 515 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 516 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 517 kvm_inject_gp(vcpu, 0); 518 return; 519 } 520 521 if (efer & EFER_FFXSR) { 522 struct kvm_cpuid_entry2 *feat; 523 524 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 525 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 526 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); 527 kvm_inject_gp(vcpu, 0); 528 return; 529 } 530 } 531 532 if (efer & EFER_SVME) { 533 struct kvm_cpuid_entry2 *feat; 534 535 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 536 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 537 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); 538 kvm_inject_gp(vcpu, 0); 539 return; 540 } 541 } 542 543 kvm_x86_ops->set_efer(vcpu, efer); 544 545 efer &= ~EFER_LMA; 546 efer |= vcpu->arch.shadow_efer & EFER_LMA; 547 548 vcpu->arch.shadow_efer = efer; 549 550 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 551 kvm_mmu_reset_context(vcpu); 552 } 553 554 void kvm_enable_efer_bits(u64 mask) 555 { 556 efer_reserved_bits &= ~mask; 557 } 558 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 559 560 561 /* 562 * Writes msr value into into the appropriate "register". 563 * Returns 0 on success, non-0 otherwise. 564 * Assumes vcpu_load() was already called. 565 */ 566 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 567 { 568 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 569 } 570 571 /* 572 * Adapt set_msr() to msr_io()'s calling convention 573 */ 574 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 575 { 576 return kvm_set_msr(vcpu, index, *data); 577 } 578 579 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 580 { 581 static int version; 582 struct pvclock_wall_clock wc; 583 struct timespec now, sys, boot; 584 585 if (!wall_clock) 586 return; 587 588 version++; 589 590 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 591 592 /* 593 * The guest calculates current wall clock time by adding 594 * system time (updated by kvm_write_guest_time below) to the 595 * wall clock specified here. guest system time equals host 596 * system time for us, thus we must fill in host boot time here. 
597 */ 598 now = current_kernel_time(); 599 ktime_get_ts(&sys); 600 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 601 602 wc.sec = boot.tv_sec; 603 wc.nsec = boot.tv_nsec; 604 wc.version = version; 605 606 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 607 608 version++; 609 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 610 } 611 612 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 613 { 614 uint32_t quotient, remainder; 615 616 /* Don't try to replace with do_div(), this one calculates 617 * "(dividend << 32) / divisor" */ 618 __asm__ ( "divl %4" 619 : "=a" (quotient), "=d" (remainder) 620 : "0" (0), "1" (dividend), "r" (divisor) ); 621 return quotient; 622 } 623 624 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 625 { 626 uint64_t nsecs = 1000000000LL; 627 int32_t shift = 0; 628 uint64_t tps64; 629 uint32_t tps32; 630 631 tps64 = tsc_khz * 1000LL; 632 while (tps64 > nsecs*2) { 633 tps64 >>= 1; 634 shift--; 635 } 636 637 tps32 = (uint32_t)tps64; 638 while (tps32 <= (uint32_t)nsecs) { 639 tps32 <<= 1; 640 shift++; 641 } 642 643 hv_clock->tsc_shift = shift; 644 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 645 646 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 647 __func__, tsc_khz, hv_clock->tsc_shift, 648 hv_clock->tsc_to_system_mul); 649 } 650 651 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 652 653 static void kvm_write_guest_time(struct kvm_vcpu *v) 654 { 655 struct timespec ts; 656 unsigned long flags; 657 struct kvm_vcpu_arch *vcpu = &v->arch; 658 void *shared_kaddr; 659 unsigned long this_tsc_khz; 660 661 if ((!vcpu->time_page)) 662 return; 663 664 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 665 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 666 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 667 vcpu->hv_clock_tsc_khz = this_tsc_khz; 668 } 669 put_cpu_var(cpu_tsc_khz); 670 671 /* Keep irq disabled to prevent changes to the clock */ 672 local_irq_save(flags); 673 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 674 ktime_get_ts(&ts); 675 local_irq_restore(flags); 676 677 /* With all the info we got, fill in the values */ 678 679 vcpu->hv_clock.system_time = ts.tv_nsec + 680 (NSEC_PER_SEC * (u64)ts.tv_sec); 681 /* 682 * The interface expects us to write an even number signaling that the 683 * update is finished. Since the guest won't see the intermediate 684 * state, we just increase by 2 at the end. 685 */ 686 vcpu->hv_clock.version += 2; 687 688 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 689 690 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 691 sizeof(vcpu->hv_clock)); 692 693 kunmap_atomic(shared_kaddr, KM_USER0); 694 695 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 696 } 697 698 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 699 { 700 struct kvm_vcpu_arch *vcpu = &v->arch; 701 702 if (!vcpu->time_page) 703 return 0; 704 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 705 return 1; 706 } 707 708 static bool msr_mtrr_valid(unsigned msr) 709 { 710 switch (msr) { 711 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 712 case MSR_MTRRfix64K_00000: 713 case MSR_MTRRfix16K_80000: 714 case MSR_MTRRfix16K_A0000: 715 case MSR_MTRRfix4K_C0000: 716 case MSR_MTRRfix4K_C8000: 717 case MSR_MTRRfix4K_D0000: 718 case MSR_MTRRfix4K_D8000: 719 case MSR_MTRRfix4K_E0000: 720 case MSR_MTRRfix4K_E8000: 721 case MSR_MTRRfix4K_F0000: 722 case MSR_MTRRfix4K_F8000: 723 case MSR_MTRRdefType: 724 case MSR_IA32_CR_PAT: 725 return true; 726 case 0x2f8: 727 return true; 728 } 729 return false; 730 } 731 732 static bool valid_pat_type(unsigned t) 733 { 734 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 735 } 736 737 static bool valid_mtrr_type(unsigned t) 738 { 739 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 740 } 741 742 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 743 { 744 int i; 745 746 if (!msr_mtrr_valid(msr)) 747 return false; 748 749 if (msr == MSR_IA32_CR_PAT) { 750 for (i = 0; i < 8; i++) 751 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 752 return false; 753 return true; 754 } else if (msr == MSR_MTRRdefType) { 755 if (data & ~0xcff) 756 return false; 757 return valid_mtrr_type(data & 0xff); 758 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 759 for (i = 0; i < 8 ; i++) 760 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 761 return false; 762 return true; 763 } 764 765 /* variable MTRRs */ 766 return valid_mtrr_type(data & 0xff); 767 } 768 769 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 770 { 771 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 772 773 if (!mtrr_valid(vcpu, msr, data)) 774 return 1; 775 776 if (msr == MSR_MTRRdefType) { 777 vcpu->arch.mtrr_state.def_type = data; 778 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 779 } else if (msr == MSR_MTRRfix64K_00000) 780 p[0] = data; 781 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 782 p[1 + msr - MSR_MTRRfix16K_80000] = data; 783 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 784 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 785 else if (msr == MSR_IA32_CR_PAT) 786 vcpu->arch.pat = data; 787 else { /* Variable MTRRs */ 788 int idx, is_mtrr_mask; 789 u64 *pt; 790 791 idx = (msr - 0x200) / 2; 792 is_mtrr_mask = msr - 0x200 - 2 * idx; 793 if (!is_mtrr_mask) 794 pt = 795 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 796 else 797 pt = 798 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 799 *pt = data; 800 } 801 802 kvm_mmu_reset_context(vcpu); 803 return 0; 804 } 805 806 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 807 { 808 u64 mcg_cap = vcpu->arch.mcg_cap; 809 unsigned bank_num = mcg_cap & 0xff; 810 811 switch (msr) { 812 case MSR_IA32_MCG_STATUS: 813 vcpu->arch.mcg_status = data; 814 break; 815 case MSR_IA32_MCG_CTL: 816 if (!(mcg_cap & MCG_CTL_P)) 817 return 1; 818 if (data != 0 && data != ~(u64)0) 819 return -1; 820 vcpu->arch.mcg_ctl = data; 821 break; 822 default: 823 if (msr >= MSR_IA32_MC0_CTL && 824 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 825 u32 offset = msr - MSR_IA32_MC0_CTL; 826 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 827 if ((offset & 0x3) == 0 && 828 data != 0 && data != ~(u64)0) 829 return -1; 830 vcpu->arch.mce_banks[offset] = data; 831 break; 832 } 833 return 1; 834 } 835 return 0; 836 } 837 838 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 839 { 840 switch (msr) { 841 case MSR_EFER: 842 set_efer(vcpu, data); 843 break; 844 case MSR_K7_HWCR: 845 data &= ~(u64)0x40; /* ignore flush filter disable */ 846 if (data != 
0) { 847 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 848 data); 849 return 1; 850 } 851 break; 852 case MSR_FAM10H_MMIO_CONF_BASE: 853 if (data != 0) { 854 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 855 "0x%llx\n", data); 856 return 1; 857 } 858 break; 859 case MSR_AMD64_NB_CFG: 860 break; 861 case MSR_IA32_DEBUGCTLMSR: 862 if (!data) { 863 /* We support the non-activated case already */ 864 break; 865 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 866 /* Values other than LBR and BTF are vendor-specific, 867 thus reserved and should throw a #GP */ 868 return 1; 869 } 870 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 871 __func__, data); 872 break; 873 case MSR_IA32_UCODE_REV: 874 case MSR_IA32_UCODE_WRITE: 875 case MSR_VM_HSAVE_PA: 876 case MSR_AMD64_PATCH_LOADER: 877 break; 878 case 0x200 ... 0x2ff: 879 return set_msr_mtrr(vcpu, msr, data); 880 case MSR_IA32_APICBASE: 881 kvm_set_apic_base(vcpu, data); 882 break; 883 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 884 return kvm_x2apic_msr_write(vcpu, msr, data); 885 case MSR_IA32_MISC_ENABLE: 886 vcpu->arch.ia32_misc_enable_msr = data; 887 break; 888 case MSR_KVM_WALL_CLOCK: 889 vcpu->kvm->arch.wall_clock = data; 890 kvm_write_wall_clock(vcpu->kvm, data); 891 break; 892 case MSR_KVM_SYSTEM_TIME: { 893 if (vcpu->arch.time_page) { 894 kvm_release_page_dirty(vcpu->arch.time_page); 895 vcpu->arch.time_page = NULL; 896 } 897 898 vcpu->arch.time = data; 899 900 /* we verify if the enable bit is set... */ 901 if (!(data & 1)) 902 break; 903 904 /* ...but clean it before doing the actual write */ 905 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 906 907 vcpu->arch.time_page = 908 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 909 910 if (is_error_page(vcpu->arch.time_page)) { 911 kvm_release_page_clean(vcpu->arch.time_page); 912 vcpu->arch.time_page = NULL; 913 } 914 915 kvm_request_guest_time_update(vcpu); 916 break; 917 } 918 case MSR_IA32_MCG_CTL: 919 case MSR_IA32_MCG_STATUS: 920 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 921 return set_msr_mce(vcpu, msr, data); 922 923 /* Performance counters are not protected by a CPUID bit, 924 * so we should check all of them in the generic path for the sake of 925 * cross vendor migration. 926 * Writing a zero into the event select MSRs disables them, 927 * which we perfectly emulate ;-). Any other value should be at least 928 * reported, some guests depend on them. 929 */ 930 case MSR_P6_EVNTSEL0: 931 case MSR_P6_EVNTSEL1: 932 case MSR_K7_EVNTSEL0: 933 case MSR_K7_EVNTSEL1: 934 case MSR_K7_EVNTSEL2: 935 case MSR_K7_EVNTSEL3: 936 if (data != 0) 937 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 938 "0x%x data 0x%llx\n", msr, data); 939 break; 940 /* at least RHEL 4 unconditionally writes to the perfctr registers, 941 * so we ignore writes to make it happy. 942 */ 943 case MSR_P6_PERFCTR0: 944 case MSR_P6_PERFCTR1: 945 case MSR_K7_PERFCTR0: 946 case MSR_K7_PERFCTR1: 947 case MSR_K7_PERFCTR2: 948 case MSR_K7_PERFCTR3: 949 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 950 "0x%x data 0x%llx\n", msr, data); 951 break; 952 default: 953 if (!ignore_msrs) { 954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 955 msr, data); 956 return 1; 957 } else { 958 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 959 msr, data); 960 break; 961 } 962 } 963 return 0; 964 } 965 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 966 967 968 /* 969 * Reads an msr value (of 'msr_index') into 'pdata'. 970 * Returns 0 on success, non-0 otherwise. 
971 * Assumes vcpu_load() was already called. 972 */ 973 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 974 { 975 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 976 } 977 978 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 979 { 980 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 981 982 if (!msr_mtrr_valid(msr)) 983 return 1; 984 985 if (msr == MSR_MTRRdefType) 986 *pdata = vcpu->arch.mtrr_state.def_type + 987 (vcpu->arch.mtrr_state.enabled << 10); 988 else if (msr == MSR_MTRRfix64K_00000) 989 *pdata = p[0]; 990 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 991 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 992 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 993 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 994 else if (msr == MSR_IA32_CR_PAT) 995 *pdata = vcpu->arch.pat; 996 else { /* Variable MTRRs */ 997 int idx, is_mtrr_mask; 998 u64 *pt; 999 1000 idx = (msr - 0x200) / 2; 1001 is_mtrr_mask = msr - 0x200 - 2 * idx; 1002 if (!is_mtrr_mask) 1003 pt = 1004 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1005 else 1006 pt = 1007 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1008 *pdata = *pt; 1009 } 1010 1011 return 0; 1012 } 1013 1014 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1015 { 1016 u64 data; 1017 u64 mcg_cap = vcpu->arch.mcg_cap; 1018 unsigned bank_num = mcg_cap & 0xff; 1019 1020 switch (msr) { 1021 case MSR_IA32_P5_MC_ADDR: 1022 case MSR_IA32_P5_MC_TYPE: 1023 data = 0; 1024 break; 1025 case MSR_IA32_MCG_CAP: 1026 data = vcpu->arch.mcg_cap; 1027 break; 1028 case MSR_IA32_MCG_CTL: 1029 if (!(mcg_cap & MCG_CTL_P)) 1030 return 1; 1031 data = vcpu->arch.mcg_ctl; 1032 break; 1033 case MSR_IA32_MCG_STATUS: 1034 data = vcpu->arch.mcg_status; 1035 break; 1036 default: 1037 if (msr >= MSR_IA32_MC0_CTL && 1038 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1039 u32 offset = msr - MSR_IA32_MC0_CTL; 1040 data = vcpu->arch.mce_banks[offset]; 1041 break; 1042 } 1043 return 1; 1044 } 1045 *pdata = data; 1046 return 0; 1047 } 1048 1049 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1050 { 1051 u64 data; 1052 1053 switch (msr) { 1054 case MSR_IA32_PLATFORM_ID: 1055 case MSR_IA32_UCODE_REV: 1056 case MSR_IA32_EBL_CR_POWERON: 1057 case MSR_IA32_DEBUGCTLMSR: 1058 case MSR_IA32_LASTBRANCHFROMIP: 1059 case MSR_IA32_LASTBRANCHTOIP: 1060 case MSR_IA32_LASTINTFROMIP: 1061 case MSR_IA32_LASTINTTOIP: 1062 case MSR_K8_SYSCFG: 1063 case MSR_K7_HWCR: 1064 case MSR_VM_HSAVE_PA: 1065 case MSR_P6_PERFCTR0: 1066 case MSR_P6_PERFCTR1: 1067 case MSR_P6_EVNTSEL0: 1068 case MSR_P6_EVNTSEL1: 1069 case MSR_K7_EVNTSEL0: 1070 case MSR_K7_PERFCTR0: 1071 case MSR_K8_INT_PENDING_MSG: 1072 case MSR_AMD64_NB_CFG: 1073 case MSR_FAM10H_MMIO_CONF_BASE: 1074 data = 0; 1075 break; 1076 case MSR_MTRRcap: 1077 data = 0x500 | KVM_NR_VAR_MTRR; 1078 break; 1079 case 0x200 ... 0x2ff: 1080 return get_msr_mtrr(vcpu, msr, pdata); 1081 case 0xcd: /* fsb frequency */ 1082 data = 3; 1083 break; 1084 case MSR_IA32_APICBASE: 1085 data = kvm_get_apic_base(vcpu); 1086 break; 1087 case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: 1088 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1089 break; 1090 case MSR_IA32_MISC_ENABLE: 1091 data = vcpu->arch.ia32_misc_enable_msr; 1092 break; 1093 case MSR_IA32_PERF_STATUS: 1094 /* TSC increment by tick */ 1095 data = 1000ULL; 1096 /* CPU multiplier */ 1097 data |= (((uint64_t)4ULL) << 40); 1098 break; 1099 case MSR_EFER: 1100 data = vcpu->arch.shadow_efer; 1101 break; 1102 case MSR_KVM_WALL_CLOCK: 1103 data = vcpu->kvm->arch.wall_clock; 1104 break; 1105 case MSR_KVM_SYSTEM_TIME: 1106 data = vcpu->arch.time; 1107 break; 1108 case MSR_IA32_P5_MC_ADDR: 1109 case MSR_IA32_P5_MC_TYPE: 1110 case MSR_IA32_MCG_CAP: 1111 case MSR_IA32_MCG_CTL: 1112 case MSR_IA32_MCG_STATUS: 1113 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1114 return get_msr_mce(vcpu, msr, pdata); 1115 default: 1116 if (!ignore_msrs) { 1117 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1118 return 1; 1119 } else { 1120 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 1121 data = 0; 1122 } 1123 break; 1124 } 1125 *pdata = data; 1126 return 0; 1127 } 1128 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1129 1130 /* 1131 * Read or write a bunch of msrs. All parameters are kernel addresses. 1132 * 1133 * @return number of msrs set successfully. 1134 */ 1135 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 1136 struct kvm_msr_entry *entries, 1137 int (*do_msr)(struct kvm_vcpu *vcpu, 1138 unsigned index, u64 *data)) 1139 { 1140 int i; 1141 1142 vcpu_load(vcpu); 1143 1144 down_read(&vcpu->kvm->slots_lock); 1145 for (i = 0; i < msrs->nmsrs; ++i) 1146 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1147 break; 1148 up_read(&vcpu->kvm->slots_lock); 1149 1150 vcpu_put(vcpu); 1151 1152 return i; 1153 } 1154 1155 /* 1156 * Read or write a bunch of msrs. Parameters are user addresses. 1157 * 1158 * @return number of msrs set successfully. 
1159 */ 1160 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 1161 int (*do_msr)(struct kvm_vcpu *vcpu, 1162 unsigned index, u64 *data), 1163 int writeback) 1164 { 1165 struct kvm_msrs msrs; 1166 struct kvm_msr_entry *entries; 1167 int r, n; 1168 unsigned size; 1169 1170 r = -EFAULT; 1171 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 1172 goto out; 1173 1174 r = -E2BIG; 1175 if (msrs.nmsrs >= MAX_IO_MSRS) 1176 goto out; 1177 1178 r = -ENOMEM; 1179 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1180 entries = vmalloc(size); 1181 if (!entries) 1182 goto out; 1183 1184 r = -EFAULT; 1185 if (copy_from_user(entries, user_msrs->entries, size)) 1186 goto out_free; 1187 1188 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1189 if (r < 0) 1190 goto out_free; 1191 1192 r = -EFAULT; 1193 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1194 goto out_free; 1195 1196 r = n; 1197 1198 out_free: 1199 vfree(entries); 1200 out: 1201 return r; 1202 } 1203 1204 int kvm_dev_ioctl_check_extension(long ext) 1205 { 1206 int r; 1207 1208 switch (ext) { 1209 case KVM_CAP_IRQCHIP: 1210 case KVM_CAP_HLT: 1211 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1212 case KVM_CAP_SET_TSS_ADDR: 1213 case KVM_CAP_EXT_CPUID: 1214 case KVM_CAP_CLOCKSOURCE: 1215 case KVM_CAP_PIT: 1216 case KVM_CAP_NOP_IO_DELAY: 1217 case KVM_CAP_MP_STATE: 1218 case KVM_CAP_SYNC_MMU: 1219 case KVM_CAP_REINJECT_CONTROL: 1220 case KVM_CAP_IRQ_INJECT_STATUS: 1221 case KVM_CAP_ASSIGN_DEV_IRQ: 1222 case KVM_CAP_IRQFD: 1223 case KVM_CAP_IOEVENTFD: 1224 case KVM_CAP_PIT2: 1225 case KVM_CAP_PIT_STATE2: 1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1227 r = 1; 1228 break; 1229 case KVM_CAP_COALESCED_MMIO: 1230 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1231 break; 1232 case KVM_CAP_VAPIC: 1233 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1234 break; 1235 case KVM_CAP_NR_VCPUS: 1236 r = KVM_MAX_VCPUS; 1237 break; 1238 case KVM_CAP_NR_MEMSLOTS: 1239 r = KVM_MEMORY_SLOTS; 1240 break; 1241 case KVM_CAP_PV_MMU: 1242 r = !tdp_enabled; 1243 break; 1244 case KVM_CAP_IOMMU: 1245 r = iommu_found(); 1246 break; 1247 case KVM_CAP_MCE: 1248 r = KVM_MAX_MCE_BANKS; 1249 break; 1250 default: 1251 r = 0; 1252 break; 1253 } 1254 return r; 1255 1256 } 1257 1258 long kvm_arch_dev_ioctl(struct file *filp, 1259 unsigned int ioctl, unsigned long arg) 1260 { 1261 void __user *argp = (void __user *)arg; 1262 long r; 1263 1264 switch (ioctl) { 1265 case KVM_GET_MSR_INDEX_LIST: { 1266 struct kvm_msr_list __user *user_msr_list = argp; 1267 struct kvm_msr_list msr_list; 1268 unsigned n; 1269 1270 r = -EFAULT; 1271 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1272 goto out; 1273 n = msr_list.nmsrs; 1274 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1275 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1276 goto out; 1277 r = -E2BIG; 1278 if (n < msr_list.nmsrs) 1279 goto out; 1280 r = -EFAULT; 1281 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1282 num_msrs_to_save * sizeof(u32))) 1283 goto out; 1284 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 1285 &emulated_msrs, 1286 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1287 goto out; 1288 r = 0; 1289 break; 1290 } 1291 case KVM_GET_SUPPORTED_CPUID: { 1292 struct kvm_cpuid2 __user *cpuid_arg = argp; 1293 struct kvm_cpuid2 cpuid; 1294 1295 r = -EFAULT; 1296 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1297 goto out; 1298 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1299 cpuid_arg->entries); 1300 if (r) 1301 goto out; 1302 1303 r = 
-EFAULT; 1304 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1305 goto out; 1306 r = 0; 1307 break; 1308 } 1309 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 1310 u64 mce_cap; 1311 1312 mce_cap = KVM_MCE_CAP_SUPPORTED; 1313 r = -EFAULT; 1314 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 1315 goto out; 1316 r = 0; 1317 break; 1318 } 1319 default: 1320 r = -EINVAL; 1321 } 1322 out: 1323 return r; 1324 } 1325 1326 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1327 { 1328 kvm_x86_ops->vcpu_load(vcpu, cpu); 1329 kvm_request_guest_time_update(vcpu); 1330 } 1331 1332 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1333 { 1334 kvm_x86_ops->vcpu_put(vcpu); 1335 kvm_put_guest_fpu(vcpu); 1336 } 1337 1338 static int is_efer_nx(void) 1339 { 1340 unsigned long long efer = 0; 1341 1342 rdmsrl_safe(MSR_EFER, &efer); 1343 return efer & EFER_NX; 1344 } 1345 1346 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1347 { 1348 int i; 1349 struct kvm_cpuid_entry2 *e, *entry; 1350 1351 entry = NULL; 1352 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1353 e = &vcpu->arch.cpuid_entries[i]; 1354 if (e->function == 0x80000001) { 1355 entry = e; 1356 break; 1357 } 1358 } 1359 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1360 entry->edx &= ~(1 << 20); 1361 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1362 } 1363 } 1364 1365 /* when an old userspace process fills a new kernel module */ 1366 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1367 struct kvm_cpuid *cpuid, 1368 struct kvm_cpuid_entry __user *entries) 1369 { 1370 int r, i; 1371 struct kvm_cpuid_entry *cpuid_entries; 1372 1373 r = -E2BIG; 1374 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1375 goto out; 1376 r = -ENOMEM; 1377 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1378 if (!cpuid_entries) 1379 goto out; 1380 r = -EFAULT; 1381 if (copy_from_user(cpuid_entries, entries, 1382 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1383 goto out_free; 1384 for (i = 0; i < cpuid->nent; i++) { 1385 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1386 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1387 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1388 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1389 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1390 vcpu->arch.cpuid_entries[i].index = 0; 1391 vcpu->arch.cpuid_entries[i].flags = 0; 1392 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1393 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1394 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1395 } 1396 vcpu->arch.cpuid_nent = cpuid->nent; 1397 cpuid_fix_nx_cap(vcpu); 1398 r = 0; 1399 kvm_apic_set_version(vcpu); 1400 1401 out_free: 1402 vfree(cpuid_entries); 1403 out: 1404 return r; 1405 } 1406 1407 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1408 struct kvm_cpuid2 *cpuid, 1409 struct kvm_cpuid_entry2 __user *entries) 1410 { 1411 int r; 1412 1413 r = -E2BIG; 1414 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1415 goto out; 1416 r = -EFAULT; 1417 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1418 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1419 goto out; 1420 vcpu->arch.cpuid_nent = cpuid->nent; 1421 kvm_apic_set_version(vcpu); 1422 return 0; 1423 1424 out: 1425 return r; 1426 } 1427 1428 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1429 struct kvm_cpuid2 *cpuid, 1430 struct kvm_cpuid_entry2 __user *entries) 1431 { 1432 int r; 1433 1434 r = -E2BIG; 1435 if (cpuid->nent < vcpu->arch.cpuid_nent) 1436 goto out; 1437 r = -EFAULT; 
1438 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1439 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1440 goto out; 1441 return 0; 1442 1443 out: 1444 cpuid->nent = vcpu->arch.cpuid_nent; 1445 return r; 1446 } 1447 1448 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1449 u32 index) 1450 { 1451 entry->function = function; 1452 entry->index = index; 1453 cpuid_count(entry->function, entry->index, 1454 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1455 entry->flags = 0; 1456 } 1457 1458 #define F(x) bit(X86_FEATURE_##x) 1459 1460 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1461 u32 index, int *nent, int maxnent) 1462 { 1463 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1464 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; 1465 #ifdef CONFIG_X86_64 1466 unsigned f_lm = F(LM); 1467 #else 1468 unsigned f_lm = 0; 1469 #endif 1470 1471 /* cpuid 1.edx */ 1472 const u32 kvm_supported_word0_x86_features = 1473 F(FPU) | F(VME) | F(DE) | F(PSE) | 1474 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1475 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 1476 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1477 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 1478 0 /* Reserved, DS, ACPI */ | F(MMX) | 1479 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 1480 0 /* HTT, TM, Reserved, PBE */; 1481 /* cpuid 0x80000001.edx */ 1482 const u32 kvm_supported_word1_x86_features = 1483 F(FPU) | F(VME) | F(DE) | F(PSE) | 1484 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1485 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 1486 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1487 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1488 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1489 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1490 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1491 /* cpuid 1.ecx */ 1492 const u32 kvm_supported_word4_x86_features = 1493 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1494 0 /* DS-CPL, VMX, SMX, EST */ | 1495 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1496 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1497 0 /* Reserved, DCA */ | F(XMM4_1) | 1498 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1499 0 /* Reserved, XSAVE, OSXSAVE */; 1500 /* cpuid 0x80000001.ecx */ 1501 const u32 kvm_supported_word6_x86_features = 1502 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1503 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1504 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 1505 0 /* SKINIT */ | 0 /* WDT */; 1506 1507 /* all calls to cpuid_count() should be made on the same cpu */ 1508 get_cpu(); 1509 do_cpuid_1_ent(entry, function, index); 1510 ++*nent; 1511 1512 switch (function) { 1513 case 0: 1514 entry->eax = min(entry->eax, (u32)0xb); 1515 break; 1516 case 1: 1517 entry->edx &= kvm_supported_word0_x86_features; 1518 entry->ecx &= kvm_supported_word4_x86_features; 1519 /* we support x2apic emulation even if host does not support 1520 * it since we emulate x2apic in software */ 1521 entry->ecx |= F(X2APIC); 1522 break; 1523 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1524 * may return different values. 
This forces us to get_cpu() before 1525 * issuing the first command, and also to emulate this annoying behavior 1526 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1527 case 2: { 1528 int t, times = entry->eax & 0xff; 1529 1530 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1531 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1532 for (t = 1; t < times && *nent < maxnent; ++t) { 1533 do_cpuid_1_ent(&entry[t], function, 0); 1534 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1535 ++*nent; 1536 } 1537 break; 1538 } 1539 /* function 4 and 0xb have additional index. */ 1540 case 4: { 1541 int i, cache_type; 1542 1543 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1544 /* read more entries until cache_type is zero */ 1545 for (i = 1; *nent < maxnent; ++i) { 1546 cache_type = entry[i - 1].eax & 0x1f; 1547 if (!cache_type) 1548 break; 1549 do_cpuid_1_ent(&entry[i], function, i); 1550 entry[i].flags |= 1551 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1552 ++*nent; 1553 } 1554 break; 1555 } 1556 case 0xb: { 1557 int i, level_type; 1558 1559 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1560 /* read more entries until level_type is zero */ 1561 for (i = 1; *nent < maxnent; ++i) { 1562 level_type = entry[i - 1].ecx & 0xff00; 1563 if (!level_type) 1564 break; 1565 do_cpuid_1_ent(&entry[i], function, i); 1566 entry[i].flags |= 1567 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1568 ++*nent; 1569 } 1570 break; 1571 } 1572 case 0x80000000: 1573 entry->eax = min(entry->eax, 0x8000001a); 1574 break; 1575 case 0x80000001: 1576 entry->edx &= kvm_supported_word1_x86_features; 1577 entry->ecx &= kvm_supported_word6_x86_features; 1578 break; 1579 } 1580 put_cpu(); 1581 } 1582 1583 #undef F 1584 1585 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1586 struct kvm_cpuid_entry2 __user *entries) 1587 { 1588 struct kvm_cpuid_entry2 *cpuid_entries; 1589 int limit, nent = 0, r = -E2BIG; 1590 u32 func; 1591 1592 if (cpuid->nent < 1) 1593 goto out; 1594 r = -ENOMEM; 1595 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1596 if (!cpuid_entries) 1597 goto out; 1598 1599 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1600 limit = cpuid_entries[0].eax; 1601 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1602 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1603 &nent, cpuid->nent); 1604 r = -E2BIG; 1605 if (nent >= cpuid->nent) 1606 goto out_free; 1607 1608 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1609 limit = cpuid_entries[nent - 1].eax; 1610 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1611 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1612 &nent, cpuid->nent); 1613 r = -E2BIG; 1614 if (nent >= cpuid->nent) 1615 goto out_free; 1616 1617 r = -EFAULT; 1618 if (copy_to_user(entries, cpuid_entries, 1619 nent * sizeof(struct kvm_cpuid_entry2))) 1620 goto out_free; 1621 cpuid->nent = nent; 1622 r = 0; 1623 1624 out_free: 1625 vfree(cpuid_entries); 1626 out: 1627 return r; 1628 } 1629 1630 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1631 struct kvm_lapic_state *s) 1632 { 1633 vcpu_load(vcpu); 1634 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1635 vcpu_put(vcpu); 1636 1637 return 0; 1638 } 1639 1640 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1641 struct kvm_lapic_state *s) 1642 { 1643 vcpu_load(vcpu); 1644 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1645 kvm_apic_post_state_restore(vcpu); 1646 update_cr8_intercept(vcpu); 1647 vcpu_put(vcpu); 1648 1649 return 0; 1650 } 
1651 1652 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1653 struct kvm_interrupt *irq) 1654 { 1655 if (irq->irq < 0 || irq->irq >= 256) 1656 return -EINVAL; 1657 if (irqchip_in_kernel(vcpu->kvm)) 1658 return -ENXIO; 1659 vcpu_load(vcpu); 1660 1661 kvm_queue_interrupt(vcpu, irq->irq, false); 1662 1663 vcpu_put(vcpu); 1664 1665 return 0; 1666 } 1667 1668 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1669 { 1670 vcpu_load(vcpu); 1671 kvm_inject_nmi(vcpu); 1672 vcpu_put(vcpu); 1673 1674 return 0; 1675 } 1676 1677 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1678 struct kvm_tpr_access_ctl *tac) 1679 { 1680 if (tac->flags) 1681 return -EINVAL; 1682 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1683 return 0; 1684 } 1685 1686 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 1687 u64 mcg_cap) 1688 { 1689 int r; 1690 unsigned bank_num = mcg_cap & 0xff, bank; 1691 1692 r = -EINVAL; 1693 if (!bank_num) 1694 goto out; 1695 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 1696 goto out; 1697 r = 0; 1698 vcpu->arch.mcg_cap = mcg_cap; 1699 /* Init IA32_MCG_CTL to all 1s */ 1700 if (mcg_cap & MCG_CTL_P) 1701 vcpu->arch.mcg_ctl = ~(u64)0; 1702 /* Init IA32_MCi_CTL to all 1s */ 1703 for (bank = 0; bank < bank_num; bank++) 1704 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 1705 out: 1706 return r; 1707 } 1708 1709 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 1710 struct kvm_x86_mce *mce) 1711 { 1712 u64 mcg_cap = vcpu->arch.mcg_cap; 1713 unsigned bank_num = mcg_cap & 0xff; 1714 u64 *banks = vcpu->arch.mce_banks; 1715 1716 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 1717 return -EINVAL; 1718 /* 1719 * if IA32_MCG_CTL is not all 1s, the uncorrected error 1720 * reporting is disabled 1721 */ 1722 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 1723 vcpu->arch.mcg_ctl != ~(u64)0) 1724 return 0; 1725 banks += 4 * mce->bank; 1726 /* 1727 * if IA32_MCi_CTL is not all 1s, the uncorrected error 1728 * reporting is disabled for the bank 1729 */ 1730 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 1731 return 0; 1732 if (mce->status & MCI_STATUS_UC) { 1733 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 1734 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 1735 printk(KERN_DEBUG "kvm: set_mce: " 1736 "injects mce exception while " 1737 "previous one is in progress!\n"); 1738 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 1739 return 0; 1740 } 1741 if (banks[1] & MCI_STATUS_VAL) 1742 mce->status |= MCI_STATUS_OVER; 1743 banks[2] = mce->addr; 1744 banks[3] = mce->misc; 1745 vcpu->arch.mcg_status = mce->mcg_status; 1746 banks[1] = mce->status; 1747 kvm_queue_exception(vcpu, MC_VECTOR); 1748 } else if (!(banks[1] & MCI_STATUS_VAL) 1749 || !(banks[1] & MCI_STATUS_UC)) { 1750 if (banks[1] & MCI_STATUS_VAL) 1751 mce->status |= MCI_STATUS_OVER; 1752 banks[2] = mce->addr; 1753 banks[3] = mce->misc; 1754 banks[1] = mce->status; 1755 } else 1756 banks[1] |= MCI_STATUS_OVER; 1757 return 0; 1758 } 1759 1760 long kvm_arch_vcpu_ioctl(struct file *filp, 1761 unsigned int ioctl, unsigned long arg) 1762 { 1763 struct kvm_vcpu *vcpu = filp->private_data; 1764 void __user *argp = (void __user *)arg; 1765 int r; 1766 struct kvm_lapic_state *lapic = NULL; 1767 1768 switch (ioctl) { 1769 case KVM_GET_LAPIC: { 1770 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1771 1772 r = -ENOMEM; 1773 if (!lapic) 1774 goto out; 1775 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1776 if (r) 1777 goto out; 1778 r = -EFAULT; 1779 if 
(copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1780 goto out; 1781 r = 0; 1782 break; 1783 } 1784 case KVM_SET_LAPIC: { 1785 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1786 r = -ENOMEM; 1787 if (!lapic) 1788 goto out; 1789 r = -EFAULT; 1790 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1791 goto out; 1792 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1793 if (r) 1794 goto out; 1795 r = 0; 1796 break; 1797 } 1798 case KVM_INTERRUPT: { 1799 struct kvm_interrupt irq; 1800 1801 r = -EFAULT; 1802 if (copy_from_user(&irq, argp, sizeof irq)) 1803 goto out; 1804 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1805 if (r) 1806 goto out; 1807 r = 0; 1808 break; 1809 } 1810 case KVM_NMI: { 1811 r = kvm_vcpu_ioctl_nmi(vcpu); 1812 if (r) 1813 goto out; 1814 r = 0; 1815 break; 1816 } 1817 case KVM_SET_CPUID: { 1818 struct kvm_cpuid __user *cpuid_arg = argp; 1819 struct kvm_cpuid cpuid; 1820 1821 r = -EFAULT; 1822 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1823 goto out; 1824 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1825 if (r) 1826 goto out; 1827 break; 1828 } 1829 case KVM_SET_CPUID2: { 1830 struct kvm_cpuid2 __user *cpuid_arg = argp; 1831 struct kvm_cpuid2 cpuid; 1832 1833 r = -EFAULT; 1834 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1835 goto out; 1836 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1837 cpuid_arg->entries); 1838 if (r) 1839 goto out; 1840 break; 1841 } 1842 case KVM_GET_CPUID2: { 1843 struct kvm_cpuid2 __user *cpuid_arg = argp; 1844 struct kvm_cpuid2 cpuid; 1845 1846 r = -EFAULT; 1847 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1848 goto out; 1849 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1850 cpuid_arg->entries); 1851 if (r) 1852 goto out; 1853 r = -EFAULT; 1854 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1855 goto out; 1856 r = 0; 1857 break; 1858 } 1859 case KVM_GET_MSRS: 1860 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1861 break; 1862 case KVM_SET_MSRS: 1863 r = msr_io(vcpu, argp, do_set_msr, 0); 1864 break; 1865 case KVM_TPR_ACCESS_REPORTING: { 1866 struct kvm_tpr_access_ctl tac; 1867 1868 r = -EFAULT; 1869 if (copy_from_user(&tac, argp, sizeof tac)) 1870 goto out; 1871 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1872 if (r) 1873 goto out; 1874 r = -EFAULT; 1875 if (copy_to_user(argp, &tac, sizeof tac)) 1876 goto out; 1877 r = 0; 1878 break; 1879 }; 1880 case KVM_SET_VAPIC_ADDR: { 1881 struct kvm_vapic_addr va; 1882 1883 r = -EINVAL; 1884 if (!irqchip_in_kernel(vcpu->kvm)) 1885 goto out; 1886 r = -EFAULT; 1887 if (copy_from_user(&va, argp, sizeof va)) 1888 goto out; 1889 r = 0; 1890 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1891 break; 1892 } 1893 case KVM_X86_SETUP_MCE: { 1894 u64 mcg_cap; 1895 1896 r = -EFAULT; 1897 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 1898 goto out; 1899 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 1900 break; 1901 } 1902 case KVM_X86_SET_MCE: { 1903 struct kvm_x86_mce mce; 1904 1905 r = -EFAULT; 1906 if (copy_from_user(&mce, argp, sizeof mce)) 1907 goto out; 1908 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 1909 break; 1910 } 1911 default: 1912 r = -EINVAL; 1913 } 1914 out: 1915 kfree(lapic); 1916 return r; 1917 } 1918 1919 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1920 { 1921 int ret; 1922 1923 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1924 return -1; 1925 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1926 return ret; 1927 } 1928 1929 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 1930 u64 
ident_addr) 1931 { 1932 kvm->arch.ept_identity_map_addr = ident_addr; 1933 return 0; 1934 } 1935 1936 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1937 u32 kvm_nr_mmu_pages) 1938 { 1939 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1940 return -EINVAL; 1941 1942 down_write(&kvm->slots_lock); 1943 spin_lock(&kvm->mmu_lock); 1944 1945 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1946 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1947 1948 spin_unlock(&kvm->mmu_lock); 1949 up_write(&kvm->slots_lock); 1950 return 0; 1951 } 1952 1953 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1954 { 1955 return kvm->arch.n_alloc_mmu_pages; 1956 } 1957 1958 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1959 { 1960 int i; 1961 struct kvm_mem_alias *alias; 1962 1963 for (i = 0; i < kvm->arch.naliases; ++i) { 1964 alias = &kvm->arch.aliases[i]; 1965 if (gfn >= alias->base_gfn 1966 && gfn < alias->base_gfn + alias->npages) 1967 return alias->target_gfn + gfn - alias->base_gfn; 1968 } 1969 return gfn; 1970 } 1971 1972 /* 1973 * Set a new alias region. Aliases map a portion of physical memory into 1974 * another portion. This is useful for memory windows, for example the PC 1975 * VGA region. 1976 */ 1977 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1978 struct kvm_memory_alias *alias) 1979 { 1980 int r, n; 1981 struct kvm_mem_alias *p; 1982 1983 r = -EINVAL; 1984 /* General sanity checks */ 1985 if (alias->memory_size & (PAGE_SIZE - 1)) 1986 goto out; 1987 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1988 goto out; 1989 if (alias->slot >= KVM_ALIAS_SLOTS) 1990 goto out; 1991 if (alias->guest_phys_addr + alias->memory_size 1992 < alias->guest_phys_addr) 1993 goto out; 1994 if (alias->target_phys_addr + alias->memory_size 1995 < alias->target_phys_addr) 1996 goto out; 1997 1998 down_write(&kvm->slots_lock); 1999 spin_lock(&kvm->mmu_lock); 2000 2001 p = &kvm->arch.aliases[alias->slot]; 2002 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2003 p->npages = alias->memory_size >> PAGE_SHIFT; 2004 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2005 2006 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2007 if (kvm->arch.aliases[n - 1].npages) 2008 break; 2009 kvm->arch.naliases = n; 2010 2011 spin_unlock(&kvm->mmu_lock); 2012 kvm_mmu_zap_all(kvm); 2013 2014 up_write(&kvm->slots_lock); 2015 2016 return 0; 2017 2018 out: 2019 return r; 2020 } 2021 2022 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2023 { 2024 int r; 2025 2026 r = 0; 2027 switch (chip->chip_id) { 2028 case KVM_IRQCHIP_PIC_MASTER: 2029 memcpy(&chip->chip.pic, 2030 &pic_irqchip(kvm)->pics[0], 2031 sizeof(struct kvm_pic_state)); 2032 break; 2033 case KVM_IRQCHIP_PIC_SLAVE: 2034 memcpy(&chip->chip.pic, 2035 &pic_irqchip(kvm)->pics[1], 2036 sizeof(struct kvm_pic_state)); 2037 break; 2038 case KVM_IRQCHIP_IOAPIC: 2039 memcpy(&chip->chip.ioapic, 2040 ioapic_irqchip(kvm), 2041 sizeof(struct kvm_ioapic_state)); 2042 break; 2043 default: 2044 r = -EINVAL; 2045 break; 2046 } 2047 return r; 2048 } 2049 2050 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2051 { 2052 int r; 2053 2054 r = 0; 2055 switch (chip->chip_id) { 2056 case KVM_IRQCHIP_PIC_MASTER: 2057 spin_lock(&pic_irqchip(kvm)->lock); 2058 memcpy(&pic_irqchip(kvm)->pics[0], 2059 &chip->chip.pic, 2060 sizeof(struct kvm_pic_state)); 2061 spin_unlock(&pic_irqchip(kvm)->lock); 2062 break; 2063 case KVM_IRQCHIP_PIC_SLAVE: 2064 spin_lock(&pic_irqchip(kvm)->lock); 2065 memcpy(&pic_irqchip(kvm)->pics[1], 
2066 &chip->chip.pic, 2067 sizeof(struct kvm_pic_state)); 2068 spin_unlock(&pic_irqchip(kvm)->lock); 2069 break; 2070 case KVM_IRQCHIP_IOAPIC: 2071 mutex_lock(&kvm->irq_lock); 2072 memcpy(ioapic_irqchip(kvm), 2073 &chip->chip.ioapic, 2074 sizeof(struct kvm_ioapic_state)); 2075 mutex_unlock(&kvm->irq_lock); 2076 break; 2077 default: 2078 r = -EINVAL; 2079 break; 2080 } 2081 kvm_pic_update_irq(pic_irqchip(kvm)); 2082 return r; 2083 } 2084 2085 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2086 { 2087 int r = 0; 2088 2089 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2090 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 2091 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2092 return r; 2093 } 2094 2095 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2096 { 2097 int r = 0; 2098 2099 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2100 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2101 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 2102 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2103 return r; 2104 } 2105 2106 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2107 { 2108 int r = 0; 2109 2110 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2111 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 2112 sizeof(ps->channels)); 2113 ps->flags = kvm->arch.vpit->pit_state.flags; 2114 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2115 return r; 2116 } 2117 2118 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2119 { 2120 int r = 0, start = 0; 2121 u32 prev_legacy, cur_legacy; 2122 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2123 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 2124 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 2125 if (!prev_legacy && cur_legacy) 2126 start = 1; 2127 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 2128 sizeof(kvm->arch.vpit->pit_state.channels)); 2129 kvm->arch.vpit->pit_state.flags = ps->flags; 2130 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 2131 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2132 return r; 2133 } 2134 2135 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 2136 struct kvm_reinject_control *control) 2137 { 2138 if (!kvm->arch.vpit) 2139 return -ENXIO; 2140 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2141 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2142 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2143 return 0; 2144 } 2145 2146 /* 2147 * Get (and clear) the dirty memory log for a memory slot. 2148 */ 2149 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2150 struct kvm_dirty_log *log) 2151 { 2152 int r; 2153 int n; 2154 struct kvm_memory_slot *memslot; 2155 int is_dirty = 0; 2156 2157 down_write(&kvm->slots_lock); 2158 2159 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2160 if (r) 2161 goto out; 2162 2163 /* If nothing is dirty, don't bother messing with page tables. 
*/ 2164 if (is_dirty) { 2165 spin_lock(&kvm->mmu_lock); 2166 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2167 spin_unlock(&kvm->mmu_lock); 2168 memslot = &kvm->memslots[log->slot]; 2169 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2170 memset(memslot->dirty_bitmap, 0, n); 2171 } 2172 r = 0; 2173 out: 2174 up_write(&kvm->slots_lock); 2175 return r; 2176 } 2177 2178 long kvm_arch_vm_ioctl(struct file *filp, 2179 unsigned int ioctl, unsigned long arg) 2180 { 2181 struct kvm *kvm = filp->private_data; 2182 void __user *argp = (void __user *)arg; 2183 int r = -EINVAL; 2184 /* 2185 * This union makes it completely explicit to gcc-3.x 2186 * that these two variables' stack usage should be 2187 * combined, not added together. 2188 */ 2189 union { 2190 struct kvm_pit_state ps; 2191 struct kvm_pit_state2 ps2; 2192 struct kvm_memory_alias alias; 2193 struct kvm_pit_config pit_config; 2194 } u; 2195 2196 switch (ioctl) { 2197 case KVM_SET_TSS_ADDR: 2198 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 2199 if (r < 0) 2200 goto out; 2201 break; 2202 case KVM_SET_IDENTITY_MAP_ADDR: { 2203 u64 ident_addr; 2204 2205 r = -EFAULT; 2206 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 2207 goto out; 2208 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 2209 if (r < 0) 2210 goto out; 2211 break; 2212 } 2213 case KVM_SET_MEMORY_REGION: { 2214 struct kvm_memory_region kvm_mem; 2215 struct kvm_userspace_memory_region kvm_userspace_mem; 2216 2217 r = -EFAULT; 2218 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2219 goto out; 2220 kvm_userspace_mem.slot = kvm_mem.slot; 2221 kvm_userspace_mem.flags = kvm_mem.flags; 2222 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 2223 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 2224 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 2225 if (r) 2226 goto out; 2227 break; 2228 } 2229 case KVM_SET_NR_MMU_PAGES: 2230 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2231 if (r) 2232 goto out; 2233 break; 2234 case KVM_GET_NR_MMU_PAGES: 2235 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2236 break; 2237 case KVM_SET_MEMORY_ALIAS: 2238 r = -EFAULT; 2239 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 2240 goto out; 2241 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 2242 if (r) 2243 goto out; 2244 break; 2245 case KVM_CREATE_IRQCHIP: 2246 r = -ENOMEM; 2247 kvm->arch.vpic = kvm_create_pic(kvm); 2248 if (kvm->arch.vpic) { 2249 r = kvm_ioapic_init(kvm); 2250 if (r) { 2251 kfree(kvm->arch.vpic); 2252 kvm->arch.vpic = NULL; 2253 goto out; 2254 } 2255 } else 2256 goto out; 2257 r = kvm_setup_default_irq_routing(kvm); 2258 if (r) { 2259 kfree(kvm->arch.vpic); 2260 kfree(kvm->arch.vioapic); 2261 goto out; 2262 } 2263 break; 2264 case KVM_CREATE_PIT: 2265 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2266 goto create_pit; 2267 case KVM_CREATE_PIT2: 2268 r = -EFAULT; 2269 if (copy_from_user(&u.pit_config, argp, 2270 sizeof(struct kvm_pit_config))) 2271 goto out; 2272 create_pit: 2273 down_write(&kvm->slots_lock); 2274 r = -EEXIST; 2275 if (kvm->arch.vpit) 2276 goto create_pit_unlock; 2277 r = -ENOMEM; 2278 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 2279 if (kvm->arch.vpit) 2280 r = 0; 2281 create_pit_unlock: 2282 up_write(&kvm->slots_lock); 2283 break; 2284 case KVM_IRQ_LINE_STATUS: 2285 case KVM_IRQ_LINE: { 2286 struct kvm_irq_level irq_event; 2287 2288 r = -EFAULT; 2289 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2290 goto out; 2291 if (irqchip_in_kernel(kvm)) { 2292 __s32 status; 2293 
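			/* irq_lock serializes injection into the in-kernel irqchip (PIC/IOAPIC) */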
mutex_lock(&kvm->irq_lock); 2294 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2295 irq_event.irq, irq_event.level); 2296 mutex_unlock(&kvm->irq_lock); 2297 if (ioctl == KVM_IRQ_LINE_STATUS) { 2298 irq_event.status = status; 2299 if (copy_to_user(argp, &irq_event, 2300 sizeof irq_event)) 2301 goto out; 2302 } 2303 r = 0; 2304 } 2305 break; 2306 } 2307 case KVM_GET_IRQCHIP: { 2308 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2309 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2310 2311 r = -ENOMEM; 2312 if (!chip) 2313 goto out; 2314 r = -EFAULT; 2315 if (copy_from_user(chip, argp, sizeof *chip)) 2316 goto get_irqchip_out; 2317 r = -ENXIO; 2318 if (!irqchip_in_kernel(kvm)) 2319 goto get_irqchip_out; 2320 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 2321 if (r) 2322 goto get_irqchip_out; 2323 r = -EFAULT; 2324 if (copy_to_user(argp, chip, sizeof *chip)) 2325 goto get_irqchip_out; 2326 r = 0; 2327 get_irqchip_out: 2328 kfree(chip); 2329 if (r) 2330 goto out; 2331 break; 2332 } 2333 case KVM_SET_IRQCHIP: { 2334 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2335 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2336 2337 r = -ENOMEM; 2338 if (!chip) 2339 goto out; 2340 r = -EFAULT; 2341 if (copy_from_user(chip, argp, sizeof *chip)) 2342 goto set_irqchip_out; 2343 r = -ENXIO; 2344 if (!irqchip_in_kernel(kvm)) 2345 goto set_irqchip_out; 2346 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 2347 if (r) 2348 goto set_irqchip_out; 2349 r = 0; 2350 set_irqchip_out: 2351 kfree(chip); 2352 if (r) 2353 goto out; 2354 break; 2355 } 2356 case KVM_GET_PIT: { 2357 r = -EFAULT; 2358 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 2359 goto out; 2360 r = -ENXIO; 2361 if (!kvm->arch.vpit) 2362 goto out; 2363 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 2364 if (r) 2365 goto out; 2366 r = -EFAULT; 2367 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 2368 goto out; 2369 r = 0; 2370 break; 2371 } 2372 case KVM_SET_PIT: { 2373 r = -EFAULT; 2374 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 2375 goto out; 2376 r = -ENXIO; 2377 if (!kvm->arch.vpit) 2378 goto out; 2379 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2380 if (r) 2381 goto out; 2382 r = 0; 2383 break; 2384 } 2385 case KVM_GET_PIT2: { 2386 r = -ENXIO; 2387 if (!kvm->arch.vpit) 2388 goto out; 2389 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 2390 if (r) 2391 goto out; 2392 r = -EFAULT; 2393 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 2394 goto out; 2395 r = 0; 2396 break; 2397 } 2398 case KVM_SET_PIT2: { 2399 r = -EFAULT; 2400 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 2401 goto out; 2402 r = -ENXIO; 2403 if (!kvm->arch.vpit) 2404 goto out; 2405 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 2406 if (r) 2407 goto out; 2408 r = 0; 2409 break; 2410 } 2411 case KVM_REINJECT_CONTROL: { 2412 struct kvm_reinject_control control; 2413 r = -EFAULT; 2414 if (copy_from_user(&control, argp, sizeof(control))) 2415 goto out; 2416 r = kvm_vm_ioctl_reinject(kvm, &control); 2417 if (r) 2418 goto out; 2419 r = 0; 2420 break; 2421 } 2422 default: 2423 ; 2424 } 2425 out: 2426 return r; 2427 } 2428 2429 static void kvm_init_msr_list(void) 2430 { 2431 u32 dummy[2]; 2432 unsigned i, j; 2433 2434 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2435 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2436 continue; 2437 if (j < i) 2438 msrs_to_save[j] = msrs_to_save[i]; 2439 j++; 2440 } 2441 num_msrs_to_save = j; 2442 } 2443 2444 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 2445 const void *v) 2446 { 2447 if 
(vcpu->arch.apic && 2448 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 2449 return 0; 2450 2451 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 2452 } 2453 2454 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 2455 { 2456 if (vcpu->arch.apic && 2457 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 2458 return 0; 2459 2460 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 2461 } 2462 2463 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2464 struct kvm_vcpu *vcpu) 2465 { 2466 void *data = val; 2467 int r = X86EMUL_CONTINUE; 2468 2469 while (bytes) { 2470 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2471 unsigned offset = addr & (PAGE_SIZE-1); 2472 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2473 int ret; 2474 2475 if (gpa == UNMAPPED_GVA) { 2476 r = X86EMUL_PROPAGATE_FAULT; 2477 goto out; 2478 } 2479 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2480 if (ret < 0) { 2481 r = X86EMUL_UNHANDLEABLE; 2482 goto out; 2483 } 2484 2485 bytes -= toread; 2486 data += toread; 2487 addr += toread; 2488 } 2489 out: 2490 return r; 2491 } 2492 2493 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2494 struct kvm_vcpu *vcpu) 2495 { 2496 void *data = val; 2497 int r = X86EMUL_CONTINUE; 2498 2499 while (bytes) { 2500 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2501 unsigned offset = addr & (PAGE_SIZE-1); 2502 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2503 int ret; 2504 2505 if (gpa == UNMAPPED_GVA) { 2506 r = X86EMUL_PROPAGATE_FAULT; 2507 goto out; 2508 } 2509 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2510 if (ret < 0) { 2511 r = X86EMUL_UNHANDLEABLE; 2512 goto out; 2513 } 2514 2515 bytes -= towrite; 2516 data += towrite; 2517 addr += towrite; 2518 } 2519 out: 2520 return r; 2521 } 2522 2523 2524 static int emulator_read_emulated(unsigned long addr, 2525 void *val, 2526 unsigned int bytes, 2527 struct kvm_vcpu *vcpu) 2528 { 2529 gpa_t gpa; 2530 2531 if (vcpu->mmio_read_completed) { 2532 memcpy(val, vcpu->mmio_data, bytes); 2533 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 2534 vcpu->mmio_phys_addr, *(u64 *)val); 2535 vcpu->mmio_read_completed = 0; 2536 return X86EMUL_CONTINUE; 2537 } 2538 2539 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2540 2541 /* For APIC access vmexit */ 2542 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2543 goto mmio; 2544 2545 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2546 == X86EMUL_CONTINUE) 2547 return X86EMUL_CONTINUE; 2548 if (gpa == UNMAPPED_GVA) 2549 return X86EMUL_PROPAGATE_FAULT; 2550 2551 mmio: 2552 /* 2553 * Is this MMIO handled locally? 
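	 * (i.e. by an in-kernel device on the mmio bus)?  If not, fill in the
	 * mmio fields below and exit to userspace to handle the access.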
2554 */ 2555 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 2556 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 2557 return X86EMUL_CONTINUE; 2558 } 2559 2560 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 2561 2562 vcpu->mmio_needed = 1; 2563 vcpu->mmio_phys_addr = gpa; 2564 vcpu->mmio_size = bytes; 2565 vcpu->mmio_is_write = 0; 2566 2567 return X86EMUL_UNHANDLEABLE; 2568 } 2569 2570 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2571 const void *val, int bytes) 2572 { 2573 int ret; 2574 2575 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2576 if (ret < 0) 2577 return 0; 2578 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2579 return 1; 2580 } 2581 2582 static int emulator_write_emulated_onepage(unsigned long addr, 2583 const void *val, 2584 unsigned int bytes, 2585 struct kvm_vcpu *vcpu) 2586 { 2587 gpa_t gpa; 2588 2589 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2590 2591 if (gpa == UNMAPPED_GVA) { 2592 kvm_inject_page_fault(vcpu, addr, 2); 2593 return X86EMUL_PROPAGATE_FAULT; 2594 } 2595 2596 /* For APIC access vmexit */ 2597 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2598 goto mmio; 2599 2600 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2601 return X86EMUL_CONTINUE; 2602 2603 mmio: 2604 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 2605 /* 2606 * Is this MMIO handled locally? 2607 */ 2608 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 2609 return X86EMUL_CONTINUE; 2610 2611 vcpu->mmio_needed = 1; 2612 vcpu->mmio_phys_addr = gpa; 2613 vcpu->mmio_size = bytes; 2614 vcpu->mmio_is_write = 1; 2615 memcpy(vcpu->mmio_data, val, bytes); 2616 2617 return X86EMUL_CONTINUE; 2618 } 2619 2620 int emulator_write_emulated(unsigned long addr, 2621 const void *val, 2622 unsigned int bytes, 2623 struct kvm_vcpu *vcpu) 2624 { 2625 /* Crossing a page boundary? 
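	 * If so, split the write at the boundary and emulate each page
	 * separately.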
*/ 2626 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2627 int rc, now; 2628 2629 now = -addr & ~PAGE_MASK; 2630 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2631 if (rc != X86EMUL_CONTINUE) 2632 return rc; 2633 addr += now; 2634 val += now; 2635 bytes -= now; 2636 } 2637 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2638 } 2639 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2640 2641 static int emulator_cmpxchg_emulated(unsigned long addr, 2642 const void *old, 2643 const void *new, 2644 unsigned int bytes, 2645 struct kvm_vcpu *vcpu) 2646 { 2647 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 2648 #ifndef CONFIG_X86_64 2649 /* guests cmpxchg8b have to be emulated atomically */ 2650 if (bytes == 8) { 2651 gpa_t gpa; 2652 struct page *page; 2653 char *kaddr; 2654 u64 val; 2655 2656 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2657 2658 if (gpa == UNMAPPED_GVA || 2659 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2660 goto emul_write; 2661 2662 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2663 goto emul_write; 2664 2665 val = *(u64 *)new; 2666 2667 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2668 2669 kaddr = kmap_atomic(page, KM_USER0); 2670 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2671 kunmap_atomic(kaddr, KM_USER0); 2672 kvm_release_page_dirty(page); 2673 } 2674 emul_write: 2675 #endif 2676 2677 return emulator_write_emulated(addr, new, bytes, vcpu); 2678 } 2679 2680 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2681 { 2682 return kvm_x86_ops->get_segment_base(vcpu, seg); 2683 } 2684 2685 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2686 { 2687 kvm_mmu_invlpg(vcpu, address); 2688 return X86EMUL_CONTINUE; 2689 } 2690 2691 int emulate_clts(struct kvm_vcpu *vcpu) 2692 { 2693 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2694 return X86EMUL_CONTINUE; 2695 } 2696 2697 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2698 { 2699 struct kvm_vcpu *vcpu = ctxt->vcpu; 2700 2701 switch (dr) { 2702 case 0 ... 3: 2703 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2704 return X86EMUL_CONTINUE; 2705 default: 2706 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2707 return X86EMUL_UNHANDLEABLE; 2708 } 2709 } 2710 2711 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2712 { 2713 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
						~0ULL : ~0U;
	int exception;

	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
	u8 opcodes[4];
	unsigned long rip = kvm_rip_read(vcpu);
	unsigned long rip_linear;

	if (!printk_ratelimit())
		return;

	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);

	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

static struct x86_emulate_ops emulate_ops = {
	.read_std = kvm_read_guest_virt,
	.read_emulated = emulator_read_emulated,
	.write_emulated = emulator_write_emulated,
	.cmpxchg_emulated = emulator_cmpxchg_emulated,
};

static void cache_all_regs(struct kvm_vcpu *vcpu)
{
	kvm_register_read(vcpu, VCPU_REGS_RAX);
	kvm_register_read(vcpu, VCPU_REGS_RSP);
	kvm_register_read(vcpu, VCPU_REGS_RIP);
	vcpu->arch.regs_dirty = ~0;
}

int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code,
			int emulation_type)
{
	int r, shadow_mask;
	struct decode_cache *c;

	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
	 * TODO: fix emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses, can save a hundred cycles
	 * on Intel for instructions that don't read/change RSP, for
	 * example.
	 */
	cache_all_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->arch.pio.string = 0;

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->arch.emulate_ctxt.vcpu = vcpu;
		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
		vcpu->arch.emulate_ctxt.mode =
			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 : cs_db
			?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2790 2791 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2792 2793 /* Only allow emulation of specific instructions on #UD 2794 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 2795 c = &vcpu->arch.emulate_ctxt.decode; 2796 if (emulation_type & EMULTYPE_TRAP_UD) { 2797 if (!c->twobyte) 2798 return EMULATE_FAIL; 2799 switch (c->b) { 2800 case 0x01: /* VMMCALL */ 2801 if (c->modrm_mod != 3 || c->modrm_rm != 1) 2802 return EMULATE_FAIL; 2803 break; 2804 case 0x34: /* sysenter */ 2805 case 0x35: /* sysexit */ 2806 if (c->modrm_mod != 0 || c->modrm_rm != 0) 2807 return EMULATE_FAIL; 2808 break; 2809 case 0x05: /* syscall */ 2810 if (c->modrm_mod != 0 || c->modrm_rm != 0) 2811 return EMULATE_FAIL; 2812 break; 2813 default: 2814 return EMULATE_FAIL; 2815 } 2816 2817 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 2818 return EMULATE_FAIL; 2819 } 2820 2821 ++vcpu->stat.insn_emulation; 2822 if (r) { 2823 ++vcpu->stat.insn_emulation_fail; 2824 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2825 return EMULATE_DONE; 2826 return EMULATE_FAIL; 2827 } 2828 } 2829 2830 if (emulation_type & EMULTYPE_SKIP) { 2831 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 2832 return EMULATE_DONE; 2833 } 2834 2835 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2836 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 2837 2838 if (r == 0) 2839 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 2840 2841 if (vcpu->arch.pio.string) 2842 return EMULATE_DO_MMIO; 2843 2844 if ((r || vcpu->mmio_is_write) && run) { 2845 run->exit_reason = KVM_EXIT_MMIO; 2846 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2847 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2848 run->mmio.len = vcpu->mmio_size; 2849 run->mmio.is_write = vcpu->mmio_is_write; 2850 } 2851 2852 if (r) { 2853 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2854 return EMULATE_DONE; 2855 if (!vcpu->mmio_needed) { 2856 kvm_report_emulation_failure(vcpu, "mmio"); 2857 return EMULATE_FAIL; 2858 } 2859 return EMULATE_DO_MMIO; 2860 } 2861 2862 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2863 2864 if (vcpu->mmio_is_write) { 2865 vcpu->mmio_needed = 0; 2866 return EMULATE_DO_MMIO; 2867 } 2868 2869 return EMULATE_DONE; 2870 } 2871 EXPORT_SYMBOL_GPL(emulate_instruction); 2872 2873 static int pio_copy_data(struct kvm_vcpu *vcpu) 2874 { 2875 void *p = vcpu->arch.pio_data; 2876 gva_t q = vcpu->arch.pio.guest_gva; 2877 unsigned bytes; 2878 int ret; 2879 2880 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2881 if (vcpu->arch.pio.in) 2882 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2883 else 2884 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2885 return ret; 2886 } 2887 2888 int complete_pio(struct kvm_vcpu *vcpu) 2889 { 2890 struct kvm_pio_request *io = &vcpu->arch.pio; 2891 long delta; 2892 int r; 2893 unsigned long val; 2894 2895 if (!io->string) { 2896 if (io->in) { 2897 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2898 memcpy(&val, vcpu->arch.pio_data, io->size); 2899 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2900 } 2901 } else { 2902 if (io->in) { 2903 r = pio_copy_data(vcpu); 2904 if (r) 2905 return r; 2906 } 2907 2908 delta = 1; 2909 if (io->rep) { 2910 delta *= io->cur_count; 2911 /* 2912 * The size of the register should really depend on 2913 * current address size. 
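			 * (for now the full RCX is read and written back).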
2914 */ 2915 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2916 val -= delta; 2917 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2918 } 2919 if (io->down) 2920 delta = -delta; 2921 delta *= io->size; 2922 if (io->in) { 2923 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2924 val += delta; 2925 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2926 } else { 2927 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2928 val += delta; 2929 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2930 } 2931 } 2932 2933 io->count -= io->cur_count; 2934 io->cur_count = 0; 2935 2936 return 0; 2937 } 2938 2939 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 2940 { 2941 /* TODO: String I/O for in kernel device */ 2942 int r; 2943 2944 if (vcpu->arch.pio.in) 2945 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 2946 vcpu->arch.pio.size, pd); 2947 else 2948 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 2949 vcpu->arch.pio.size, pd); 2950 return r; 2951 } 2952 2953 static int pio_string_write(struct kvm_vcpu *vcpu) 2954 { 2955 struct kvm_pio_request *io = &vcpu->arch.pio; 2956 void *pd = vcpu->arch.pio_data; 2957 int i, r = 0; 2958 2959 for (i = 0; i < io->cur_count; i++) { 2960 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 2961 io->port, io->size, pd)) { 2962 r = -EOPNOTSUPP; 2963 break; 2964 } 2965 pd += io->size; 2966 } 2967 return r; 2968 } 2969 2970 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2971 int size, unsigned port) 2972 { 2973 unsigned long val; 2974 2975 vcpu->run->exit_reason = KVM_EXIT_IO; 2976 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2977 vcpu->run->io.size = vcpu->arch.pio.size = size; 2978 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2979 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2980 vcpu->run->io.port = vcpu->arch.pio.port = port; 2981 vcpu->arch.pio.in = in; 2982 vcpu->arch.pio.string = 0; 2983 vcpu->arch.pio.down = 0; 2984 vcpu->arch.pio.rep = 0; 2985 2986 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 2987 size, 1); 2988 2989 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2990 memcpy(vcpu->arch.pio_data, &val, 4); 2991 2992 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 2993 complete_pio(vcpu); 2994 return 1; 2995 } 2996 return 0; 2997 } 2998 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2999 3000 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3001 int size, unsigned long count, int down, 3002 gva_t address, int rep, unsigned port) 3003 { 3004 unsigned now, in_page; 3005 int ret = 0; 3006 3007 vcpu->run->exit_reason = KVM_EXIT_IO; 3008 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3009 vcpu->run->io.size = vcpu->arch.pio.size = size; 3010 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3011 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 3012 vcpu->run->io.port = vcpu->arch.pio.port = port; 3013 vcpu->arch.pio.in = in; 3014 vcpu->arch.pio.string = 1; 3015 vcpu->arch.pio.down = down; 3016 vcpu->arch.pio.rep = rep; 3017 3018 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3019 size, count); 3020 3021 if (!count) { 3022 kvm_x86_ops->skip_emulated_instruction(vcpu); 3023 return 1; 3024 } 3025 3026 if (!down) 3027 in_page = PAGE_SIZE - offset_in_page(address); 3028 else 3029 in_page = offset_in_page(address) + size; 3030 now = min(count, (unsigned long)in_page / size); 3031 if (!now) 3032 now = 1; 3033 if (down) { 3034 /* 3035 * String I/O in reverse. Yuck. 
Kill the guest, fix later.
		 */
		pr_unimpl(vcpu, "guest string pio down\n");
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	vcpu->run->io.count = now;
	vcpu->arch.pio.cur_count = now;

	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
		kvm_x86_ops->skip_emulated_instruction(vcpu);

	vcpu->arch.pio.guest_gva = address;

	if (!vcpu->arch.pio.in) {
		/* string PIO write */
		ret = pio_copy_data(vcpu);
		if (ret == X86EMUL_PROPAGATE_FAULT) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		if (ret == 0 && !pio_string_write(vcpu)) {
			complete_pio(vcpu);
			if (vcpu->arch.pio.count == 0)
				ret = 1;
		}
	}
	/* no string PIO read support yet */

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);

static void bounce_off(void *info)
{
	/* nothing */
}

static unsigned int ref_freq;
static unsigned long tsc_khz_ref;

static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				     void *data)
{
	struct cpufreq_freqs *freq = data;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i, send_ipi = 0;

	if (!ref_freq)
		ref_freq = freq->old;

	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
		return 0;
	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
		return 0;
	per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->cpu != freq->cpu)
				continue;
			if (!kvm_request_guest_time_update(vcpu))
				continue;
			if (vcpu->cpu != smp_processor_id())
				send_ipi++;
		}
	}
	spin_unlock(&kvm_lock);

	if (freq->old < freq->new && send_ipi) {
		/*
		 * We upscale the frequency.  Make sure the guest does not
		 * see old kvmclock values while running with the new
		 * frequency; otherwise we risk that the guest sees time
		 * go backwards.
		 *
		 * In case we update the frequency for another cpu
		 * (which might be in guest context) send an interrupt
		 * to kick the cpu out of guest context.  Next time
		 * guest context is entered kvmclock will be updated,
		 * so the guest will not see stale values.
3118 */ 3119 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 3120 } 3121 return 0; 3122 } 3123 3124 static struct notifier_block kvmclock_cpufreq_notifier_block = { 3125 .notifier_call = kvmclock_cpufreq_notifier 3126 }; 3127 3128 int kvm_arch_init(void *opaque) 3129 { 3130 int r, cpu; 3131 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3132 3133 if (kvm_x86_ops) { 3134 printk(KERN_ERR "kvm: already loaded the other module\n"); 3135 r = -EEXIST; 3136 goto out; 3137 } 3138 3139 if (!ops->cpu_has_kvm_support()) { 3140 printk(KERN_ERR "kvm: no hardware support\n"); 3141 r = -EOPNOTSUPP; 3142 goto out; 3143 } 3144 if (ops->disabled_by_bios()) { 3145 printk(KERN_ERR "kvm: disabled by bios\n"); 3146 r = -EOPNOTSUPP; 3147 goto out; 3148 } 3149 3150 r = kvm_mmu_module_init(); 3151 if (r) 3152 goto out; 3153 3154 kvm_init_msr_list(); 3155 3156 kvm_x86_ops = ops; 3157 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 3158 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 3159 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3160 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3161 3162 for_each_possible_cpu(cpu) 3163 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 3164 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 3165 tsc_khz_ref = tsc_khz; 3166 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 3167 CPUFREQ_TRANSITION_NOTIFIER); 3168 } 3169 3170 return 0; 3171 3172 out: 3173 return r; 3174 } 3175 3176 void kvm_arch_exit(void) 3177 { 3178 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 3179 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 3180 CPUFREQ_TRANSITION_NOTIFIER); 3181 kvm_x86_ops = NULL; 3182 kvm_mmu_module_exit(); 3183 } 3184 3185 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3186 { 3187 ++vcpu->stat.halt_exits; 3188 if (irqchip_in_kernel(vcpu->kvm)) { 3189 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3190 return 1; 3191 } else { 3192 vcpu->run->exit_reason = KVM_EXIT_HLT; 3193 return 0; 3194 } 3195 } 3196 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 3197 3198 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 3199 unsigned long a1) 3200 { 3201 if (is_long_mode(vcpu)) 3202 return a0; 3203 else 3204 return a0 | ((gpa_t)a1 << 32); 3205 } 3206 3207 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3208 { 3209 unsigned long nr, a0, a1, a2, a3, ret; 3210 int r = 1; 3211 3212 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3213 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3214 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3215 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3216 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3217 3218 trace_kvm_hypercall(nr, a0, a1, a2, a3); 3219 3220 if (!is_long_mode(vcpu)) { 3221 nr &= 0xFFFFFFFF; 3222 a0 &= 0xFFFFFFFF; 3223 a1 &= 0xFFFFFFFF; 3224 a2 &= 0xFFFFFFFF; 3225 a3 &= 0xFFFFFFFF; 3226 } 3227 3228 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 3229 ret = -KVM_EPERM; 3230 goto out; 3231 } 3232 3233 switch (nr) { 3234 case KVM_HC_VAPIC_POLL_IRQ: 3235 ret = 0; 3236 break; 3237 case KVM_HC_MMU_OP: 3238 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 3239 break; 3240 default: 3241 ret = -KVM_ENOSYS; 3242 break; 3243 } 3244 out: 3245 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3246 ++vcpu->stat.hypercalls; 3247 return r; 3248 } 3249 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 3250 3251 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3252 { 3253 char instruction[3]; 3254 int ret = 0; 3255 unsigned long rip = kvm_rip_read(vcpu); 3256 3257 3258 /* 3259 * Blow out the MMU to ensure that no other VCPU has an active mapping 3260 * to ensure that the updated hypercall 
appears atomically across all 3261 * VCPUs. 3262 */ 3263 kvm_mmu_zap_all(vcpu->kvm); 3264 3265 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3266 if (emulator_write_emulated(rip, instruction, 3, vcpu) 3267 != X86EMUL_CONTINUE) 3268 ret = -EFAULT; 3269 3270 return ret; 3271 } 3272 3273 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3274 { 3275 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 3276 } 3277 3278 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3279 { 3280 struct descriptor_table dt = { limit, base }; 3281 3282 kvm_x86_ops->set_gdt(vcpu, &dt); 3283 } 3284 3285 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3286 { 3287 struct descriptor_table dt = { limit, base }; 3288 3289 kvm_x86_ops->set_idt(vcpu, &dt); 3290 } 3291 3292 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 3293 unsigned long *rflags) 3294 { 3295 kvm_lmsw(vcpu, msw); 3296 *rflags = kvm_x86_ops->get_rflags(vcpu); 3297 } 3298 3299 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3300 { 3301 unsigned long value; 3302 3303 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3304 switch (cr) { 3305 case 0: 3306 value = vcpu->arch.cr0; 3307 break; 3308 case 2: 3309 value = vcpu->arch.cr2; 3310 break; 3311 case 3: 3312 value = vcpu->arch.cr3; 3313 break; 3314 case 4: 3315 value = vcpu->arch.cr4; 3316 break; 3317 case 8: 3318 value = kvm_get_cr8(vcpu); 3319 break; 3320 default: 3321 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3322 return 0; 3323 } 3324 3325 return value; 3326 } 3327 3328 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3329 unsigned long *rflags) 3330 { 3331 switch (cr) { 3332 case 0: 3333 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3334 *rflags = kvm_x86_ops->get_rflags(vcpu); 3335 break; 3336 case 2: 3337 vcpu->arch.cr2 = val; 3338 break; 3339 case 3: 3340 kvm_set_cr3(vcpu, val); 3341 break; 3342 case 4: 3343 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 3344 break; 3345 case 8: 3346 kvm_set_cr8(vcpu, val & 0xfUL); 3347 break; 3348 default: 3349 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3350 } 3351 } 3352 3353 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 3354 { 3355 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 3356 int j, nent = vcpu->arch.cpuid_nent; 3357 3358 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 3359 /* when no next entry is found, the current entry[i] is reselected */ 3360 for (j = i + 1; ; j = (j + 1) % nent) { 3361 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 3362 if (ej->function == e->function) { 3363 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 3364 return j; 3365 } 3366 } 3367 return 0; /* silence gcc, even though control never reaches here */ 3368 } 3369 3370 /* find an entry with matching function, matching index (if needed), and that 3371 * should be read next (if it's stateful) */ 3372 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3373 u32 function, u32 index) 3374 { 3375 if (e->function != function) 3376 return 0; 3377 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3378 return 0; 3379 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3380 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3381 return 0; 3382 return 1; 3383 } 3384 3385 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3386 u32 function, u32 index) 3387 { 3388 int i; 3389 struct kvm_cpuid_entry2 *best = NULL; 3390 3391 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3392 struct 
kvm_cpuid_entry2 *e; 3393 3394 e = &vcpu->arch.cpuid_entries[i]; 3395 if (is_matching_cpuid_entry(e, function, index)) { 3396 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 3397 move_to_next_stateful_cpuid_entry(vcpu, i); 3398 best = e; 3399 break; 3400 } 3401 /* 3402 * Both basic or both extended? 3403 */ 3404 if (((e->function ^ function) & 0x80000000) == 0) 3405 if (!best || e->function > best->function) 3406 best = e; 3407 } 3408 return best; 3409 } 3410 3411 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3412 { 3413 struct kvm_cpuid_entry2 *best; 3414 3415 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3416 if (best) 3417 return best->eax & 0xff; 3418 return 36; 3419 } 3420 3421 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3422 { 3423 u32 function, index; 3424 struct kvm_cpuid_entry2 *best; 3425 3426 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3427 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3428 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3429 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3430 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3431 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3432 best = kvm_find_cpuid_entry(vcpu, function, index); 3433 if (best) { 3434 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3435 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3436 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3437 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3438 } 3439 kvm_x86_ops->skip_emulated_instruction(vcpu); 3440 trace_kvm_cpuid(function, 3441 kvm_register_read(vcpu, VCPU_REGS_RAX), 3442 kvm_register_read(vcpu, VCPU_REGS_RBX), 3443 kvm_register_read(vcpu, VCPU_REGS_RCX), 3444 kvm_register_read(vcpu, VCPU_REGS_RDX)); 3445 } 3446 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3447 3448 /* 3449 * Check if userspace requested an interrupt window, and that the 3450 * interrupt window is open. 3451 * 3452 * No need to exit to userspace if we already have an interrupt queued. 
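 * A queued interrupt will be injected on the next guest entry anyway.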
3453 */ 3454 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3455 struct kvm_run *kvm_run) 3456 { 3457 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3458 kvm_run->request_interrupt_window && 3459 kvm_arch_interrupt_allowed(vcpu)); 3460 } 3461 3462 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3463 struct kvm_run *kvm_run) 3464 { 3465 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3466 kvm_run->cr8 = kvm_get_cr8(vcpu); 3467 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3468 if (irqchip_in_kernel(vcpu->kvm)) 3469 kvm_run->ready_for_interrupt_injection = 1; 3470 else 3471 kvm_run->ready_for_interrupt_injection = 3472 kvm_arch_interrupt_allowed(vcpu) && 3473 !kvm_cpu_has_interrupt(vcpu) && 3474 !kvm_event_needs_reinjection(vcpu); 3475 } 3476 3477 static void vapic_enter(struct kvm_vcpu *vcpu) 3478 { 3479 struct kvm_lapic *apic = vcpu->arch.apic; 3480 struct page *page; 3481 3482 if (!apic || !apic->vapic_addr) 3483 return; 3484 3485 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3486 3487 vcpu->arch.apic->vapic_page = page; 3488 } 3489 3490 static void vapic_exit(struct kvm_vcpu *vcpu) 3491 { 3492 struct kvm_lapic *apic = vcpu->arch.apic; 3493 3494 if (!apic || !apic->vapic_addr) 3495 return; 3496 3497 down_read(&vcpu->kvm->slots_lock); 3498 kvm_release_page_dirty(apic->vapic_page); 3499 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3500 up_read(&vcpu->kvm->slots_lock); 3501 } 3502 3503 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3504 { 3505 int max_irr, tpr; 3506 3507 if (!kvm_x86_ops->update_cr8_intercept) 3508 return; 3509 3510 if (!vcpu->arch.apic) 3511 return; 3512 3513 if (!vcpu->arch.apic->vapic_addr) 3514 max_irr = kvm_lapic_find_highest_irr(vcpu); 3515 else 3516 max_irr = -1; 3517 3518 if (max_irr != -1) 3519 max_irr >>= 4; 3520 3521 tpr = kvm_lapic_get_cr8(vcpu); 3522 3523 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3524 } 3525 3526 static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3527 { 3528 /* try to reinject previous events if any */ 3529 if (vcpu->arch.exception.pending) { 3530 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 3531 vcpu->arch.exception.has_error_code, 3532 vcpu->arch.exception.error_code); 3533 return; 3534 } 3535 3536 if (vcpu->arch.nmi_injected) { 3537 kvm_x86_ops->set_nmi(vcpu); 3538 return; 3539 } 3540 3541 if (vcpu->arch.interrupt.pending) { 3542 kvm_x86_ops->set_irq(vcpu); 3543 return; 3544 } 3545 3546 /* try to inject new event if pending */ 3547 if (vcpu->arch.nmi_pending) { 3548 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3549 vcpu->arch.nmi_pending = false; 3550 vcpu->arch.nmi_injected = true; 3551 kvm_x86_ops->set_nmi(vcpu); 3552 } 3553 } else if (kvm_cpu_has_interrupt(vcpu)) { 3554 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3555 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3556 false); 3557 kvm_x86_ops->set_irq(vcpu); 3558 } 3559 } 3560 } 3561 3562 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3563 { 3564 int r; 3565 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3566 kvm_run->request_interrupt_window; 3567 3568 if (vcpu->requests) 3569 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3570 kvm_mmu_unload(vcpu); 3571 3572 r = kvm_mmu_reload(vcpu); 3573 if (unlikely(r)) 3574 goto out; 3575 3576 if (vcpu->requests) { 3577 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3578 __kvm_migrate_timers(vcpu); 3579 if 
(test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3580 kvm_write_guest_time(vcpu); 3581 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3582 kvm_mmu_sync_roots(vcpu); 3583 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3584 kvm_x86_ops->tlb_flush(vcpu); 3585 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3586 &vcpu->requests)) { 3587 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3588 r = 0; 3589 goto out; 3590 } 3591 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3592 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3593 r = 0; 3594 goto out; 3595 } 3596 } 3597 3598 preempt_disable(); 3599 3600 kvm_x86_ops->prepare_guest_switch(vcpu); 3601 kvm_load_guest_fpu(vcpu); 3602 3603 local_irq_disable(); 3604 3605 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3606 smp_mb__after_clear_bit(); 3607 3608 if (vcpu->requests || need_resched() || signal_pending(current)) { 3609 set_bit(KVM_REQ_KICK, &vcpu->requests); 3610 local_irq_enable(); 3611 preempt_enable(); 3612 r = 1; 3613 goto out; 3614 } 3615 3616 inject_pending_event(vcpu, kvm_run); 3617 3618 /* enable NMI/IRQ window open exits if needed */ 3619 if (vcpu->arch.nmi_pending) 3620 kvm_x86_ops->enable_nmi_window(vcpu); 3621 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3622 kvm_x86_ops->enable_irq_window(vcpu); 3623 3624 if (kvm_lapic_enabled(vcpu)) { 3625 update_cr8_intercept(vcpu); 3626 kvm_lapic_sync_to_vapic(vcpu); 3627 } 3628 3629 up_read(&vcpu->kvm->slots_lock); 3630 3631 kvm_guest_enter(); 3632 3633 if (unlikely(vcpu->arch.switch_db_regs)) { 3634 set_debugreg(0, 7); 3635 set_debugreg(vcpu->arch.eff_db[0], 0); 3636 set_debugreg(vcpu->arch.eff_db[1], 1); 3637 set_debugreg(vcpu->arch.eff_db[2], 2); 3638 set_debugreg(vcpu->arch.eff_db[3], 3); 3639 } 3640 3641 trace_kvm_entry(vcpu->vcpu_id); 3642 kvm_x86_ops->run(vcpu, kvm_run); 3643 3644 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 3645 set_debugreg(current->thread.debugreg0, 0); 3646 set_debugreg(current->thread.debugreg1, 1); 3647 set_debugreg(current->thread.debugreg2, 2); 3648 set_debugreg(current->thread.debugreg3, 3); 3649 set_debugreg(current->thread.debugreg6, 6); 3650 set_debugreg(current->thread.debugreg7, 7); 3651 } 3652 3653 set_bit(KVM_REQ_KICK, &vcpu->requests); 3654 local_irq_enable(); 3655 3656 ++vcpu->stat.exits; 3657 3658 /* 3659 * We must have an instruction between local_irq_enable() and 3660 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3661 * the interrupt shadow. The stat.exits increment will do nicely. 
3662 * But we need to prevent reordering, hence this barrier(): 3663 */ 3664 barrier(); 3665 3666 kvm_guest_exit(); 3667 3668 preempt_enable(); 3669 3670 down_read(&vcpu->kvm->slots_lock); 3671 3672 /* 3673 * Profile KVM exit RIPs: 3674 */ 3675 if (unlikely(prof_on == KVM_PROFILING)) { 3676 unsigned long rip = kvm_rip_read(vcpu); 3677 profile_hit(KVM_PROFILING, (void *)rip); 3678 } 3679 3680 3681 kvm_lapic_sync_from_vapic(vcpu); 3682 3683 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3684 out: 3685 return r; 3686 } 3687 3688 3689 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3690 { 3691 int r; 3692 3693 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3694 pr_debug("vcpu %d received sipi with vector # %x\n", 3695 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3696 kvm_lapic_reset(vcpu); 3697 r = kvm_arch_vcpu_reset(vcpu); 3698 if (r) 3699 return r; 3700 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3701 } 3702 3703 down_read(&vcpu->kvm->slots_lock); 3704 vapic_enter(vcpu); 3705 3706 r = 1; 3707 while (r > 0) { 3708 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3709 r = vcpu_enter_guest(vcpu, kvm_run); 3710 else { 3711 up_read(&vcpu->kvm->slots_lock); 3712 kvm_vcpu_block(vcpu); 3713 down_read(&vcpu->kvm->slots_lock); 3714 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3715 { 3716 switch(vcpu->arch.mp_state) { 3717 case KVM_MP_STATE_HALTED: 3718 vcpu->arch.mp_state = 3719 KVM_MP_STATE_RUNNABLE; 3720 case KVM_MP_STATE_RUNNABLE: 3721 break; 3722 case KVM_MP_STATE_SIPI_RECEIVED: 3723 default: 3724 r = -EINTR; 3725 break; 3726 } 3727 } 3728 } 3729 3730 if (r <= 0) 3731 break; 3732 3733 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3734 if (kvm_cpu_has_pending_timer(vcpu)) 3735 kvm_inject_pending_timer_irqs(vcpu); 3736 3737 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3738 r = -EINTR; 3739 kvm_run->exit_reason = KVM_EXIT_INTR; 3740 ++vcpu->stat.request_irq_exits; 3741 } 3742 if (signal_pending(current)) { 3743 r = -EINTR; 3744 kvm_run->exit_reason = KVM_EXIT_INTR; 3745 ++vcpu->stat.signal_exits; 3746 } 3747 if (need_resched()) { 3748 up_read(&vcpu->kvm->slots_lock); 3749 kvm_resched(vcpu); 3750 down_read(&vcpu->kvm->slots_lock); 3751 } 3752 } 3753 3754 up_read(&vcpu->kvm->slots_lock); 3755 post_kvm_run_save(vcpu, kvm_run); 3756 3757 vapic_exit(vcpu); 3758 3759 return r; 3760 } 3761 3762 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3763 { 3764 int r; 3765 sigset_t sigsaved; 3766 3767 vcpu_load(vcpu); 3768 3769 if (vcpu->sigset_active) 3770 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3771 3772 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3773 kvm_vcpu_block(vcpu); 3774 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3775 r = -EAGAIN; 3776 goto out; 3777 } 3778 3779 /* re-sync apic's tpr */ 3780 if (!irqchip_in_kernel(vcpu->kvm)) 3781 kvm_set_cr8(vcpu, kvm_run->cr8); 3782 3783 if (vcpu->arch.pio.cur_count) { 3784 r = complete_pio(vcpu); 3785 if (r) 3786 goto out; 3787 } 3788 #if CONFIG_HAS_IOMEM 3789 if (vcpu->mmio_needed) { 3790 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3791 vcpu->mmio_read_completed = 1; 3792 vcpu->mmio_needed = 0; 3793 3794 down_read(&vcpu->kvm->slots_lock); 3795 r = emulate_instruction(vcpu, kvm_run, 3796 vcpu->arch.mmio_fault_cr2, 0, 3797 EMULTYPE_NO_DECODE); 3798 up_read(&vcpu->kvm->slots_lock); 3799 if (r == EMULATE_DO_MMIO) { 3800 /* 3801 * Read-modify-write. Back to userspace. 
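			 * The instruction also has another MMIO access to
			 * perform, so hand it back to userspace.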
3802 */ 3803 r = 0; 3804 goto out; 3805 } 3806 } 3807 #endif 3808 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3809 kvm_register_write(vcpu, VCPU_REGS_RAX, 3810 kvm_run->hypercall.ret); 3811 3812 r = __vcpu_run(vcpu, kvm_run); 3813 3814 out: 3815 if (vcpu->sigset_active) 3816 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3817 3818 vcpu_put(vcpu); 3819 return r; 3820 } 3821 3822 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3823 { 3824 vcpu_load(vcpu); 3825 3826 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3827 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3828 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3829 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3830 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3831 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3832 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3833 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3834 #ifdef CONFIG_X86_64 3835 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3836 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3837 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3838 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3839 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3840 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3841 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3842 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3843 #endif 3844 3845 regs->rip = kvm_rip_read(vcpu); 3846 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3847 3848 /* 3849 * Don't leak debug flags in case they were set for guest debugging 3850 */ 3851 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3852 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3853 3854 vcpu_put(vcpu); 3855 3856 return 0; 3857 } 3858 3859 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3860 { 3861 vcpu_load(vcpu); 3862 3863 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3864 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3865 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3866 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3867 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3868 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3869 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3870 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3871 #ifdef CONFIG_X86_64 3872 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3873 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3874 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3875 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3876 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3877 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3878 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3879 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3880 3881 #endif 3882 3883 kvm_rip_write(vcpu, regs->rip); 3884 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3885 3886 3887 vcpu->arch.exception.pending = false; 3888 3889 vcpu_put(vcpu); 3890 3891 return 0; 3892 } 3893 3894 void kvm_get_segment(struct kvm_vcpu *vcpu, 3895 struct kvm_segment *var, int seg) 3896 { 3897 kvm_x86_ops->get_segment(vcpu, var, seg); 3898 } 3899 3900 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3901 { 3902 struct kvm_segment cs; 3903 3904 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3905 *db = cs.db; 3906 *l = cs.l; 3907 } 3908 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3909 3910 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3911 struct kvm_sregs 
*sregs) 3912 { 3913 struct descriptor_table dt; 3914 3915 vcpu_load(vcpu); 3916 3917 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3918 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3919 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3920 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3921 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3922 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3923 3924 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3925 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3926 3927 kvm_x86_ops->get_idt(vcpu, &dt); 3928 sregs->idt.limit = dt.limit; 3929 sregs->idt.base = dt.base; 3930 kvm_x86_ops->get_gdt(vcpu, &dt); 3931 sregs->gdt.limit = dt.limit; 3932 sregs->gdt.base = dt.base; 3933 3934 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3935 sregs->cr0 = vcpu->arch.cr0; 3936 sregs->cr2 = vcpu->arch.cr2; 3937 sregs->cr3 = vcpu->arch.cr3; 3938 sregs->cr4 = vcpu->arch.cr4; 3939 sregs->cr8 = kvm_get_cr8(vcpu); 3940 sregs->efer = vcpu->arch.shadow_efer; 3941 sregs->apic_base = kvm_get_apic_base(vcpu); 3942 3943 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 3944 3945 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 3946 set_bit(vcpu->arch.interrupt.nr, 3947 (unsigned long *)sregs->interrupt_bitmap); 3948 3949 vcpu_put(vcpu); 3950 3951 return 0; 3952 } 3953 3954 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3955 struct kvm_mp_state *mp_state) 3956 { 3957 vcpu_load(vcpu); 3958 mp_state->mp_state = vcpu->arch.mp_state; 3959 vcpu_put(vcpu); 3960 return 0; 3961 } 3962 3963 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3964 struct kvm_mp_state *mp_state) 3965 { 3966 vcpu_load(vcpu); 3967 vcpu->arch.mp_state = mp_state->mp_state; 3968 vcpu_put(vcpu); 3969 return 0; 3970 } 3971 3972 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3973 struct kvm_segment *var, int seg) 3974 { 3975 kvm_x86_ops->set_segment(vcpu, var, seg); 3976 } 3977 3978 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3979 struct kvm_segment *kvm_desct) 3980 { 3981 kvm_desct->base = get_desc_base(seg_desc); 3982 kvm_desct->limit = get_desc_limit(seg_desc); 3983 if (seg_desc->g) { 3984 kvm_desct->limit <<= 12; 3985 kvm_desct->limit |= 0xfff; 3986 } 3987 kvm_desct->selector = selector; 3988 kvm_desct->type = seg_desc->type; 3989 kvm_desct->present = seg_desc->p; 3990 kvm_desct->dpl = seg_desc->dpl; 3991 kvm_desct->db = seg_desc->d; 3992 kvm_desct->s = seg_desc->s; 3993 kvm_desct->l = seg_desc->l; 3994 kvm_desct->g = seg_desc->g; 3995 kvm_desct->avl = seg_desc->avl; 3996 if (!selector) 3997 kvm_desct->unusable = 1; 3998 else 3999 kvm_desct->unusable = 0; 4000 kvm_desct->padding = 0; 4001 } 4002 4003 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 4004 u16 selector, 4005 struct descriptor_table *dtable) 4006 { 4007 if (selector & 1 << 2) { 4008 struct kvm_segment kvm_seg; 4009 4010 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 4011 4012 if (kvm_seg.unusable) 4013 dtable->limit = 0; 4014 else 4015 dtable->limit = kvm_seg.limit; 4016 dtable->base = kvm_seg.base; 4017 } 4018 else 4019 kvm_x86_ops->get_gdt(vcpu, dtable); 4020 } 4021 4022 /* allowed just for 8 bytes segments */ 4023 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4024 struct desc_struct *seg_desc) 4025 { 4026 struct descriptor_table dtable; 4027 u16 index = selector >> 3; 4028 4029 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4030 4031 if (dtable.limit < index * 8 + 7) { 4032 
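		/* Descriptor lies outside the table limit: #GP with the faulting selector. */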
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4033 return 1; 4034 } 4035 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4036 } 4037 4038 /* allowed just for 8 bytes segments */ 4039 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4040 struct desc_struct *seg_desc) 4041 { 4042 struct descriptor_table dtable; 4043 u16 index = selector >> 3; 4044 4045 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4046 4047 if (dtable.limit < index * 8 + 7) 4048 return 1; 4049 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4050 } 4051 4052 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 4053 struct desc_struct *seg_desc) 4054 { 4055 u32 base_addr = get_desc_base(seg_desc); 4056 4057 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4058 } 4059 4060 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4061 { 4062 struct kvm_segment kvm_seg; 4063 4064 kvm_get_segment(vcpu, &kvm_seg, seg); 4065 return kvm_seg.selector; 4066 } 4067 4068 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 4069 u16 selector, 4070 struct kvm_segment *kvm_seg) 4071 { 4072 struct desc_struct seg_desc; 4073 4074 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 4075 return 1; 4076 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 4077 return 0; 4078 } 4079 4080 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4081 { 4082 struct kvm_segment segvar = { 4083 .base = selector << 4, 4084 .limit = 0xffff, 4085 .selector = selector, 4086 .type = 3, 4087 .present = 1, 4088 .dpl = 3, 4089 .db = 0, 4090 .s = 1, 4091 .l = 0, 4092 .g = 0, 4093 .avl = 0, 4094 .unusable = 0, 4095 }; 4096 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4097 return 0; 4098 } 4099 4100 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4101 { 4102 return (seg != VCPU_SREG_LDTR) && 4103 (seg != VCPU_SREG_TR) && 4104 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4105 } 4106 4107 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4108 int type_bits, int seg) 4109 { 4110 struct kvm_segment kvm_seg; 4111 4112 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4113 return kvm_load_realmode_segment(vcpu, selector, seg); 4114 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4115 return 1; 4116 kvm_seg.type |= type_bits; 4117 4118 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4119 seg != VCPU_SREG_LDTR) 4120 if (!kvm_seg.s) 4121 kvm_seg.unusable = 1; 4122 4123 kvm_set_segment(vcpu, &kvm_seg, seg); 4124 return 0; 4125 } 4126 4127 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4128 struct tss_segment_32 *tss) 4129 { 4130 tss->cr3 = vcpu->arch.cr3; 4131 tss->eip = kvm_rip_read(vcpu); 4132 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4133 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4134 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4135 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4136 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4137 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4138 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4139 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4140 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4141 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4142 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4143 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4144 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 
4145 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 4146 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 4147 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4148 } 4149 4150 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4151 struct tss_segment_32 *tss) 4152 { 4153 kvm_set_cr3(vcpu, tss->cr3); 4154 4155 kvm_rip_write(vcpu, tss->eip); 4156 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4157 4158 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4159 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4160 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 4161 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 4162 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 4163 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 4164 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4165 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4166 4167 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4168 return 1; 4169 4170 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4171 return 1; 4172 4173 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4174 return 1; 4175 4176 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4177 return 1; 4178 4179 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4180 return 1; 4181 4182 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4183 return 1; 4184 4185 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4186 return 1; 4187 return 0; 4188 } 4189 4190 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 4191 struct tss_segment_16 *tss) 4192 { 4193 tss->ip = kvm_rip_read(vcpu); 4194 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4195 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4196 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4197 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4198 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4199 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4200 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4201 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 4202 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 4203 4204 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4205 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4206 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4207 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4208 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4209 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 4210 } 4211 4212 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4213 struct tss_segment_16 *tss) 4214 { 4215 kvm_rip_write(vcpu, tss->ip); 4216 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4217 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4218 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4219 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4220 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 4221 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 4222 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 4223 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 4224 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 4225 4226 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 4227 return 1; 4228 4229 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4230 return 1; 4231 4232 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4233 return 1; 4234 4235 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4236 return 1; 4237 4238 if 
(kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4239 return 1; 4240 return 0; 4241 } 4242 4243 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 4244 u16 old_tss_sel, u32 old_tss_base, 4245 struct desc_struct *nseg_desc) 4246 { 4247 struct tss_segment_16 tss_segment_16; 4248 int ret = 0; 4249 4250 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 4251 sizeof tss_segment_16)) 4252 goto out; 4253 4254 save_state_to_tss16(vcpu, &tss_segment_16); 4255 4256 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 4257 sizeof tss_segment_16)) 4258 goto out; 4259 4260 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 4261 &tss_segment_16, sizeof tss_segment_16)) 4262 goto out; 4263 4264 if (old_tss_sel != 0xffff) { 4265 tss_segment_16.prev_task_link = old_tss_sel; 4266 4267 if (kvm_write_guest(vcpu->kvm, 4268 get_tss_base_addr(vcpu, nseg_desc), 4269 &tss_segment_16.prev_task_link, 4270 sizeof tss_segment_16.prev_task_link)) 4271 goto out; 4272 } 4273 4274 if (load_state_from_tss16(vcpu, &tss_segment_16)) 4275 goto out; 4276 4277 ret = 1; 4278 out: 4279 return ret; 4280 } 4281 4282 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 4283 u16 old_tss_sel, u32 old_tss_base, 4284 struct desc_struct *nseg_desc) 4285 { 4286 struct tss_segment_32 tss_segment_32; 4287 int ret = 0; 4288 4289 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 4290 sizeof tss_segment_32)) 4291 goto out; 4292 4293 save_state_to_tss32(vcpu, &tss_segment_32); 4294 4295 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 4296 sizeof tss_segment_32)) 4297 goto out; 4298 4299 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 4300 &tss_segment_32, sizeof tss_segment_32)) 4301 goto out; 4302 4303 if (old_tss_sel != 0xffff) { 4304 tss_segment_32.prev_task_link = old_tss_sel; 4305 4306 if (kvm_write_guest(vcpu->kvm, 4307 get_tss_base_addr(vcpu, nseg_desc), 4308 &tss_segment_32.prev_task_link, 4309 sizeof tss_segment_32.prev_task_link)) 4310 goto out; 4311 } 4312 4313 if (load_state_from_tss32(vcpu, &tss_segment_32)) 4314 goto out; 4315 4316 ret = 1; 4317 out: 4318 return ret; 4319 } 4320 4321 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 4322 { 4323 struct kvm_segment tr_seg; 4324 struct desc_struct cseg_desc; 4325 struct desc_struct nseg_desc; 4326 int ret = 0; 4327 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 4328 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 4329 4330 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 4331 4332 /* FIXME: Handle errors. Failure to read either TSS or their 4333 * descriptors should generate a pagefault. 
	 */
	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
		goto out;

	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
		goto out;

	if (reason != TASK_SWITCH_IRET) {
		int cpl;

		cpl = kvm_x86_ops->get_cpl(vcpu);
		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
	}

	if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
		return 1;
	}

	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
		cseg_desc.type &= ~(1 << 1); /* clear the B flag */
		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
	}

	if (reason == TASK_SWITCH_IRET) {
		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
	}

	/* set back link to prev task only if NT bit is set in eflags;
	   note that old_tss_sel is not used after this point */
	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
		old_tss_sel = 0xffff;

	if (nseg_desc.type & 8)
		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
					 old_tss_base, &nseg_desc);
	else
		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
					 old_tss_base, &nseg_desc);

	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
		kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
	}

	if (reason != TASK_SWITCH_IRET) {
		nseg_desc.type |= (1 << 1);
		save_guest_segment_descriptor(vcpu, tss_selector,
					      &nseg_desc);
	}

	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
	tr_seg.type = 11;
	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);

int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int mmu_reset_needed = 0;
	int pending_vec, max_bits;
	struct descriptor_table dt;

	vcpu_load(vcpu);

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_x86_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_x86_ops->set_gdt(vcpu, &dt);

	vcpu->arch.cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
	vcpu->arch.cr3 = sregs->cr3;

	kvm_set_cr8(vcpu, sregs->cr8);

	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
	kvm_x86_ops->set_efer(vcpu, sregs->efer);
	kvm_set_apic_base(vcpu, sregs->apic_base);

	kvm_x86_ops->decache_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
	vcpu->arch.cr0 = sregs->cr0;

	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->arch.cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
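	/*
	 * Requeue any external interrupt that userspace left pending in
	 * sregs->interrupt_bitmap: find the lowest set bit and, if one
	 * exists, inject that vector again.  With an in-kernel PIC the
	 * ISR ack state is cleared as well so the interrupt is not lost.
	 */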
pending_vec = find_first_bit( 4445 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 4446 if (pending_vec < max_bits) { 4447 kvm_queue_interrupt(vcpu, pending_vec, false); 4448 pr_debug("Set back pending irq %d\n", pending_vec); 4449 if (irqchip_in_kernel(vcpu->kvm)) 4450 kvm_pic_clear_isr_ack(vcpu->kvm); 4451 } 4452 4453 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4454 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4455 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4456 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4457 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4458 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4459 4460 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4461 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4462 4463 update_cr8_intercept(vcpu); 4464 4465 /* Older userspace won't unhalt the vcpu on reset. */ 4466 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 4467 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4468 !(vcpu->arch.cr0 & X86_CR0_PE)) 4469 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4470 4471 vcpu_put(vcpu); 4472 4473 return 0; 4474 } 4475 4476 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4477 struct kvm_guest_debug *dbg) 4478 { 4479 int i, r; 4480 4481 vcpu_load(vcpu); 4482 4483 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4484 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4485 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4486 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4487 vcpu->arch.switch_db_regs = 4488 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4489 } else { 4490 for (i = 0; i < KVM_NR_DB_REGS; i++) 4491 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4492 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4493 } 4494 4495 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4496 4497 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4498 kvm_queue_exception(vcpu, DB_VECTOR); 4499 else if (dbg->control & KVM_GUESTDBG_INJECT_BP) 4500 kvm_queue_exception(vcpu, BP_VECTOR); 4501 4502 vcpu_put(vcpu); 4503 4504 return r; 4505 } 4506 4507 /* 4508 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 4509 * we have asm/x86/processor.h 4510 */ 4511 struct fxsave { 4512 u16 cwd; 4513 u16 swd; 4514 u16 twd; 4515 u16 fop; 4516 u64 rip; 4517 u64 rdp; 4518 u32 mxcsr; 4519 u32 mxcsr_mask; 4520 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4521 #ifdef CONFIG_X86_64 4522 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4523 #else 4524 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4525 #endif 4526 }; 4527 4528 /* 4529 * Translate a guest virtual address to a guest physical address. 
4530 */ 4531 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 4532 struct kvm_translation *tr) 4533 { 4534 unsigned long vaddr = tr->linear_address; 4535 gpa_t gpa; 4536 4537 vcpu_load(vcpu); 4538 down_read(&vcpu->kvm->slots_lock); 4539 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 4540 up_read(&vcpu->kvm->slots_lock); 4541 tr->physical_address = gpa; 4542 tr->valid = gpa != UNMAPPED_GVA; 4543 tr->writeable = 1; 4544 tr->usermode = 0; 4545 vcpu_put(vcpu); 4546 4547 return 0; 4548 } 4549 4550 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4551 { 4552 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4553 4554 vcpu_load(vcpu); 4555 4556 memcpy(fpu->fpr, fxsave->st_space, 128); 4557 fpu->fcw = fxsave->cwd; 4558 fpu->fsw = fxsave->swd; 4559 fpu->ftwx = fxsave->twd; 4560 fpu->last_opcode = fxsave->fop; 4561 fpu->last_ip = fxsave->rip; 4562 fpu->last_dp = fxsave->rdp; 4563 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 4564 4565 vcpu_put(vcpu); 4566 4567 return 0; 4568 } 4569 4570 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4571 { 4572 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4573 4574 vcpu_load(vcpu); 4575 4576 memcpy(fxsave->st_space, fpu->fpr, 128); 4577 fxsave->cwd = fpu->fcw; 4578 fxsave->swd = fpu->fsw; 4579 fxsave->twd = fpu->ftwx; 4580 fxsave->fop = fpu->last_opcode; 4581 fxsave->rip = fpu->last_ip; 4582 fxsave->rdp = fpu->last_dp; 4583 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 4584 4585 vcpu_put(vcpu); 4586 4587 return 0; 4588 } 4589 4590 void fx_init(struct kvm_vcpu *vcpu) 4591 { 4592 unsigned after_mxcsr_mask; 4593 4594 /* 4595 * Touch the fpu the first time in non atomic context as if 4596 * this is the first fpu instruction the exception handler 4597 * will fire before the instruction returns and it'll have to 4598 * allocate ram with GFP_KERNEL. 
4599 */ 4600 if (!used_math()) 4601 kvm_fx_save(&vcpu->arch.host_fx_image); 4602 4603 /* Initialize guest FPU by resetting ours and saving into guest's */ 4604 preempt_disable(); 4605 kvm_fx_save(&vcpu->arch.host_fx_image); 4606 kvm_fx_finit(); 4607 kvm_fx_save(&vcpu->arch.guest_fx_image); 4608 kvm_fx_restore(&vcpu->arch.host_fx_image); 4609 preempt_enable(); 4610 4611 vcpu->arch.cr0 |= X86_CR0_ET; 4612 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 4613 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 4614 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 4615 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 4616 } 4617 EXPORT_SYMBOL_GPL(fx_init); 4618 4619 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 4620 { 4621 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 4622 return; 4623 4624 vcpu->guest_fpu_loaded = 1; 4625 kvm_fx_save(&vcpu->arch.host_fx_image); 4626 kvm_fx_restore(&vcpu->arch.guest_fx_image); 4627 } 4628 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 4629 4630 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 4631 { 4632 if (!vcpu->guest_fpu_loaded) 4633 return; 4634 4635 vcpu->guest_fpu_loaded = 0; 4636 kvm_fx_save(&vcpu->arch.guest_fx_image); 4637 kvm_fx_restore(&vcpu->arch.host_fx_image); 4638 ++vcpu->stat.fpu_reload; 4639 } 4640 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 4641 4642 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 4643 { 4644 if (vcpu->arch.time_page) { 4645 kvm_release_page_dirty(vcpu->arch.time_page); 4646 vcpu->arch.time_page = NULL; 4647 } 4648 4649 kvm_x86_ops->vcpu_free(vcpu); 4650 } 4651 4652 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 4653 unsigned int id) 4654 { 4655 return kvm_x86_ops->vcpu_create(kvm, id); 4656 } 4657 4658 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 4659 { 4660 int r; 4661 4662 /* We do fxsave: this must be aligned. 
*/ 4663 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 4664 4665 vcpu->arch.mtrr_state.have_fixed = 1; 4666 vcpu_load(vcpu); 4667 r = kvm_arch_vcpu_reset(vcpu); 4668 if (r == 0) 4669 r = kvm_mmu_setup(vcpu); 4670 vcpu_put(vcpu); 4671 if (r < 0) 4672 goto free_vcpu; 4673 4674 return 0; 4675 free_vcpu: 4676 kvm_x86_ops->vcpu_free(vcpu); 4677 return r; 4678 } 4679 4680 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 4681 { 4682 vcpu_load(vcpu); 4683 kvm_mmu_unload(vcpu); 4684 vcpu_put(vcpu); 4685 4686 kvm_x86_ops->vcpu_free(vcpu); 4687 } 4688 4689 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4690 { 4691 vcpu->arch.nmi_pending = false; 4692 vcpu->arch.nmi_injected = false; 4693 4694 vcpu->arch.switch_db_regs = 0; 4695 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 4696 vcpu->arch.dr6 = DR6_FIXED_1; 4697 vcpu->arch.dr7 = DR7_FIXED_1; 4698 4699 return kvm_x86_ops->vcpu_reset(vcpu); 4700 } 4701 4702 void kvm_arch_hardware_enable(void *garbage) 4703 { 4704 kvm_x86_ops->hardware_enable(garbage); 4705 } 4706 4707 void kvm_arch_hardware_disable(void *garbage) 4708 { 4709 kvm_x86_ops->hardware_disable(garbage); 4710 } 4711 4712 int kvm_arch_hardware_setup(void) 4713 { 4714 return kvm_x86_ops->hardware_setup(); 4715 } 4716 4717 void kvm_arch_hardware_unsetup(void) 4718 { 4719 kvm_x86_ops->hardware_unsetup(); 4720 } 4721 4722 void kvm_arch_check_processor_compat(void *rtn) 4723 { 4724 kvm_x86_ops->check_processor_compatibility(rtn); 4725 } 4726 4727 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 4728 { 4729 struct page *page; 4730 struct kvm *kvm; 4731 int r; 4732 4733 BUG_ON(vcpu->kvm == NULL); 4734 kvm = vcpu->kvm; 4735 4736 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4737 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 4738 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4739 else 4740 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 4741 4742 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 4743 if (!page) { 4744 r = -ENOMEM; 4745 goto fail; 4746 } 4747 vcpu->arch.pio_data = page_address(page); 4748 4749 r = kvm_mmu_create(vcpu); 4750 if (r < 0) 4751 goto fail_free_pio_data; 4752 4753 if (irqchip_in_kernel(kvm)) { 4754 r = kvm_create_lapic(vcpu); 4755 if (r < 0) 4756 goto fail_mmu_destroy; 4757 } 4758 4759 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 4760 GFP_KERNEL); 4761 if (!vcpu->arch.mce_banks) { 4762 r = -ENOMEM; 4763 goto fail_mmu_destroy; 4764 } 4765 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 4766 4767 return 0; 4768 4769 fail_mmu_destroy: 4770 kvm_mmu_destroy(vcpu); 4771 fail_free_pio_data: 4772 free_page((unsigned long)vcpu->arch.pio_data); 4773 fail: 4774 return r; 4775 } 4776 4777 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 4778 { 4779 kvm_free_lapic(vcpu); 4780 down_read(&vcpu->kvm->slots_lock); 4781 kvm_mmu_destroy(vcpu); 4782 up_read(&vcpu->kvm->slots_lock); 4783 free_page((unsigned long)vcpu->arch.pio_data); 4784 } 4785 4786 struct kvm *kvm_arch_create_vm(void) 4787 { 4788 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 4789 4790 if (!kvm) 4791 return ERR_PTR(-ENOMEM); 4792 4793 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4794 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4795 4796 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4797 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 4798 4799 rdtscll(kvm->arch.vm_init_tsc); 4800 4801 return kvm; 4802 } 4803 4804 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 4805 { 4806 vcpu_load(vcpu); 4807 kvm_mmu_unload(vcpu); 4808 vcpu_put(vcpu); 4809 } 
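
/*
 * Illustrative sketch, not part of the original file: roughly how the
 * generic KVM code (virt/kvm/kvm_main.c) is expected to drive the arch
 * hooks defined above when userspace issues KVM_CREATE_VCPU.  The helper
 * name and the exact error handling are assumptions for illustration.
 */
#if 0
static struct kvm_vcpu *example_create_vcpu(struct kvm *kvm, unsigned int id)
{
	struct kvm_vcpu *vcpu;
	int r;

	vcpu = kvm_arch_vcpu_create(kvm, id);	/* allocate and init vcpu state */
	if (IS_ERR(vcpu))
		return vcpu;

	r = kvm_arch_vcpu_setup(vcpu);		/* reset vcpu and set up its MMU */
	if (r) {
		kvm_arch_vcpu_destroy(vcpu);	/* unload MMU and free the vcpu */
		return ERR_PTR(r);
	}
	return vcpu;
}
#endif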

static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;
	struct kvm_vcpu *vcpu;

	/*
	 * Unpin any mmu pages first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_unload_vcpu_mmu(vcpu);
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_arch_vcpu_free(vcpu);

	mutex_lock(&kvm->lock);
	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
		kvm->vcpus[i] = NULL;

	atomic_set(&kvm->online_vcpus, 0);
	mutex_unlock(&kvm->lock);
}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kvm_free_pit(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
	kfree(kvm);
}

int kvm_arch_set_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem,
				struct kvm_memory_slot old,
				int user_alloc)
{
	int npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

	/* To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	spin_lock(&kvm->mmu_lock);
	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	spin_unlock(&kvm->mmu_lock);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending ||
		(kvm_arch_interrupt_allowed(vcpu) &&
		 kvm_cpu_has_interrupt(vcpu));
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
			smp_send_reschedule(cpu);
	put_cpu();
}
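
/*
 * Illustrative sketch, not part of the original file: the usual pattern
 * around kvm_vcpu_kick() is to record work in vcpu->requests first and
 * then kick, so a vcpu currently executing guest code exits and notices
 * the request on its next pass through the run loop.  The helper name
 * and the choice of KVM_REQ_TLB_FLUSH are assumptions for illustration.
 */
#if 0
static void example_request_and_kick(struct kvm_vcpu *vcpu)
{
	/* record the request before kicking the vcpu out of guest mode */
	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
	kvm_vcpu_kick(vcpu);
}
#endif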
4941 4942 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 4943 { 4944 return kvm_x86_ops->interrupt_allowed(vcpu); 4945 } 4946 4947 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 4948 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 4949 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 4950 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 4951 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 4952