1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * derived from drivers/kvm/kvm_main.c 5 * 6 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc. 8 * Copyright IBM Corporation, 2008 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Amit Shah <amit.shah@qumranet.com> 14 * Ben-Ami Yassour <benami@il.ibm.com> 15 * 16 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * the COPYING file in the top-level directory. 18 * 19 */ 20 21 #include <linux/kvm_host.h> 22 #include "irq.h" 23 #include "mmu.h" 24 #include "i8254.h" 25 #include "tss.h" 26 #include "kvm_cache_regs.h" 27 #include "x86.h" 28 29 #include <linux/clocksource.h> 30 #include <linux/interrupt.h> 31 #include <linux/kvm.h> 32 #include <linux/fs.h> 33 #include <linux/vmalloc.h> 34 #include <linux/module.h> 35 #include <linux/mman.h> 36 #include <linux/highmem.h> 37 #include <linux/iommu.h> 38 #include <linux/intel-iommu.h> 39 #include <linux/cpufreq.h> 40 #include <linux/user-return-notifier.h> 41 #include <trace/events/kvm.h> 42 #undef TRACE_INCLUDE_FILE 43 #define CREATE_TRACE_POINTS 44 #include "trace.h" 45 46 #include <asm/debugreg.h> 47 #include <asm/uaccess.h> 48 #include <asm/msr.h> 49 #include <asm/desc.h> 50 #include <asm/mtrr.h> 51 #include <asm/mce.h> 52 53 #define MAX_IO_MSRS 256 54 #define CR0_RESERVED_BITS \ 55 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 56 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 57 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 58 #define CR4_RESERVED_BITS \ 59 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 60 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 61 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 62 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 63 64 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 65 66 #define KVM_MAX_MCE_BANKS 32 67 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 68 69 /* EFER defaults: 70 * - enable syscall per default because its emulated by KVM 71 * - enable LME and LMA per default on 64 bit KVM 72 */ 73 #ifdef CONFIG_X86_64 74 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 75 #else 76 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 77 #endif 78 79 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 80 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 81 82 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 83 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 84 struct kvm_cpuid_entry2 __user *entries); 85 86 struct kvm_x86_ops *kvm_x86_ops; 87 EXPORT_SYMBOL_GPL(kvm_x86_ops); 88 89 int ignore_msrs = 0; 90 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 91 92 #define KVM_NR_SHARED_MSRS 16 93 94 struct kvm_shared_msrs_global { 95 int nr; 96 struct kvm_shared_msr { 97 u32 msr; 98 u64 value; 99 } msrs[KVM_NR_SHARED_MSRS]; 100 }; 101 102 struct kvm_shared_msrs { 103 struct user_return_notifier urn; 104 bool registered; 105 u64 current_value[KVM_NR_SHARED_MSRS]; 106 }; 107 108 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 109 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); 110 111 struct kvm_stats_debugfs_item debugfs_entries[] = { 112 { "pf_fixed", VCPU_STAT(pf_fixed) }, 113 { "pf_guest", VCPU_STAT(pf_guest) }, 114 { "tlb_flush", VCPU_STAT(tlb_flush) }, 115 { "invlpg", VCPU_STAT(invlpg) }, 116 { "exits", VCPU_STAT(exits) }, 117 { "io_exits", VCPU_STAT(io_exits) }, 118 { "mmio_exits", VCPU_STAT(mmio_exits) }, 119 { "signal_exits", VCPU_STAT(signal_exits) }, 120 { "irq_window", VCPU_STAT(irq_window_exits) }, 121 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 122 { "halt_exits", VCPU_STAT(halt_exits) }, 123 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 124 { "hypercalls", VCPU_STAT(hypercalls) }, 125 { "request_irq", VCPU_STAT(request_irq_exits) }, 126 { "irq_exits", VCPU_STAT(irq_exits) }, 127 { "host_state_reload", VCPU_STAT(host_state_reload) }, 128 { "efer_reload", VCPU_STAT(efer_reload) }, 129 { "fpu_reload", VCPU_STAT(fpu_reload) }, 130 { "insn_emulation", VCPU_STAT(insn_emulation) }, 131 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 132 { "irq_injections", VCPU_STAT(irq_injections) }, 133 { "nmi_injections", VCPU_STAT(nmi_injections) }, 134 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 135 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 136 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 137 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 138 { "mmu_flooded", VM_STAT(mmu_flooded) }, 139 { "mmu_recycled", VM_STAT(mmu_recycled) }, 140 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 141 { "mmu_unsync", VM_STAT(mmu_unsync) }, 142 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 143 { "largepages", VM_STAT(lpages) }, 144 { NULL } 145 }; 146 147 static void kvm_on_user_return(struct user_return_notifier *urn) 148 { 149 unsigned slot; 150 struct kvm_shared_msr *global; 151 struct kvm_shared_msrs *locals 152 = container_of(urn, struct kvm_shared_msrs, urn); 153 154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 155 global = &shared_msrs_global.msrs[slot]; 156 if (global->value != locals->current_value[slot]) { 157 wrmsrl(global->msr, global->value); 158 locals->current_value[slot] = global->value; 159 } 160 } 161 locals->registered = false; 162 user_return_notifier_unregister(urn); 163 } 164 165 void kvm_define_shared_msr(unsigned slot, u32 msr) 166 { 167 int cpu; 168 u64 value; 169 170 if (slot >= shared_msrs_global.nr) 171 shared_msrs_global.nr = slot + 1; 172 shared_msrs_global.msrs[slot].msr = msr; 173 rdmsrl_safe(msr, &value); 174 shared_msrs_global.msrs[slot].value = value; 175 for_each_online_cpu(cpu) 176 per_cpu(shared_msrs, cpu).current_value[slot] = value; 177 } 178 EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 179 180 static void kvm_shared_msr_cpu_online(void) 181 { 182 unsigned i; 183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); 184 185 for (i = 0; i < shared_msrs_global.nr; ++i) 186 locals->current_value[i] = shared_msrs_global.msrs[i].value; 187 } 188 189 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 190 { 191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 192 193 if (((value ^ smsr->current_value[slot]) & mask) == 0) 194 return; 195 smsr->current_value[slot] = value; 196 wrmsrl(shared_msrs_global.msrs[slot].msr, value); 197 if (!smsr->registered) { 198 smsr->urn.on_user_return = kvm_on_user_return; 199 user_return_notifier_register(&smsr->urn); 200 smsr->registered = true; 201 } 202 } 203 EXPORT_SYMBOL_GPL(kvm_set_shared_msr); 204 205 static void drop_user_return_notifiers(void *ignore) 206 { 207 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 208 209 if (smsr->registered) 210 kvm_on_user_return(&smsr->urn); 211 } 212 213 unsigned long segment_base(u16 selector) 214 { 215 struct descriptor_table gdt; 216 struct desc_struct *d; 217 unsigned long table_base; 218 unsigned long v; 219 220 if (selector == 0) 221 return 0; 222 223 kvm_get_gdt(&gdt); 224 table_base = gdt.base; 225 226 if (selector & 4) { /* from ldt */ 227 u16 ldt_selector = kvm_read_ldt(); 228 229 table_base = segment_base(ldt_selector); 230 } 231 d = (struct desc_struct *)(table_base + (selector & ~7)); 232 v = get_desc_base(d); 233 #ifdef CONFIG_X86_64 234 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 235 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 236 #endif 237 return v; 238 } 239 EXPORT_SYMBOL_GPL(segment_base); 240 241 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 242 { 243 if (irqchip_in_kernel(vcpu->kvm)) 244 return vcpu->arch.apic_base; 245 else 246 return vcpu->arch.apic_base; 247 } 248 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 249 250 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 251 { 252 /* TODO: reserve bits check */ 253 if (irqchip_in_kernel(vcpu->kvm)) 254 kvm_lapic_set_base(vcpu, data); 255 else 256 vcpu->arch.apic_base = data; 257 } 258 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 259 260 #define EXCPT_BENIGN 0 261 #define EXCPT_CONTRIBUTORY 1 262 #define EXCPT_PF 2 263 264 static int exception_class(int vector) 265 { 266 switch (vector) { 267 case PF_VECTOR: 268 return EXCPT_PF; 269 case DE_VECTOR: 270 case TS_VECTOR: 271 case NP_VECTOR: 272 case SS_VECTOR: 273 case GP_VECTOR: 274 return EXCPT_CONTRIBUTORY; 275 default: 276 break; 277 } 278 return EXCPT_BENIGN; 279 } 280 281 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 282 unsigned nr, bool has_error, u32 error_code) 283 { 284 u32 prev_nr; 285 int class1, class2; 286 287 if (!vcpu->arch.exception.pending) { 288 queue: 289 vcpu->arch.exception.pending = true; 290 vcpu->arch.exception.has_error_code = has_error; 291 vcpu->arch.exception.nr = nr; 292 vcpu->arch.exception.error_code = error_code; 293 return; 294 } 295 296 /* to check exception */ 297 prev_nr = vcpu->arch.exception.nr; 298 if (prev_nr == DF_VECTOR) { 299 /* triple fault -> shutdown */ 300 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 301 return; 302 } 303 class1 = exception_class(prev_nr); 304 class2 = exception_class(nr); 305 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 306 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 307 /* generate double fault per SDM Table 5-5 */ 308 vcpu->arch.exception.pending = true; 309 vcpu->arch.exception.has_error_code = true; 310 vcpu->arch.exception.nr = DF_VECTOR; 311 vcpu->arch.exception.error_code = 0; 312 } else 313 /* replace previous exception with a new one in a hope 314 that instruction re-execution will regenerate lost 315 exception */ 316 goto queue; 317 } 318 319 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 320 { 321 kvm_multiple_exception(vcpu, nr, false, 0); 322 } 323 EXPORT_SYMBOL_GPL(kvm_queue_exception); 324 325 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 326 u32 error_code) 327 { 328 ++vcpu->stat.pf_guest; 329 vcpu->arch.cr2 = addr; 330 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 331 } 332 333 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 334 { 335 vcpu->arch.nmi_pending = 1; 336 } 337 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 338 339 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 340 { 341 kvm_multiple_exception(vcpu, nr, true, error_code); 342 } 343 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 344 345 /* 346 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 347 * a #GP and return false. 348 */ 349 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 350 { 351 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) 352 return true; 353 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 354 return false; 355 } 356 EXPORT_SYMBOL_GPL(kvm_require_cpl); 357 358 /* 359 * Load the pae pdptrs. Return true is they are all valid. 360 */ 361 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 362 { 363 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 364 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 365 int i; 366 int ret; 367 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 368 369 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 370 offset * sizeof(u64), sizeof(pdpte)); 371 if (ret < 0) { 372 ret = 0; 373 goto out; 374 } 375 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 376 if (is_present_gpte(pdpte[i]) && 377 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 378 ret = 0; 379 goto out; 380 } 381 } 382 ret = 1; 383 384 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 385 __set_bit(VCPU_EXREG_PDPTR, 386 (unsigned long *)&vcpu->arch.regs_avail); 387 __set_bit(VCPU_EXREG_PDPTR, 388 (unsigned long *)&vcpu->arch.regs_dirty); 389 out: 390 391 return ret; 392 } 393 EXPORT_SYMBOL_GPL(load_pdptrs); 394 395 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 396 { 397 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 398 bool changed = true; 399 int r; 400 401 if (is_long_mode(vcpu) || !is_pae(vcpu)) 402 return false; 403 404 if (!test_bit(VCPU_EXREG_PDPTR, 405 (unsigned long *)&vcpu->arch.regs_avail)) 406 return true; 407 408 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 409 if (r < 0) 410 goto out; 411 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 412 out: 413 414 return changed; 415 } 416 417 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 418 { 419 if (cr0 & CR0_RESERVED_BITS) { 420 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 421 cr0, vcpu->arch.cr0); 422 kvm_inject_gp(vcpu, 0); 423 return; 424 } 425 426 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 427 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 428 kvm_inject_gp(vcpu, 0); 429 return; 430 } 431 432 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 433 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 434 "and a clear PE flag\n"); 435 kvm_inject_gp(vcpu, 0); 436 return; 437 } 438 439 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 440 #ifdef CONFIG_X86_64 441 if ((vcpu->arch.shadow_efer & EFER_LME)) { 442 int cs_db, cs_l; 443 444 if (!is_pae(vcpu)) { 445 printk(KERN_DEBUG "set_cr0: #GP, start paging " 446 "in long mode while PAE is disabled\n"); 447 kvm_inject_gp(vcpu, 0); 448 return; 449 } 450 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 451 if (cs_l) { 452 printk(KERN_DEBUG "set_cr0: #GP, start paging " 453 "in long mode while CS.L == 1\n"); 454 kvm_inject_gp(vcpu, 0); 455 return; 456 457 } 458 } else 459 #endif 460 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 461 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 462 "reserved bits\n"); 463 kvm_inject_gp(vcpu, 0); 464 return; 465 } 466 467 } 468 469 kvm_x86_ops->set_cr0(vcpu, cr0); 470 vcpu->arch.cr0 = cr0; 471 472 kvm_mmu_reset_context(vcpu); 473 return; 474 } 475 EXPORT_SYMBOL_GPL(kvm_set_cr0); 476 477 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 478 { 479 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 480 } 481 EXPORT_SYMBOL_GPL(kvm_lmsw); 482 483 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 484 { 485 unsigned long old_cr4 = vcpu->arch.cr4; 486 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 487 488 if (cr4 & CR4_RESERVED_BITS) { 489 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 490 kvm_inject_gp(vcpu, 0); 491 return; 492 } 493 494 if (is_long_mode(vcpu)) { 495 if (!(cr4 & X86_CR4_PAE)) { 496 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 497 "in long mode\n"); 498 kvm_inject_gp(vcpu, 0); 499 return; 500 } 501 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 502 && ((cr4 ^ old_cr4) & pdptr_bits) 503 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 504 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 505 kvm_inject_gp(vcpu, 0); 506 return; 507 } 508 509 if (cr4 & X86_CR4_VMXE) { 510 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 511 kvm_inject_gp(vcpu, 0); 512 return; 513 } 514 kvm_x86_ops->set_cr4(vcpu, cr4); 515 vcpu->arch.cr4 = cr4; 516 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 517 kvm_mmu_reset_context(vcpu); 518 } 519 EXPORT_SYMBOL_GPL(kvm_set_cr4); 520 521 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 522 { 523 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 524 kvm_mmu_sync_roots(vcpu); 525 kvm_mmu_flush_tlb(vcpu); 526 return; 527 } 528 529 if (is_long_mode(vcpu)) { 530 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 531 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 532 kvm_inject_gp(vcpu, 0); 533 return; 534 } 535 } else { 536 if (is_pae(vcpu)) { 537 if (cr3 & CR3_PAE_RESERVED_BITS) { 538 printk(KERN_DEBUG 539 "set_cr3: #GP, reserved bits\n"); 540 kvm_inject_gp(vcpu, 0); 541 return; 542 } 543 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 544 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 545 "reserved bits\n"); 546 kvm_inject_gp(vcpu, 0); 547 return; 548 } 549 } 550 /* 551 * We don't check reserved bits in nonpae mode, because 552 * this isn't enforced, and VMware depends on this. 553 */ 554 } 555 556 /* 557 * Does the new cr3 value map to physical memory? (Note, we 558 * catch an invalid cr3 even in real-mode, because it would 559 * cause trouble later on when we turn on paging anyway.) 560 * 561 * A real CPU would silently accept an invalid cr3 and would 562 * attempt to use it - with largely undefined (and often hard 563 * to debug) behavior on the guest side. 564 */ 565 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 566 kvm_inject_gp(vcpu, 0); 567 else { 568 vcpu->arch.cr3 = cr3; 569 vcpu->arch.mmu.new_cr3(vcpu); 570 } 571 } 572 EXPORT_SYMBOL_GPL(kvm_set_cr3); 573 574 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 575 { 576 if (cr8 & CR8_RESERVED_BITS) { 577 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 578 kvm_inject_gp(vcpu, 0); 579 return; 580 } 581 if (irqchip_in_kernel(vcpu->kvm)) 582 kvm_lapic_set_tpr(vcpu, cr8); 583 else 584 vcpu->arch.cr8 = cr8; 585 } 586 EXPORT_SYMBOL_GPL(kvm_set_cr8); 587 588 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 589 { 590 if (irqchip_in_kernel(vcpu->kvm)) 591 return kvm_lapic_get_cr8(vcpu); 592 else 593 return vcpu->arch.cr8; 594 } 595 EXPORT_SYMBOL_GPL(kvm_get_cr8); 596 597 static inline u32 bit(int bitno) 598 { 599 return 1 << (bitno & 31); 600 } 601 602 /* 603 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 604 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 605 * 606 * This list is modified at module load time to reflect the 607 * capabilities of the host cpu. This capabilities test skips MSRs that are 608 * kvm-specific. Those are put in the beginning of the list. 609 */ 610 611 #define KVM_SAVE_MSRS_BEGIN 2 612 static u32 msrs_to_save[] = { 613 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 614 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 615 MSR_K6_STAR, 616 #ifdef CONFIG_X86_64 617 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 618 #endif 619 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 620 }; 621 622 static unsigned num_msrs_to_save; 623 624 static u32 emulated_msrs[] = { 625 MSR_IA32_MISC_ENABLE, 626 }; 627 628 static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 629 { 630 if (efer & efer_reserved_bits) { 631 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 632 efer); 633 kvm_inject_gp(vcpu, 0); 634 return; 635 } 636 637 if (is_paging(vcpu) 638 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 639 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 640 kvm_inject_gp(vcpu, 0); 641 return; 642 } 643 644 if (efer & EFER_FFXSR) { 645 struct kvm_cpuid_entry2 *feat; 646 647 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 648 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 649 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); 650 kvm_inject_gp(vcpu, 0); 651 return; 652 } 653 } 654 655 if (efer & EFER_SVME) { 656 struct kvm_cpuid_entry2 *feat; 657 658 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 659 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 660 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); 661 kvm_inject_gp(vcpu, 0); 662 return; 663 } 664 } 665 666 kvm_x86_ops->set_efer(vcpu, efer); 667 668 efer &= ~EFER_LMA; 669 efer |= vcpu->arch.shadow_efer & EFER_LMA; 670 671 vcpu->arch.shadow_efer = efer; 672 673 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 674 kvm_mmu_reset_context(vcpu); 675 } 676 677 void kvm_enable_efer_bits(u64 mask) 678 { 679 efer_reserved_bits &= ~mask; 680 } 681 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 682 683 684 /* 685 * Writes msr value into into the appropriate "register". 686 * Returns 0 on success, non-0 otherwise. 687 * Assumes vcpu_load() was already called. 688 */ 689 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 690 { 691 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 692 } 693 694 /* 695 * Adapt set_msr() to msr_io()'s calling convention 696 */ 697 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 698 { 699 return kvm_set_msr(vcpu, index, *data); 700 } 701 702 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 703 { 704 static int version; 705 struct pvclock_wall_clock wc; 706 struct timespec boot; 707 708 if (!wall_clock) 709 return; 710 711 version++; 712 713 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 714 715 /* 716 * The guest calculates current wall clock time by adding 717 * system time (updated by kvm_write_guest_time below) to the 718 * wall clock specified here. guest system time equals host 719 * system time for us, thus we must fill in host boot time here. 720 */ 721 getboottime(&boot); 722 723 wc.sec = boot.tv_sec; 724 wc.nsec = boot.tv_nsec; 725 wc.version = version; 726 727 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 728 729 version++; 730 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 731 } 732 733 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 734 { 735 uint32_t quotient, remainder; 736 737 /* Don't try to replace with do_div(), this one calculates 738 * "(dividend << 32) / divisor" */ 739 __asm__ ( "divl %4" 740 : "=a" (quotient), "=d" (remainder) 741 : "0" (0), "1" (dividend), "r" (divisor) ); 742 return quotient; 743 } 744 745 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 746 { 747 uint64_t nsecs = 1000000000LL; 748 int32_t shift = 0; 749 uint64_t tps64; 750 uint32_t tps32; 751 752 tps64 = tsc_khz * 1000LL; 753 while (tps64 > nsecs*2) { 754 tps64 >>= 1; 755 shift--; 756 } 757 758 tps32 = (uint32_t)tps64; 759 while (tps32 <= (uint32_t)nsecs) { 760 tps32 <<= 1; 761 shift++; 762 } 763 764 hv_clock->tsc_shift = shift; 765 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 766 767 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 768 __func__, tsc_khz, hv_clock->tsc_shift, 769 hv_clock->tsc_to_system_mul); 770 } 771 772 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 773 774 static void kvm_write_guest_time(struct kvm_vcpu *v) 775 { 776 struct timespec ts; 777 unsigned long flags; 778 struct kvm_vcpu_arch *vcpu = &v->arch; 779 void *shared_kaddr; 780 unsigned long this_tsc_khz; 781 782 if ((!vcpu->time_page)) 783 return; 784 785 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 786 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 787 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 788 vcpu->hv_clock_tsc_khz = this_tsc_khz; 789 } 790 put_cpu_var(cpu_tsc_khz); 791 792 /* Keep irq disabled to prevent changes to the clock */ 793 local_irq_save(flags); 794 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 795 ktime_get_ts(&ts); 796 monotonic_to_bootbased(&ts); 797 local_irq_restore(flags); 798 799 /* With all the info we got, fill in the values */ 800 801 vcpu->hv_clock.system_time = ts.tv_nsec + 802 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 803 804 /* 805 * The interface expects us to write an even number signaling that the 806 * update is finished. Since the guest won't see the intermediate 807 * state, we just increase by 2 at the end. 808 */ 809 vcpu->hv_clock.version += 2; 810 811 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 812 813 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 814 sizeof(vcpu->hv_clock)); 815 816 kunmap_atomic(shared_kaddr, KM_USER0); 817 818 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 819 } 820 821 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 822 { 823 struct kvm_vcpu_arch *vcpu = &v->arch; 824 825 if (!vcpu->time_page) 826 return 0; 827 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 828 return 1; 829 } 830 831 static bool msr_mtrr_valid(unsigned msr) 832 { 833 switch (msr) { 834 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: 835 case MSR_MTRRfix64K_00000: 836 case MSR_MTRRfix16K_80000: 837 case MSR_MTRRfix16K_A0000: 838 case MSR_MTRRfix4K_C0000: 839 case MSR_MTRRfix4K_C8000: 840 case MSR_MTRRfix4K_D0000: 841 case MSR_MTRRfix4K_D8000: 842 case MSR_MTRRfix4K_E0000: 843 case MSR_MTRRfix4K_E8000: 844 case MSR_MTRRfix4K_F0000: 845 case MSR_MTRRfix4K_F8000: 846 case MSR_MTRRdefType: 847 case MSR_IA32_CR_PAT: 848 return true; 849 case 0x2f8: 850 return true; 851 } 852 return false; 853 } 854 855 static bool valid_pat_type(unsigned t) 856 { 857 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 858 } 859 860 static bool valid_mtrr_type(unsigned t) 861 { 862 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 863 } 864 865 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 866 { 867 int i; 868 869 if (!msr_mtrr_valid(msr)) 870 return false; 871 872 if (msr == MSR_IA32_CR_PAT) { 873 for (i = 0; i < 8; i++) 874 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 875 return false; 876 return true; 877 } else if (msr == MSR_MTRRdefType) { 878 if (data & ~0xcff) 879 return false; 880 return valid_mtrr_type(data & 0xff); 881 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 882 for (i = 0; i < 8 ; i++) 883 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 884 return false; 885 return true; 886 } 887 888 /* variable MTRRs */ 889 return valid_mtrr_type(data & 0xff); 890 } 891 892 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 893 { 894 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 895 896 if (!mtrr_valid(vcpu, msr, data)) 897 return 1; 898 899 if (msr == MSR_MTRRdefType) { 900 vcpu->arch.mtrr_state.def_type = data; 901 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 902 } else if (msr == MSR_MTRRfix64K_00000) 903 p[0] = data; 904 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 905 p[1 + msr - MSR_MTRRfix16K_80000] = data; 906 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 907 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 908 else if (msr == MSR_IA32_CR_PAT) 909 vcpu->arch.pat = data; 910 else { /* Variable MTRRs */ 911 int idx, is_mtrr_mask; 912 u64 *pt; 913 914 idx = (msr - 0x200) / 2; 915 is_mtrr_mask = msr - 0x200 - 2 * idx; 916 if (!is_mtrr_mask) 917 pt = 918 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 919 else 920 pt = 921 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 922 *pt = data; 923 } 924 925 kvm_mmu_reset_context(vcpu); 926 return 0; 927 } 928 929 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 930 { 931 u64 mcg_cap = vcpu->arch.mcg_cap; 932 unsigned bank_num = mcg_cap & 0xff; 933 934 switch (msr) { 935 case MSR_IA32_MCG_STATUS: 936 vcpu->arch.mcg_status = data; 937 break; 938 case MSR_IA32_MCG_CTL: 939 if (!(mcg_cap & MCG_CTL_P)) 940 return 1; 941 if (data != 0 && data != ~(u64)0) 942 return -1; 943 vcpu->arch.mcg_ctl = data; 944 break; 945 default: 946 if (msr >= MSR_IA32_MC0_CTL && 947 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 948 u32 offset = msr - MSR_IA32_MC0_CTL; 949 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 950 if ((offset & 0x3) == 0 && 951 data != 0 && data != ~(u64)0) 952 return -1; 953 vcpu->arch.mce_banks[offset] = data; 954 break; 955 } 956 return 1; 957 } 958 return 0; 959 } 960 961 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 962 { 963 struct kvm *kvm = vcpu->kvm; 964 int lm = is_long_mode(vcpu); 965 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 966 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 967 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 968 : kvm->arch.xen_hvm_config.blob_size_32; 969 u32 page_num = data & ~PAGE_MASK; 970 u64 page_addr = data & PAGE_MASK; 971 u8 *page; 972 int r; 973 974 r = -E2BIG; 975 if (page_num >= blob_size) 976 goto out; 977 r = -ENOMEM; 978 page = kzalloc(PAGE_SIZE, GFP_KERNEL); 979 if (!page) 980 goto out; 981 r = -EFAULT; 982 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) 983 goto out_free; 984 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 985 goto out_free; 986 r = 0; 987 out_free: 988 kfree(page); 989 out: 990 return r; 991 } 992 993 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 994 { 995 switch (msr) { 996 case MSR_EFER: 997 set_efer(vcpu, data); 998 break; 999 case MSR_K7_HWCR: 1000 data &= ~(u64)0x40; /* ignore flush filter disable */ 1001 if (data != 0) { 1002 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1003 data); 1004 return 1; 1005 } 1006 break; 1007 case MSR_FAM10H_MMIO_CONF_BASE: 1008 if (data != 0) { 1009 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1010 "0x%llx\n", data); 1011 return 1; 1012 } 1013 break; 1014 case MSR_AMD64_NB_CFG: 1015 break; 1016 case MSR_IA32_DEBUGCTLMSR: 1017 if (!data) { 1018 /* We support the non-activated case already */ 1019 break; 1020 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 1021 /* Values other than LBR and BTF are vendor-specific, 1022 thus reserved and should throw a #GP */ 1023 return 1; 1024 } 1025 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1026 __func__, data); 1027 break; 1028 case MSR_IA32_UCODE_REV: 1029 case MSR_IA32_UCODE_WRITE: 1030 case MSR_VM_HSAVE_PA: 1031 case MSR_AMD64_PATCH_LOADER: 1032 break; 1033 case 0x200 ... 0x2ff: 1034 return set_msr_mtrr(vcpu, msr, data); 1035 case MSR_IA32_APICBASE: 1036 kvm_set_apic_base(vcpu, data); 1037 break; 1038 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1039 return kvm_x2apic_msr_write(vcpu, msr, data); 1040 case MSR_IA32_MISC_ENABLE: 1041 vcpu->arch.ia32_misc_enable_msr = data; 1042 break; 1043 case MSR_KVM_WALL_CLOCK: 1044 vcpu->kvm->arch.wall_clock = data; 1045 kvm_write_wall_clock(vcpu->kvm, data); 1046 break; 1047 case MSR_KVM_SYSTEM_TIME: { 1048 if (vcpu->arch.time_page) { 1049 kvm_release_page_dirty(vcpu->arch.time_page); 1050 vcpu->arch.time_page = NULL; 1051 } 1052 1053 vcpu->arch.time = data; 1054 1055 /* we verify if the enable bit is set... */ 1056 if (!(data & 1)) 1057 break; 1058 1059 /* ...but clean it before doing the actual write */ 1060 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 1061 1062 vcpu->arch.time_page = 1063 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1064 1065 if (is_error_page(vcpu->arch.time_page)) { 1066 kvm_release_page_clean(vcpu->arch.time_page); 1067 vcpu->arch.time_page = NULL; 1068 } 1069 1070 kvm_request_guest_time_update(vcpu); 1071 break; 1072 } 1073 case MSR_IA32_MCG_CTL: 1074 case MSR_IA32_MCG_STATUS: 1075 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1076 return set_msr_mce(vcpu, msr, data); 1077 1078 /* Performance counters are not protected by a CPUID bit, 1079 * so we should check all of them in the generic path for the sake of 1080 * cross vendor migration. 1081 * Writing a zero into the event select MSRs disables them, 1082 * which we perfectly emulate ;-). Any other value should be at least 1083 * reported, some guests depend on them. 1084 */ 1085 case MSR_P6_EVNTSEL0: 1086 case MSR_P6_EVNTSEL1: 1087 case MSR_K7_EVNTSEL0: 1088 case MSR_K7_EVNTSEL1: 1089 case MSR_K7_EVNTSEL2: 1090 case MSR_K7_EVNTSEL3: 1091 if (data != 0) 1092 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1093 "0x%x data 0x%llx\n", msr, data); 1094 break; 1095 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1096 * so we ignore writes to make it happy. 1097 */ 1098 case MSR_P6_PERFCTR0: 1099 case MSR_P6_PERFCTR1: 1100 case MSR_K7_PERFCTR0: 1101 case MSR_K7_PERFCTR1: 1102 case MSR_K7_PERFCTR2: 1103 case MSR_K7_PERFCTR3: 1104 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1105 "0x%x data 0x%llx\n", msr, data); 1106 break; 1107 default: 1108 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1109 return xen_hvm_config(vcpu, data); 1110 if (!ignore_msrs) { 1111 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1112 msr, data); 1113 return 1; 1114 } else { 1115 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1116 msr, data); 1117 break; 1118 } 1119 } 1120 return 0; 1121 } 1122 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 1123 1124 1125 /* 1126 * Reads an msr value (of 'msr_index') into 'pdata'. 1127 * Returns 0 on success, non-0 otherwise. 1128 * Assumes vcpu_load() was already called. 1129 */ 1130 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1131 { 1132 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 1133 } 1134 1135 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1136 { 1137 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1138 1139 if (!msr_mtrr_valid(msr)) 1140 return 1; 1141 1142 if (msr == MSR_MTRRdefType) 1143 *pdata = vcpu->arch.mtrr_state.def_type + 1144 (vcpu->arch.mtrr_state.enabled << 10); 1145 else if (msr == MSR_MTRRfix64K_00000) 1146 *pdata = p[0]; 1147 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1148 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 1149 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1150 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 1151 else if (msr == MSR_IA32_CR_PAT) 1152 *pdata = vcpu->arch.pat; 1153 else { /* Variable MTRRs */ 1154 int idx, is_mtrr_mask; 1155 u64 *pt; 1156 1157 idx = (msr - 0x200) / 2; 1158 is_mtrr_mask = msr - 0x200 - 2 * idx; 1159 if (!is_mtrr_mask) 1160 pt = 1161 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1162 else 1163 pt = 1164 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1165 *pdata = *pt; 1166 } 1167 1168 return 0; 1169 } 1170 1171 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1172 { 1173 u64 data; 1174 u64 mcg_cap = vcpu->arch.mcg_cap; 1175 unsigned bank_num = mcg_cap & 0xff; 1176 1177 switch (msr) { 1178 case MSR_IA32_P5_MC_ADDR: 1179 case MSR_IA32_P5_MC_TYPE: 1180 data = 0; 1181 break; 1182 case MSR_IA32_MCG_CAP: 1183 data = vcpu->arch.mcg_cap; 1184 break; 1185 case MSR_IA32_MCG_CTL: 1186 if (!(mcg_cap & MCG_CTL_P)) 1187 return 1; 1188 data = vcpu->arch.mcg_ctl; 1189 break; 1190 case MSR_IA32_MCG_STATUS: 1191 data = vcpu->arch.mcg_status; 1192 break; 1193 default: 1194 if (msr >= MSR_IA32_MC0_CTL && 1195 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1196 u32 offset = msr - MSR_IA32_MC0_CTL; 1197 data = vcpu->arch.mce_banks[offset]; 1198 break; 1199 } 1200 return 1; 1201 } 1202 *pdata = data; 1203 return 0; 1204 } 1205 1206 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1207 { 1208 u64 data; 1209 1210 switch (msr) { 1211 case MSR_IA32_PLATFORM_ID: 1212 case MSR_IA32_UCODE_REV: 1213 case MSR_IA32_EBL_CR_POWERON: 1214 case MSR_IA32_DEBUGCTLMSR: 1215 case MSR_IA32_LASTBRANCHFROMIP: 1216 case MSR_IA32_LASTBRANCHTOIP: 1217 case MSR_IA32_LASTINTFROMIP: 1218 case MSR_IA32_LASTINTTOIP: 1219 case MSR_K8_SYSCFG: 1220 case MSR_K7_HWCR: 1221 case MSR_VM_HSAVE_PA: 1222 case MSR_P6_PERFCTR0: 1223 case MSR_P6_PERFCTR1: 1224 case MSR_P6_EVNTSEL0: 1225 case MSR_P6_EVNTSEL1: 1226 case MSR_K7_EVNTSEL0: 1227 case MSR_K7_PERFCTR0: 1228 case MSR_K8_INT_PENDING_MSG: 1229 case MSR_AMD64_NB_CFG: 1230 case MSR_FAM10H_MMIO_CONF_BASE: 1231 data = 0; 1232 break; 1233 case MSR_MTRRcap: 1234 data = 0x500 | KVM_NR_VAR_MTRR; 1235 break; 1236 case 0x200 ... 0x2ff: 1237 return get_msr_mtrr(vcpu, msr, pdata); 1238 case 0xcd: /* fsb frequency */ 1239 data = 3; 1240 break; 1241 case MSR_IA32_APICBASE: 1242 data = kvm_get_apic_base(vcpu); 1243 break; 1244 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1245 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1246 break; 1247 case MSR_IA32_MISC_ENABLE: 1248 data = vcpu->arch.ia32_misc_enable_msr; 1249 break; 1250 case MSR_IA32_PERF_STATUS: 1251 /* TSC increment by tick */ 1252 data = 1000ULL; 1253 /* CPU multiplier */ 1254 data |= (((uint64_t)4ULL) << 40); 1255 break; 1256 case MSR_EFER: 1257 data = vcpu->arch.shadow_efer; 1258 break; 1259 case MSR_KVM_WALL_CLOCK: 1260 data = vcpu->kvm->arch.wall_clock; 1261 break; 1262 case MSR_KVM_SYSTEM_TIME: 1263 data = vcpu->arch.time; 1264 break; 1265 case MSR_IA32_P5_MC_ADDR: 1266 case MSR_IA32_P5_MC_TYPE: 1267 case MSR_IA32_MCG_CAP: 1268 case MSR_IA32_MCG_CTL: 1269 case MSR_IA32_MCG_STATUS: 1270 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1271 return get_msr_mce(vcpu, msr, pdata); 1272 default: 1273 if (!ignore_msrs) { 1274 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1275 return 1; 1276 } else { 1277 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 1278 data = 0; 1279 } 1280 break; 1281 } 1282 *pdata = data; 1283 return 0; 1284 } 1285 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1286 1287 /* 1288 * Read or write a bunch of msrs. All parameters are kernel addresses. 1289 * 1290 * @return number of msrs set successfully. 1291 */ 1292 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 1293 struct kvm_msr_entry *entries, 1294 int (*do_msr)(struct kvm_vcpu *vcpu, 1295 unsigned index, u64 *data)) 1296 { 1297 int i; 1298 1299 vcpu_load(vcpu); 1300 1301 down_read(&vcpu->kvm->slots_lock); 1302 for (i = 0; i < msrs->nmsrs; ++i) 1303 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1304 break; 1305 up_read(&vcpu->kvm->slots_lock); 1306 1307 vcpu_put(vcpu); 1308 1309 return i; 1310 } 1311 1312 /* 1313 * Read or write a bunch of msrs. Parameters are user addresses. 1314 * 1315 * @return number of msrs set successfully. 1316 */ 1317 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 1318 int (*do_msr)(struct kvm_vcpu *vcpu, 1319 unsigned index, u64 *data), 1320 int writeback) 1321 { 1322 struct kvm_msrs msrs; 1323 struct kvm_msr_entry *entries; 1324 int r, n; 1325 unsigned size; 1326 1327 r = -EFAULT; 1328 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 1329 goto out; 1330 1331 r = -E2BIG; 1332 if (msrs.nmsrs >= MAX_IO_MSRS) 1333 goto out; 1334 1335 r = -ENOMEM; 1336 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1337 entries = vmalloc(size); 1338 if (!entries) 1339 goto out; 1340 1341 r = -EFAULT; 1342 if (copy_from_user(entries, user_msrs->entries, size)) 1343 goto out_free; 1344 1345 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1346 if (r < 0) 1347 goto out_free; 1348 1349 r = -EFAULT; 1350 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1351 goto out_free; 1352 1353 r = n; 1354 1355 out_free: 1356 vfree(entries); 1357 out: 1358 return r; 1359 } 1360 1361 int kvm_dev_ioctl_check_extension(long ext) 1362 { 1363 int r; 1364 1365 switch (ext) { 1366 case KVM_CAP_IRQCHIP: 1367 case KVM_CAP_HLT: 1368 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1369 case KVM_CAP_SET_TSS_ADDR: 1370 case KVM_CAP_EXT_CPUID: 1371 case KVM_CAP_CLOCKSOURCE: 1372 case KVM_CAP_PIT: 1373 case KVM_CAP_NOP_IO_DELAY: 1374 case KVM_CAP_MP_STATE: 1375 case KVM_CAP_SYNC_MMU: 1376 case KVM_CAP_REINJECT_CONTROL: 1377 case KVM_CAP_IRQ_INJECT_STATUS: 1378 case KVM_CAP_ASSIGN_DEV_IRQ: 1379 case KVM_CAP_IRQFD: 1380 case KVM_CAP_IOEVENTFD: 1381 case KVM_CAP_PIT2: 1382 case KVM_CAP_PIT_STATE2: 1383 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1384 case KVM_CAP_XEN_HVM: 1385 case KVM_CAP_ADJUST_CLOCK: 1386 case KVM_CAP_VCPU_EVENTS: 1387 r = 1; 1388 break; 1389 case KVM_CAP_COALESCED_MMIO: 1390 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1391 break; 1392 case KVM_CAP_VAPIC: 1393 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1394 break; 1395 case KVM_CAP_NR_VCPUS: 1396 r = KVM_MAX_VCPUS; 1397 break; 1398 case KVM_CAP_NR_MEMSLOTS: 1399 r = KVM_MEMORY_SLOTS; 1400 break; 1401 case KVM_CAP_PV_MMU: /* obsolete */ 1402 r = 0; 1403 break; 1404 case KVM_CAP_IOMMU: 1405 r = iommu_found(); 1406 break; 1407 case KVM_CAP_MCE: 1408 r = KVM_MAX_MCE_BANKS; 1409 break; 1410 default: 1411 r = 0; 1412 break; 1413 } 1414 return r; 1415 1416 } 1417 1418 long kvm_arch_dev_ioctl(struct file *filp, 1419 unsigned int ioctl, unsigned long arg) 1420 { 1421 void __user *argp = (void __user *)arg; 1422 long r; 1423 1424 switch (ioctl) { 1425 case KVM_GET_MSR_INDEX_LIST: { 1426 struct kvm_msr_list __user *user_msr_list = argp; 1427 struct kvm_msr_list msr_list; 1428 unsigned n; 1429 1430 r = -EFAULT; 1431 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1432 goto out; 1433 n = msr_list.nmsrs; 1434 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1435 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1436 goto out; 1437 r = -E2BIG; 1438 if (n < msr_list.nmsrs) 1439 goto out; 1440 r = -EFAULT; 1441 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1442 num_msrs_to_save * sizeof(u32))) 1443 goto out; 1444 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 1445 &emulated_msrs, 1446 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1447 goto out; 1448 r = 0; 1449 break; 1450 } 1451 case KVM_GET_SUPPORTED_CPUID: { 1452 struct kvm_cpuid2 __user *cpuid_arg = argp; 1453 struct kvm_cpuid2 cpuid; 1454 1455 r = -EFAULT; 1456 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1457 goto out; 1458 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1459 cpuid_arg->entries); 1460 if (r) 1461 goto out; 1462 1463 r = -EFAULT; 1464 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1465 goto out; 1466 r = 0; 1467 break; 1468 } 1469 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 1470 u64 mce_cap; 1471 1472 mce_cap = KVM_MCE_CAP_SUPPORTED; 1473 r = -EFAULT; 1474 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 1475 goto out; 1476 r = 0; 1477 break; 1478 } 1479 default: 1480 r = -EINVAL; 1481 } 1482 out: 1483 return r; 1484 } 1485 1486 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1487 { 1488 kvm_x86_ops->vcpu_load(vcpu, cpu); 1489 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1490 unsigned long khz = cpufreq_quick_get(cpu); 1491 if (!khz) 1492 khz = tsc_khz; 1493 per_cpu(cpu_tsc_khz, cpu) = khz; 1494 } 1495 kvm_request_guest_time_update(vcpu); 1496 } 1497 1498 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1499 { 1500 kvm_x86_ops->vcpu_put(vcpu); 1501 kvm_put_guest_fpu(vcpu); 1502 } 1503 1504 static int is_efer_nx(void) 1505 { 1506 unsigned long long efer = 0; 1507 1508 rdmsrl_safe(MSR_EFER, &efer); 1509 return efer & EFER_NX; 1510 } 1511 1512 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1513 { 1514 int i; 1515 struct kvm_cpuid_entry2 *e, *entry; 1516 1517 entry = NULL; 1518 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1519 e = &vcpu->arch.cpuid_entries[i]; 1520 if (e->function == 0x80000001) { 1521 entry = e; 1522 break; 1523 } 1524 } 1525 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1526 entry->edx &= ~(1 << 20); 1527 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1528 } 1529 } 1530 1531 /* when an old userspace process fills a new kernel module */ 1532 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1533 struct kvm_cpuid *cpuid, 1534 struct kvm_cpuid_entry __user *entries) 1535 { 1536 int r, i; 1537 struct kvm_cpuid_entry *cpuid_entries; 1538 1539 r = -E2BIG; 1540 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1541 goto out; 1542 r = -ENOMEM; 1543 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1544 if (!cpuid_entries) 1545 goto out; 1546 r = -EFAULT; 1547 if (copy_from_user(cpuid_entries, entries, 1548 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1549 goto out_free; 1550 for (i = 0; i < cpuid->nent; i++) { 1551 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1552 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1553 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1554 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1555 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1556 vcpu->arch.cpuid_entries[i].index = 0; 1557 vcpu->arch.cpuid_entries[i].flags = 0; 1558 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1559 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1560 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1561 } 1562 vcpu->arch.cpuid_nent = cpuid->nent; 1563 cpuid_fix_nx_cap(vcpu); 1564 r = 0; 1565 kvm_apic_set_version(vcpu); 1566 1567 out_free: 1568 vfree(cpuid_entries); 1569 out: 1570 return r; 1571 } 1572 1573 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1574 struct kvm_cpuid2 *cpuid, 1575 struct kvm_cpuid_entry2 __user *entries) 1576 { 1577 int r; 1578 1579 r = -E2BIG; 1580 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1581 goto out; 1582 r = -EFAULT; 1583 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1584 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1585 goto out; 1586 vcpu->arch.cpuid_nent = cpuid->nent; 1587 kvm_apic_set_version(vcpu); 1588 return 0; 1589 1590 out: 1591 return r; 1592 } 1593 1594 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1595 struct kvm_cpuid2 *cpuid, 1596 struct kvm_cpuid_entry2 __user *entries) 1597 { 1598 int r; 1599 1600 r = -E2BIG; 1601 if (cpuid->nent < vcpu->arch.cpuid_nent) 1602 goto out; 1603 r = -EFAULT; 1604 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1605 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1606 goto out; 1607 return 0; 1608 1609 out: 1610 cpuid->nent = vcpu->arch.cpuid_nent; 1611 return r; 1612 } 1613 1614 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1615 u32 index) 1616 { 1617 entry->function = function; 1618 entry->index = index; 1619 cpuid_count(entry->function, entry->index, 1620 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1621 entry->flags = 0; 1622 } 1623 1624 #define F(x) bit(X86_FEATURE_##x) 1625 1626 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1627 u32 index, int *nent, int maxnent) 1628 { 1629 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1630 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; 1631 #ifdef CONFIG_X86_64 1632 unsigned f_lm = F(LM); 1633 #else 1634 unsigned f_lm = 0; 1635 #endif 1636 1637 /* cpuid 1.edx */ 1638 const u32 kvm_supported_word0_x86_features = 1639 F(FPU) | F(VME) | F(DE) | F(PSE) | 1640 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1641 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 1642 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1643 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 1644 0 /* Reserved, DS, ACPI */ | F(MMX) | 1645 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 1646 0 /* HTT, TM, Reserved, PBE */; 1647 /* cpuid 0x80000001.edx */ 1648 const u32 kvm_supported_word1_x86_features = 1649 F(FPU) | F(VME) | F(DE) | F(PSE) | 1650 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1651 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 1652 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1653 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1654 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1655 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1656 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1657 /* cpuid 1.ecx */ 1658 const u32 kvm_supported_word4_x86_features = 1659 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1660 0 /* DS-CPL, VMX, SMX, EST */ | 1661 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1662 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1663 0 /* Reserved, DCA */ | F(XMM4_1) | 1664 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1665 0 /* Reserved, XSAVE, OSXSAVE */; 1666 /* cpuid 0x80000001.ecx */ 1667 const u32 kvm_supported_word6_x86_features = 1668 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1669 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1670 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 1671 0 /* SKINIT */ | 0 /* WDT */; 1672 1673 /* all calls to cpuid_count() should be made on the same cpu */ 1674 get_cpu(); 1675 do_cpuid_1_ent(entry, function, index); 1676 ++*nent; 1677 1678 switch (function) { 1679 case 0: 1680 entry->eax = min(entry->eax, (u32)0xb); 1681 break; 1682 case 1: 1683 entry->edx &= kvm_supported_word0_x86_features; 1684 entry->ecx &= kvm_supported_word4_x86_features; 1685 /* we support x2apic emulation even if host does not support 1686 * it since we emulate x2apic in software */ 1687 entry->ecx |= F(X2APIC); 1688 break; 1689 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1690 * may return different values. This forces us to get_cpu() before 1691 * issuing the first command, and also to emulate this annoying behavior 1692 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1693 case 2: { 1694 int t, times = entry->eax & 0xff; 1695 1696 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1697 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1698 for (t = 1; t < times && *nent < maxnent; ++t) { 1699 do_cpuid_1_ent(&entry[t], function, 0); 1700 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1701 ++*nent; 1702 } 1703 break; 1704 } 1705 /* function 4 and 0xb have additional index. */ 1706 case 4: { 1707 int i, cache_type; 1708 1709 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1710 /* read more entries until cache_type is zero */ 1711 for (i = 1; *nent < maxnent; ++i) { 1712 cache_type = entry[i - 1].eax & 0x1f; 1713 if (!cache_type) 1714 break; 1715 do_cpuid_1_ent(&entry[i], function, i); 1716 entry[i].flags |= 1717 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1718 ++*nent; 1719 } 1720 break; 1721 } 1722 case 0xb: { 1723 int i, level_type; 1724 1725 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1726 /* read more entries until level_type is zero */ 1727 for (i = 1; *nent < maxnent; ++i) { 1728 level_type = entry[i - 1].ecx & 0xff00; 1729 if (!level_type) 1730 break; 1731 do_cpuid_1_ent(&entry[i], function, i); 1732 entry[i].flags |= 1733 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1734 ++*nent; 1735 } 1736 break; 1737 } 1738 case 0x80000000: 1739 entry->eax = min(entry->eax, 0x8000001a); 1740 break; 1741 case 0x80000001: 1742 entry->edx &= kvm_supported_word1_x86_features; 1743 entry->ecx &= kvm_supported_word6_x86_features; 1744 break; 1745 } 1746 put_cpu(); 1747 } 1748 1749 #undef F 1750 1751 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1752 struct kvm_cpuid_entry2 __user *entries) 1753 { 1754 struct kvm_cpuid_entry2 *cpuid_entries; 1755 int limit, nent = 0, r = -E2BIG; 1756 u32 func; 1757 1758 if (cpuid->nent < 1) 1759 goto out; 1760 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1761 cpuid->nent = KVM_MAX_CPUID_ENTRIES; 1762 r = -ENOMEM; 1763 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1764 if (!cpuid_entries) 1765 goto out; 1766 1767 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1768 limit = cpuid_entries[0].eax; 1769 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1770 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1771 &nent, cpuid->nent); 1772 r = -E2BIG; 1773 if (nent >= cpuid->nent) 1774 goto out_free; 1775 1776 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1777 limit = cpuid_entries[nent - 1].eax; 1778 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1779 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1780 &nent, cpuid->nent); 1781 r = -E2BIG; 1782 if (nent >= cpuid->nent) 1783 goto out_free; 1784 1785 r = -EFAULT; 1786 if (copy_to_user(entries, cpuid_entries, 1787 nent * sizeof(struct kvm_cpuid_entry2))) 1788 goto out_free; 1789 cpuid->nent = nent; 1790 r = 0; 1791 1792 out_free: 1793 vfree(cpuid_entries); 1794 out: 1795 return r; 1796 } 1797 1798 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1799 struct kvm_lapic_state *s) 1800 { 1801 vcpu_load(vcpu); 1802 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1803 vcpu_put(vcpu); 1804 1805 return 0; 1806 } 1807 1808 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1809 struct kvm_lapic_state *s) 1810 { 1811 vcpu_load(vcpu); 1812 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1813 kvm_apic_post_state_restore(vcpu); 1814 update_cr8_intercept(vcpu); 1815 vcpu_put(vcpu); 1816 1817 return 0; 1818 } 1819 1820 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1821 struct kvm_interrupt *irq) 1822 { 1823 if (irq->irq < 0 || irq->irq >= 256) 1824 return -EINVAL; 1825 if (irqchip_in_kernel(vcpu->kvm)) 1826 return -ENXIO; 1827 vcpu_load(vcpu); 1828 1829 kvm_queue_interrupt(vcpu, irq->irq, false); 1830 1831 vcpu_put(vcpu); 1832 1833 return 0; 1834 } 1835 1836 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1837 { 1838 vcpu_load(vcpu); 1839 kvm_inject_nmi(vcpu); 1840 vcpu_put(vcpu); 1841 1842 return 0; 1843 } 1844 1845 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1846 struct kvm_tpr_access_ctl *tac) 1847 { 1848 if (tac->flags) 1849 return -EINVAL; 1850 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1851 return 0; 1852 } 1853 1854 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 1855 u64 mcg_cap) 1856 { 1857 int r; 1858 unsigned bank_num = mcg_cap & 0xff, bank; 1859 1860 r = -EINVAL; 1861 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 1862 goto out; 1863 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 1864 goto out; 1865 r = 0; 1866 vcpu->arch.mcg_cap = mcg_cap; 1867 /* Init IA32_MCG_CTL to all 1s */ 1868 if (mcg_cap & MCG_CTL_P) 1869 vcpu->arch.mcg_ctl = ~(u64)0; 1870 /* Init IA32_MCi_CTL to all 1s */ 1871 for (bank = 0; bank < bank_num; bank++) 1872 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 1873 out: 1874 return r; 1875 } 1876 1877 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 1878 struct kvm_x86_mce *mce) 1879 { 1880 u64 mcg_cap = vcpu->arch.mcg_cap; 1881 unsigned bank_num = mcg_cap & 0xff; 1882 u64 *banks = vcpu->arch.mce_banks; 1883 1884 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 1885 return -EINVAL; 1886 /* 1887 * if IA32_MCG_CTL is not all 1s, the uncorrected error 1888 * reporting is disabled 1889 */ 1890 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 1891 vcpu->arch.mcg_ctl != ~(u64)0) 1892 return 0; 1893 banks += 4 * mce->bank; 1894 /* 1895 * if IA32_MCi_CTL is not all 1s, the uncorrected error 1896 * reporting is disabled for the bank 1897 */ 1898 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 1899 return 0; 1900 if (mce->status & MCI_STATUS_UC) { 1901 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 1902 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 1903 printk(KERN_DEBUG "kvm: set_mce: " 1904 "injects mce exception while " 1905 "previous one is in progress!\n"); 1906 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 1907 return 0; 1908 } 1909 if (banks[1] & MCI_STATUS_VAL) 1910 mce->status |= MCI_STATUS_OVER; 1911 banks[2] = mce->addr; 1912 banks[3] = mce->misc; 1913 vcpu->arch.mcg_status = mce->mcg_status; 1914 banks[1] = mce->status; 1915 kvm_queue_exception(vcpu, MC_VECTOR); 1916 } else if (!(banks[1] & MCI_STATUS_VAL) 1917 || !(banks[1] & MCI_STATUS_UC)) { 1918 if (banks[1] & MCI_STATUS_VAL) 1919 mce->status |= MCI_STATUS_OVER; 1920 banks[2] = mce->addr; 1921 banks[3] = mce->misc; 1922 banks[1] = mce->status; 1923 } else 1924 banks[1] |= MCI_STATUS_OVER; 1925 return 0; 1926 } 1927 1928 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 1929 struct kvm_vcpu_events *events) 1930 { 1931 vcpu_load(vcpu); 1932 1933 events->exception.injected = vcpu->arch.exception.pending; 1934 events->exception.nr = vcpu->arch.exception.nr; 1935 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 1936 events->exception.error_code = vcpu->arch.exception.error_code; 1937 1938 events->interrupt.injected = vcpu->arch.interrupt.pending; 1939 events->interrupt.nr = vcpu->arch.interrupt.nr; 1940 events->interrupt.soft = vcpu->arch.interrupt.soft; 1941 1942 events->nmi.injected = vcpu->arch.nmi_injected; 1943 events->nmi.pending = vcpu->arch.nmi_pending; 1944 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 1945 1946 events->sipi_vector = vcpu->arch.sipi_vector; 1947 1948 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 1949 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 1950 1951 vcpu_put(vcpu); 1952 } 1953 1954 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 1955 struct kvm_vcpu_events *events) 1956 { 1957 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 1958 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 1959 return -EINVAL; 1960 1961 vcpu_load(vcpu); 1962 1963 vcpu->arch.exception.pending = events->exception.injected; 1964 vcpu->arch.exception.nr = events->exception.nr; 1965 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 1966 vcpu->arch.exception.error_code = events->exception.error_code; 1967 1968 vcpu->arch.interrupt.pending = events->interrupt.injected; 1969 vcpu->arch.interrupt.nr = events->interrupt.nr; 1970 vcpu->arch.interrupt.soft = events->interrupt.soft; 1971 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 1972 kvm_pic_clear_isr_ack(vcpu->kvm); 1973 1974 vcpu->arch.nmi_injected = events->nmi.injected; 1975 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 1976 vcpu->arch.nmi_pending = events->nmi.pending; 1977 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 1978 1979 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 1980 vcpu->arch.sipi_vector = events->sipi_vector; 1981 1982 vcpu_put(vcpu); 1983 1984 return 0; 1985 } 1986 1987 long kvm_arch_vcpu_ioctl(struct file *filp, 1988 unsigned int ioctl, unsigned long arg) 1989 { 1990 struct kvm_vcpu *vcpu = filp->private_data; 1991 void __user *argp = (void __user *)arg; 1992 int r; 1993 struct kvm_lapic_state *lapic = NULL; 1994 1995 switch (ioctl) { 1996 case KVM_GET_LAPIC: { 1997 r = -EINVAL; 1998 if (!vcpu->arch.apic) 1999 goto out; 2000 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2001 2002 r = -ENOMEM; 2003 if (!lapic) 2004 goto out; 2005 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 2006 if (r) 2007 goto out; 2008 r = -EFAULT; 2009 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 2010 goto out; 2011 r = 0; 2012 break; 2013 } 2014 case KVM_SET_LAPIC: { 2015 r = -EINVAL; 2016 if (!vcpu->arch.apic) 2017 goto out; 2018 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2019 r = -ENOMEM; 2020 if (!lapic) 2021 goto out; 2022 r = -EFAULT; 2023 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 2024 goto out; 2025 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 2026 if (r) 2027 goto out; 2028 r = 0; 2029 break; 2030 } 2031 case KVM_INTERRUPT: { 2032 struct kvm_interrupt irq; 2033 2034 r = -EFAULT; 2035 if (copy_from_user(&irq, argp, sizeof irq)) 2036 goto out; 2037 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 2038 if (r) 2039 goto out; 2040 r = 0; 2041 break; 2042 } 2043 case KVM_NMI: { 2044 r = kvm_vcpu_ioctl_nmi(vcpu); 2045 if (r) 2046 goto out; 2047 r = 0; 2048 break; 2049 } 2050 case KVM_SET_CPUID: { 2051 struct kvm_cpuid __user *cpuid_arg = argp; 2052 struct kvm_cpuid cpuid; 2053 2054 r = -EFAULT; 2055 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2056 goto out; 2057 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 2058 if (r) 2059 goto out; 2060 break; 2061 } 2062 case KVM_SET_CPUID2: { 2063 struct kvm_cpuid2 __user *cpuid_arg = argp; 2064 struct kvm_cpuid2 cpuid; 2065 2066 r = -EFAULT; 2067 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2068 goto out; 2069 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 2070 cpuid_arg->entries); 2071 if (r) 2072 goto out; 2073 break; 2074 } 2075 case KVM_GET_CPUID2: { 2076 struct kvm_cpuid2 __user *cpuid_arg = argp; 2077 struct kvm_cpuid2 cpuid; 2078 2079 r = -EFAULT; 2080 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2081 goto out; 2082 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 2083 cpuid_arg->entries); 2084 if (r) 2085 goto out; 2086 r = -EFAULT; 2087 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2088 goto out; 2089 r = 0; 2090 break; 2091 } 2092 case KVM_GET_MSRS: 2093 r = msr_io(vcpu, argp, kvm_get_msr, 1); 2094 break; 2095 case KVM_SET_MSRS: 2096 r = msr_io(vcpu, argp, do_set_msr, 0); 2097 break; 2098 case KVM_TPR_ACCESS_REPORTING: { 2099 struct kvm_tpr_access_ctl tac; 2100 2101 r = -EFAULT; 2102 if (copy_from_user(&tac, argp, sizeof tac)) 2103 goto out; 2104 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 2105 if (r) 2106 goto out; 2107 r = -EFAULT; 2108 if (copy_to_user(argp, &tac, sizeof tac)) 2109 goto out; 2110 r = 0; 2111 break; 2112 }; 2113 case KVM_SET_VAPIC_ADDR: { 2114 struct kvm_vapic_addr va; 2115 2116 r = -EINVAL; 2117 if (!irqchip_in_kernel(vcpu->kvm)) 2118 goto out; 2119 r = -EFAULT; 2120 if (copy_from_user(&va, argp, sizeof va)) 2121 goto out; 2122 r = 0; 2123 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 2124 break; 2125 } 2126 case KVM_X86_SETUP_MCE: { 2127 u64 mcg_cap; 2128 2129 r = -EFAULT; 2130 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 2131 goto out; 2132 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 2133 break; 2134 } 2135 case KVM_X86_SET_MCE: { 2136 struct kvm_x86_mce mce; 2137 2138 r = -EFAULT; 2139 if (copy_from_user(&mce, argp, sizeof mce)) 2140 goto out; 2141 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2142 break; 2143 } 2144 case KVM_GET_VCPU_EVENTS: { 2145 struct kvm_vcpu_events events; 2146 2147 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 2148 2149 r = -EFAULT; 2150 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 2151 break; 2152 r = 0; 2153 break; 2154 } 2155 case KVM_SET_VCPU_EVENTS: { 2156 struct kvm_vcpu_events events; 2157 2158 r = -EFAULT; 2159 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 2160 break; 2161 2162 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2163 break; 2164 } 2165 default: 2166 r = -EINVAL; 2167 } 2168 out: 2169 kfree(lapic); 2170 return r; 2171 } 2172 2173 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 2174 { 2175 int ret; 2176 2177 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 2178 return -1; 2179 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 2180 return ret; 2181 } 2182 2183 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 2184 u64 ident_addr) 2185 { 2186 kvm->arch.ept_identity_map_addr = ident_addr; 2187 return 0; 2188 } 2189 2190 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 2191 u32 kvm_nr_mmu_pages) 2192 { 2193 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2194 return -EINVAL; 2195 2196 down_write(&kvm->slots_lock); 2197 spin_lock(&kvm->mmu_lock); 2198 2199 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2200 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2201 2202 spin_unlock(&kvm->mmu_lock); 2203 up_write(&kvm->slots_lock); 2204 return 0; 2205 } 2206 2207 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 2208 { 2209 return kvm->arch.n_alloc_mmu_pages; 2210 } 2211 2212 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2213 { 2214 int i; 2215 struct kvm_mem_alias *alias; 2216 2217 for (i = 0; i < kvm->arch.naliases; ++i) { 2218 alias = &kvm->arch.aliases[i]; 2219 if (gfn >= alias->base_gfn 2220 && gfn < alias->base_gfn + alias->npages) 2221 return alias->target_gfn + gfn - alias->base_gfn; 2222 } 2223 return gfn; 2224 } 2225 2226 /* 2227 * Set a new alias region. Aliases map a portion of physical memory into 2228 * another portion. This is useful for memory windows, for example the PC 2229 * VGA region. 2230 */ 2231 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 2232 struct kvm_memory_alias *alias) 2233 { 2234 int r, n; 2235 struct kvm_mem_alias *p; 2236 2237 r = -EINVAL; 2238 /* General sanity checks */ 2239 if (alias->memory_size & (PAGE_SIZE - 1)) 2240 goto out; 2241 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 2242 goto out; 2243 if (alias->slot >= KVM_ALIAS_SLOTS) 2244 goto out; 2245 if (alias->guest_phys_addr + alias->memory_size 2246 < alias->guest_phys_addr) 2247 goto out; 2248 if (alias->target_phys_addr + alias->memory_size 2249 < alias->target_phys_addr) 2250 goto out; 2251 2252 down_write(&kvm->slots_lock); 2253 spin_lock(&kvm->mmu_lock); 2254 2255 p = &kvm->arch.aliases[alias->slot]; 2256 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2257 p->npages = alias->memory_size >> PAGE_SHIFT; 2258 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2259 2260 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2261 if (kvm->arch.aliases[n - 1].npages) 2262 break; 2263 kvm->arch.naliases = n; 2264 2265 spin_unlock(&kvm->mmu_lock); 2266 kvm_mmu_zap_all(kvm); 2267 2268 up_write(&kvm->slots_lock); 2269 2270 return 0; 2271 2272 out: 2273 return r; 2274 } 2275 2276 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2277 { 2278 int r; 2279 2280 r = 0; 2281 switch (chip->chip_id) { 2282 case KVM_IRQCHIP_PIC_MASTER: 2283 memcpy(&chip->chip.pic, 2284 &pic_irqchip(kvm)->pics[0], 2285 sizeof(struct kvm_pic_state)); 2286 break; 2287 case KVM_IRQCHIP_PIC_SLAVE: 2288 memcpy(&chip->chip.pic, 2289 &pic_irqchip(kvm)->pics[1], 2290 sizeof(struct kvm_pic_state)); 2291 break; 2292 case KVM_IRQCHIP_IOAPIC: 2293 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 2294 break; 2295 default: 2296 r = -EINVAL; 2297 break; 2298 } 2299 return r; 2300 } 2301 2302 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2303 { 2304 int r; 2305 2306 r = 0; 2307 switch (chip->chip_id) { 2308 case KVM_IRQCHIP_PIC_MASTER: 2309 spin_lock(&pic_irqchip(kvm)->lock); 2310 memcpy(&pic_irqchip(kvm)->pics[0], 2311 &chip->chip.pic, 2312 sizeof(struct kvm_pic_state)); 2313 spin_unlock(&pic_irqchip(kvm)->lock); 2314 break; 2315 case KVM_IRQCHIP_PIC_SLAVE: 2316 spin_lock(&pic_irqchip(kvm)->lock); 2317 memcpy(&pic_irqchip(kvm)->pics[1], 2318 &chip->chip.pic, 2319 sizeof(struct kvm_pic_state)); 2320 spin_unlock(&pic_irqchip(kvm)->lock); 2321 break; 2322 case KVM_IRQCHIP_IOAPIC: 2323 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2324 break; 2325 default: 2326 r = -EINVAL; 2327 break; 2328 } 2329 kvm_pic_update_irq(pic_irqchip(kvm)); 2330 return r; 2331 } 2332 2333 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2334 { 2335 int r = 0; 2336 2337 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2338 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 2339 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2340 return r; 2341 } 2342 2343 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2344 { 2345 int r = 0; 2346 2347 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2348 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2349 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 2350 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2351 return r; 2352 } 2353 2354 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2355 { 2356 int r = 0; 2357 2358 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2359 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 2360 sizeof(ps->channels)); 2361 ps->flags = kvm->arch.vpit->pit_state.flags; 2362 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2363 return r; 2364 } 2365 2366 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2367 { 2368 int r = 0, start = 0; 2369 u32 prev_legacy, cur_legacy; 2370 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2371 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 2372 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 2373 if (!prev_legacy && cur_legacy) 2374 start = 1; 2375 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 2376 sizeof(kvm->arch.vpit->pit_state.channels)); 2377 kvm->arch.vpit->pit_state.flags = ps->flags; 2378 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 2379 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2380 return r; 2381 } 2382 2383 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 2384 struct kvm_reinject_control *control) 2385 { 2386 if (!kvm->arch.vpit) 2387 return -ENXIO; 2388 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2389 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2390 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2391 return 0; 2392 } 2393 2394 /* 2395 * Get (and clear) the dirty memory log for a memory slot. 2396 */ 2397 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2398 struct kvm_dirty_log *log) 2399 { 2400 int r; 2401 int n; 2402 struct kvm_memory_slot *memslot; 2403 int is_dirty = 0; 2404 2405 down_write(&kvm->slots_lock); 2406 2407 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2408 if (r) 2409 goto out; 2410 2411 /* If nothing is dirty, don't bother messing with page tables. */ 2412 if (is_dirty) { 2413 spin_lock(&kvm->mmu_lock); 2414 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2415 spin_unlock(&kvm->mmu_lock); 2416 memslot = &kvm->memslots[log->slot]; 2417 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2418 memset(memslot->dirty_bitmap, 0, n); 2419 } 2420 r = 0; 2421 out: 2422 up_write(&kvm->slots_lock); 2423 return r; 2424 } 2425 2426 long kvm_arch_vm_ioctl(struct file *filp, 2427 unsigned int ioctl, unsigned long arg) 2428 { 2429 struct kvm *kvm = filp->private_data; 2430 void __user *argp = (void __user *)arg; 2431 int r = -ENOTTY; 2432 /* 2433 * This union makes it completely explicit to gcc-3.x 2434 * that these two variables' stack usage should be 2435 * combined, not added together. 2436 */ 2437 union { 2438 struct kvm_pit_state ps; 2439 struct kvm_pit_state2 ps2; 2440 struct kvm_memory_alias alias; 2441 struct kvm_pit_config pit_config; 2442 } u; 2443 2444 switch (ioctl) { 2445 case KVM_SET_TSS_ADDR: 2446 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 2447 if (r < 0) 2448 goto out; 2449 break; 2450 case KVM_SET_IDENTITY_MAP_ADDR: { 2451 u64 ident_addr; 2452 2453 r = -EFAULT; 2454 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 2455 goto out; 2456 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 2457 if (r < 0) 2458 goto out; 2459 break; 2460 } 2461 case KVM_SET_MEMORY_REGION: { 2462 struct kvm_memory_region kvm_mem; 2463 struct kvm_userspace_memory_region kvm_userspace_mem; 2464 2465 r = -EFAULT; 2466 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2467 goto out; 2468 kvm_userspace_mem.slot = kvm_mem.slot; 2469 kvm_userspace_mem.flags = kvm_mem.flags; 2470 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 2471 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 2472 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 2473 if (r) 2474 goto out; 2475 break; 2476 } 2477 case KVM_SET_NR_MMU_PAGES: 2478 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2479 if (r) 2480 goto out; 2481 break; 2482 case KVM_GET_NR_MMU_PAGES: 2483 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2484 break; 2485 case KVM_SET_MEMORY_ALIAS: 2486 r = -EFAULT; 2487 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 2488 goto out; 2489 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 2490 if (r) 2491 goto out; 2492 break; 2493 case KVM_CREATE_IRQCHIP: { 2494 struct kvm_pic *vpic; 2495 2496 mutex_lock(&kvm->lock); 2497 r = -EEXIST; 2498 if (kvm->arch.vpic) 2499 goto create_irqchip_unlock; 2500 r = -ENOMEM; 2501 vpic = kvm_create_pic(kvm); 2502 if (vpic) { 2503 r = kvm_ioapic_init(kvm); 2504 if (r) { 2505 kfree(vpic); 2506 goto create_irqchip_unlock; 2507 } 2508 } else 2509 goto create_irqchip_unlock; 2510 smp_wmb(); 2511 kvm->arch.vpic = vpic; 2512 smp_wmb(); 2513 r = kvm_setup_default_irq_routing(kvm); 2514 if (r) { 2515 mutex_lock(&kvm->irq_lock); 2516 kfree(kvm->arch.vpic); 2517 kfree(kvm->arch.vioapic); 2518 kvm->arch.vpic = NULL; 2519 kvm->arch.vioapic = NULL; 2520 mutex_unlock(&kvm->irq_lock); 2521 } 2522 create_irqchip_unlock: 2523 mutex_unlock(&kvm->lock); 2524 break; 2525 } 2526 case KVM_CREATE_PIT: 2527 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2528 goto create_pit; 2529 case KVM_CREATE_PIT2: 2530 r = -EFAULT; 2531 if (copy_from_user(&u.pit_config, argp, 2532 sizeof(struct kvm_pit_config))) 2533 goto out; 2534 create_pit: 2535 down_write(&kvm->slots_lock); 2536 r = -EEXIST; 2537 if (kvm->arch.vpit) 2538 goto create_pit_unlock; 2539 r = -ENOMEM; 2540 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 2541 if (kvm->arch.vpit) 2542 r = 0; 2543 create_pit_unlock: 2544 up_write(&kvm->slots_lock); 2545 break; 2546 case KVM_IRQ_LINE_STATUS: 2547 case KVM_IRQ_LINE: { 2548 struct kvm_irq_level irq_event; 2549 2550 r = -EFAULT; 2551 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2552 goto out; 2553 if (irqchip_in_kernel(kvm)) { 2554 __s32 status; 2555 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2556 irq_event.irq, irq_event.level); 2557 if (ioctl == KVM_IRQ_LINE_STATUS) { 2558 irq_event.status = status; 2559 if (copy_to_user(argp, &irq_event, 2560 sizeof irq_event)) 2561 goto out; 2562 } 2563 r = 0; 2564 } 2565 break; 2566 } 2567 case KVM_GET_IRQCHIP: { 2568 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2569 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2570 2571 r = -ENOMEM; 2572 if (!chip) 2573 goto out; 2574 r = -EFAULT; 2575 if (copy_from_user(chip, argp, sizeof *chip)) 2576 goto get_irqchip_out; 2577 r = -ENXIO; 2578 if (!irqchip_in_kernel(kvm)) 2579 goto get_irqchip_out; 2580 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 2581 if (r) 2582 goto get_irqchip_out; 2583 r = -EFAULT; 2584 if (copy_to_user(argp, chip, sizeof *chip)) 2585 goto get_irqchip_out; 2586 r = 0; 2587 get_irqchip_out: 2588 kfree(chip); 2589 if (r) 2590 goto out; 2591 break; 2592 } 2593 case KVM_SET_IRQCHIP: { 2594 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 2595 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 2596 2597 r = -ENOMEM; 2598 if (!chip) 2599 goto out; 2600 r = -EFAULT; 2601 if (copy_from_user(chip, argp, sizeof *chip)) 2602 goto set_irqchip_out; 2603 r = -ENXIO; 2604 if (!irqchip_in_kernel(kvm)) 2605 goto set_irqchip_out; 2606 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 2607 if (r) 2608 goto set_irqchip_out; 2609 r = 0; 2610 set_irqchip_out: 2611 kfree(chip); 2612 if (r) 2613 goto out; 2614 break; 2615 } 2616 case KVM_GET_PIT: { 2617 r = -EFAULT; 2618 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 2619 goto out; 2620 r = -ENXIO; 2621 if (!kvm->arch.vpit) 2622 goto out; 2623 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 2624 if (r) 2625 goto out; 2626 r = -EFAULT; 2627 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 2628 goto out; 2629 r = 0; 2630 break; 2631 } 2632 case KVM_SET_PIT: { 2633 r = -EFAULT; 2634 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 2635 goto out; 2636 r = -ENXIO; 2637 if (!kvm->arch.vpit) 2638 goto out; 2639 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2640 if (r) 2641 goto out; 2642 r = 0; 2643 break; 2644 } 2645 case KVM_GET_PIT2: { 2646 r = -ENXIO; 2647 if (!kvm->arch.vpit) 2648 goto out; 2649 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 2650 if (r) 2651 goto out; 2652 r = -EFAULT; 2653 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 2654 goto out; 2655 r = 0; 2656 break; 2657 } 2658 case KVM_SET_PIT2: { 2659 r = -EFAULT; 2660 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 2661 goto out; 2662 r = -ENXIO; 2663 if (!kvm->arch.vpit) 2664 goto out; 2665 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 2666 if (r) 2667 goto out; 2668 r = 0; 2669 break; 2670 } 2671 case KVM_REINJECT_CONTROL: { 2672 struct kvm_reinject_control control; 2673 r = -EFAULT; 2674 if (copy_from_user(&control, argp, sizeof(control))) 2675 goto out; 2676 r = kvm_vm_ioctl_reinject(kvm, &control); 2677 if (r) 2678 goto out; 2679 r = 0; 2680 break; 2681 } 2682 case KVM_XEN_HVM_CONFIG: { 2683 r = -EFAULT; 2684 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 2685 sizeof(struct kvm_xen_hvm_config))) 2686 goto out; 2687 r = -EINVAL; 2688 if (kvm->arch.xen_hvm_config.flags) 2689 goto out; 2690 r = 0; 2691 break; 2692 } 2693 case KVM_SET_CLOCK: { 2694 struct timespec now; 2695 struct kvm_clock_data user_ns; 2696 u64 now_ns; 2697 s64 delta; 2698 2699 r = -EFAULT; 2700 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 2701 goto out; 2702 2703 r = -EINVAL; 2704 if (user_ns.flags) 2705 goto out; 2706 2707 r = 0; 2708 ktime_get_ts(&now); 2709 now_ns = timespec_to_ns(&now); 2710 delta = user_ns.clock - now_ns; 2711 kvm->arch.kvmclock_offset = delta; 2712 break; 2713 } 2714 case KVM_GET_CLOCK: { 2715 struct timespec now; 2716 struct kvm_clock_data user_ns; 2717 u64 now_ns; 2718 2719 ktime_get_ts(&now); 2720 now_ns = timespec_to_ns(&now); 2721 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 2722 user_ns.flags = 0; 2723 2724 r = -EFAULT; 2725 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 2726 goto out; 2727 r = 0; 2728 break; 2729 } 2730 2731 default: 2732 ; 2733 } 2734 out: 2735 return r; 2736 } 2737 2738 static void kvm_init_msr_list(void) 2739 { 2740 u32 dummy[2]; 2741 unsigned i, j; 2742 2743 /* skip the first msrs in the list. KVM-specific */ 2744 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 2745 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2746 continue; 2747 if (j < i) 2748 msrs_to_save[j] = msrs_to_save[i]; 2749 j++; 2750 } 2751 num_msrs_to_save = j; 2752 } 2753 2754 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 2755 const void *v) 2756 { 2757 if (vcpu->arch.apic && 2758 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 2759 return 0; 2760 2761 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 2762 } 2763 2764 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 2765 { 2766 if (vcpu->arch.apic && 2767 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 2768 return 0; 2769 2770 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 2771 } 2772 2773 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2774 struct kvm_vcpu *vcpu) 2775 { 2776 void *data = val; 2777 int r = X86EMUL_CONTINUE; 2778 2779 while (bytes) { 2780 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2781 unsigned offset = addr & (PAGE_SIZE-1); 2782 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2783 int ret; 2784 2785 if (gpa == UNMAPPED_GVA) { 2786 r = X86EMUL_PROPAGATE_FAULT; 2787 goto out; 2788 } 2789 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2790 if (ret < 0) { 2791 r = X86EMUL_UNHANDLEABLE; 2792 goto out; 2793 } 2794 2795 bytes -= toread; 2796 data += toread; 2797 addr += toread; 2798 } 2799 out: 2800 return r; 2801 } 2802 2803 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2804 struct kvm_vcpu *vcpu) 2805 { 2806 void *data = val; 2807 int r = X86EMUL_CONTINUE; 2808 2809 while (bytes) { 2810 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2811 unsigned offset = addr & (PAGE_SIZE-1); 2812 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2813 int ret; 2814 2815 if (gpa == UNMAPPED_GVA) { 2816 r = X86EMUL_PROPAGATE_FAULT; 2817 goto out; 2818 } 2819 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2820 if (ret < 0) { 2821 r = X86EMUL_UNHANDLEABLE; 2822 goto out; 2823 } 2824 2825 bytes -= towrite; 2826 data += towrite; 2827 addr += towrite; 2828 } 2829 out: 2830 return r; 2831 } 2832 2833 2834 static int emulator_read_emulated(unsigned long addr, 2835 void *val, 2836 unsigned int bytes, 2837 struct kvm_vcpu *vcpu) 2838 { 2839 gpa_t gpa; 2840 2841 if (vcpu->mmio_read_completed) { 2842 memcpy(val, vcpu->mmio_data, bytes); 2843 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 2844 vcpu->mmio_phys_addr, *(u64 *)val); 2845 vcpu->mmio_read_completed = 0; 2846 return X86EMUL_CONTINUE; 2847 } 2848 2849 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2850 2851 /* For APIC access vmexit */ 2852 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2853 goto mmio; 2854 2855 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2856 == X86EMUL_CONTINUE) 2857 return X86EMUL_CONTINUE; 2858 if (gpa == UNMAPPED_GVA) 2859 return X86EMUL_PROPAGATE_FAULT; 2860 2861 mmio: 2862 /* 2863 * Is this MMIO handled locally? 2864 */ 2865 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 2866 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 2867 return X86EMUL_CONTINUE; 2868 } 2869 2870 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 2871 2872 vcpu->mmio_needed = 1; 2873 vcpu->mmio_phys_addr = gpa; 2874 vcpu->mmio_size = bytes; 2875 vcpu->mmio_is_write = 0; 2876 2877 return X86EMUL_UNHANDLEABLE; 2878 } 2879 2880 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2881 const void *val, int bytes) 2882 { 2883 int ret; 2884 2885 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2886 if (ret < 0) 2887 return 0; 2888 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2889 return 1; 2890 } 2891 2892 static int emulator_write_emulated_onepage(unsigned long addr, 2893 const void *val, 2894 unsigned int bytes, 2895 struct kvm_vcpu *vcpu) 2896 { 2897 gpa_t gpa; 2898 2899 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2900 2901 if (gpa == UNMAPPED_GVA) { 2902 kvm_inject_page_fault(vcpu, addr, 2); 2903 return X86EMUL_PROPAGATE_FAULT; 2904 } 2905 2906 /* For APIC access vmexit */ 2907 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2908 goto mmio; 2909 2910 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2911 return X86EMUL_CONTINUE; 2912 2913 mmio: 2914 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 2915 /* 2916 * Is this MMIO handled locally? 2917 */ 2918 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 2919 return X86EMUL_CONTINUE; 2920 2921 vcpu->mmio_needed = 1; 2922 vcpu->mmio_phys_addr = gpa; 2923 vcpu->mmio_size = bytes; 2924 vcpu->mmio_is_write = 1; 2925 memcpy(vcpu->mmio_data, val, bytes); 2926 2927 return X86EMUL_CONTINUE; 2928 } 2929 2930 int emulator_write_emulated(unsigned long addr, 2931 const void *val, 2932 unsigned int bytes, 2933 struct kvm_vcpu *vcpu) 2934 { 2935 /* Crossing a page boundary? */ 2936 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2937 int rc, now; 2938 2939 now = -addr & ~PAGE_MASK; 2940 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2941 if (rc != X86EMUL_CONTINUE) 2942 return rc; 2943 addr += now; 2944 val += now; 2945 bytes -= now; 2946 } 2947 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2948 } 2949 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2950 2951 static int emulator_cmpxchg_emulated(unsigned long addr, 2952 const void *old, 2953 const void *new, 2954 unsigned int bytes, 2955 struct kvm_vcpu *vcpu) 2956 { 2957 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 2958 #ifndef CONFIG_X86_64 2959 /* guests cmpxchg8b have to be emulated atomically */ 2960 if (bytes == 8) { 2961 gpa_t gpa; 2962 struct page *page; 2963 char *kaddr; 2964 u64 val; 2965 2966 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2967 2968 if (gpa == UNMAPPED_GVA || 2969 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2970 goto emul_write; 2971 2972 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2973 goto emul_write; 2974 2975 val = *(u64 *)new; 2976 2977 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2978 2979 kaddr = kmap_atomic(page, KM_USER0); 2980 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2981 kunmap_atomic(kaddr, KM_USER0); 2982 kvm_release_page_dirty(page); 2983 } 2984 emul_write: 2985 #endif 2986 2987 return emulator_write_emulated(addr, new, bytes, vcpu); 2988 } 2989 2990 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2991 { 2992 return kvm_x86_ops->get_segment_base(vcpu, seg); 2993 } 2994 2995 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2996 { 2997 kvm_mmu_invlpg(vcpu, address); 2998 return X86EMUL_CONTINUE; 2999 } 3000 3001 int emulate_clts(struct kvm_vcpu *vcpu) 3002 { 3003 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 3004 return X86EMUL_CONTINUE; 3005 } 3006 3007 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3008 { 3009 struct kvm_vcpu *vcpu = ctxt->vcpu; 3010 3011 switch (dr) { 3012 case 0 ... 3: 3013 *dest = kvm_x86_ops->get_dr(vcpu, dr); 3014 return X86EMUL_CONTINUE; 3015 default: 3016 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 3017 return X86EMUL_UNHANDLEABLE; 3018 } 3019 } 3020 3021 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3022 { 3023 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3024 int exception; 3025 3026 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 3027 if (exception) { 3028 /* FIXME: better handling */ 3029 return X86EMUL_UNHANDLEABLE; 3030 } 3031 return X86EMUL_CONTINUE; 3032 } 3033 3034 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3035 { 3036 u8 opcodes[4]; 3037 unsigned long rip = kvm_rip_read(vcpu); 3038 unsigned long rip_linear; 3039 3040 if (!printk_ratelimit()) 3041 return; 3042 3043 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3044 3045 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 3046 3047 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3048 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3049 } 3050 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3051 3052 static struct x86_emulate_ops emulate_ops = { 3053 .read_std = kvm_read_guest_virt, 3054 .read_emulated = emulator_read_emulated, 3055 .write_emulated = emulator_write_emulated, 3056 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3057 }; 3058 3059 static void cache_all_regs(struct kvm_vcpu *vcpu) 3060 { 3061 kvm_register_read(vcpu, VCPU_REGS_RAX); 3062 kvm_register_read(vcpu, VCPU_REGS_RSP); 3063 kvm_register_read(vcpu, VCPU_REGS_RIP); 3064 vcpu->arch.regs_dirty = ~0; 3065 } 3066 3067 int emulate_instruction(struct kvm_vcpu *vcpu, 3068 unsigned long cr2, 3069 u16 error_code, 3070 int emulation_type) 3071 { 3072 int r, shadow_mask; 3073 struct decode_cache *c; 3074 struct kvm_run *run = vcpu->run; 3075 3076 kvm_clear_exception_queue(vcpu); 3077 vcpu->arch.mmio_fault_cr2 = cr2; 3078 /* 3079 * TODO: fix emulate.c to use guest_read/write_register 3080 * instead of direct ->regs accesses, can save hundred cycles 3081 * on Intel for instructions that don't read/change RSP, for 3082 * for example. 3083 */ 3084 cache_all_regs(vcpu); 3085 3086 vcpu->mmio_is_write = 0; 3087 vcpu->arch.pio.string = 0; 3088 3089 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3090 int cs_db, cs_l; 3091 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3092 3093 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3094 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3095 vcpu->arch.emulate_ctxt.mode = 3096 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3097 ? X86EMUL_MODE_REAL : cs_l 3098 ? X86EMUL_MODE_PROT64 : cs_db 3099 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3100 3101 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3102 3103 /* Only allow emulation of specific instructions on #UD 3104 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 3105 c = &vcpu->arch.emulate_ctxt.decode; 3106 if (emulation_type & EMULTYPE_TRAP_UD) { 3107 if (!c->twobyte) 3108 return EMULATE_FAIL; 3109 switch (c->b) { 3110 case 0x01: /* VMMCALL */ 3111 if (c->modrm_mod != 3 || c->modrm_rm != 1) 3112 return EMULATE_FAIL; 3113 break; 3114 case 0x34: /* sysenter */ 3115 case 0x35: /* sysexit */ 3116 if (c->modrm_mod != 0 || c->modrm_rm != 0) 3117 return EMULATE_FAIL; 3118 break; 3119 case 0x05: /* syscall */ 3120 if (c->modrm_mod != 0 || c->modrm_rm != 0) 3121 return EMULATE_FAIL; 3122 break; 3123 default: 3124 return EMULATE_FAIL; 3125 } 3126 3127 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 3128 return EMULATE_FAIL; 3129 } 3130 3131 ++vcpu->stat.insn_emulation; 3132 if (r) { 3133 ++vcpu->stat.insn_emulation_fail; 3134 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3135 return EMULATE_DONE; 3136 return EMULATE_FAIL; 3137 } 3138 } 3139 3140 if (emulation_type & EMULTYPE_SKIP) { 3141 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 3142 return EMULATE_DONE; 3143 } 3144 3145 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3146 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3147 3148 if (r == 0) 3149 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3150 3151 if (vcpu->arch.pio.string) 3152 return EMULATE_DO_MMIO; 3153 3154 if ((r || vcpu->mmio_is_write) && run) { 3155 run->exit_reason = KVM_EXIT_MMIO; 3156 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3157 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3158 run->mmio.len = vcpu->mmio_size; 3159 run->mmio.is_write = vcpu->mmio_is_write; 3160 } 3161 3162 if (r) { 3163 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3164 return EMULATE_DONE; 3165 if (!vcpu->mmio_needed) { 3166 kvm_report_emulation_failure(vcpu, "mmio"); 3167 return EMULATE_FAIL; 3168 } 3169 return EMULATE_DO_MMIO; 3170 } 3171 3172 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3173 3174 if (vcpu->mmio_is_write) { 3175 vcpu->mmio_needed = 0; 3176 return EMULATE_DO_MMIO; 3177 } 3178 3179 return EMULATE_DONE; 3180 } 3181 EXPORT_SYMBOL_GPL(emulate_instruction); 3182 3183 static int pio_copy_data(struct kvm_vcpu *vcpu) 3184 { 3185 void *p = vcpu->arch.pio_data; 3186 gva_t q = vcpu->arch.pio.guest_gva; 3187 unsigned bytes; 3188 int ret; 3189 3190 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3191 if (vcpu->arch.pio.in) 3192 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3193 else 3194 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3195 return ret; 3196 } 3197 3198 int complete_pio(struct kvm_vcpu *vcpu) 3199 { 3200 struct kvm_pio_request *io = &vcpu->arch.pio; 3201 long delta; 3202 int r; 3203 unsigned long val; 3204 3205 if (!io->string) { 3206 if (io->in) { 3207 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3208 memcpy(&val, vcpu->arch.pio_data, io->size); 3209 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 3210 } 3211 } else { 3212 if (io->in) { 3213 r = pio_copy_data(vcpu); 3214 if (r) 3215 return r; 3216 } 3217 3218 delta = 1; 3219 if (io->rep) { 3220 delta *= io->cur_count; 3221 /* 3222 * The size of the register should really depend on 3223 * current address size. 3224 */ 3225 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 3226 val -= delta; 3227 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 3228 } 3229 if (io->down) 3230 delta = -delta; 3231 delta *= io->size; 3232 if (io->in) { 3233 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 3234 val += delta; 3235 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 3236 } else { 3237 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 3238 val += delta; 3239 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3240 } 3241 } 3242 3243 io->count -= io->cur_count; 3244 io->cur_count = 0; 3245 3246 return 0; 3247 } 3248 3249 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3250 { 3251 /* TODO: String I/O for in kernel device */ 3252 int r; 3253 3254 if (vcpu->arch.pio.in) 3255 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3256 vcpu->arch.pio.size, pd); 3257 else 3258 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3259 vcpu->arch.pio.size, pd); 3260 return r; 3261 } 3262 3263 static int pio_string_write(struct kvm_vcpu *vcpu) 3264 { 3265 struct kvm_pio_request *io = &vcpu->arch.pio; 3266 void *pd = vcpu->arch.pio_data; 3267 int i, r = 0; 3268 3269 for (i = 0; i < io->cur_count; i++) { 3270 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3271 io->port, io->size, pd)) { 3272 r = -EOPNOTSUPP; 3273 break; 3274 } 3275 pd += io->size; 3276 } 3277 return r; 3278 } 3279 3280 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) 3281 { 3282 unsigned long val; 3283 3284 vcpu->run->exit_reason = KVM_EXIT_IO; 3285 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3286 vcpu->run->io.size = vcpu->arch.pio.size = size; 3287 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3288 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 3289 vcpu->run->io.port = vcpu->arch.pio.port = port; 3290 vcpu->arch.pio.in = in; 3291 vcpu->arch.pio.string = 0; 3292 vcpu->arch.pio.down = 0; 3293 vcpu->arch.pio.rep = 0; 3294 3295 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3296 size, 1); 3297 3298 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3299 memcpy(vcpu->arch.pio_data, &val, 4); 3300 3301 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3302 complete_pio(vcpu); 3303 return 1; 3304 } 3305 return 0; 3306 } 3307 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3308 3309 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3310 int size, unsigned long count, int down, 3311 gva_t address, int rep, unsigned port) 3312 { 3313 unsigned now, in_page; 3314 int ret = 0; 3315 3316 vcpu->run->exit_reason = KVM_EXIT_IO; 3317 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3318 vcpu->run->io.size = vcpu->arch.pio.size = size; 3319 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3320 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 3321 vcpu->run->io.port = vcpu->arch.pio.port = port; 3322 vcpu->arch.pio.in = in; 3323 vcpu->arch.pio.string = 1; 3324 vcpu->arch.pio.down = down; 3325 vcpu->arch.pio.rep = rep; 3326 3327 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3328 size, count); 3329 3330 if (!count) { 3331 kvm_x86_ops->skip_emulated_instruction(vcpu); 3332 return 1; 3333 } 3334 3335 if (!down) 3336 in_page = PAGE_SIZE - offset_in_page(address); 3337 else 3338 in_page = offset_in_page(address) + size; 3339 now = min(count, (unsigned long)in_page / size); 3340 if (!now) 3341 now = 1; 3342 if (down) { 3343 /* 3344 * String I/O in reverse. Yuck. Kill the guest, fix later. 3345 */ 3346 pr_unimpl(vcpu, "guest string pio down\n"); 3347 kvm_inject_gp(vcpu, 0); 3348 return 1; 3349 } 3350 vcpu->run->io.count = now; 3351 vcpu->arch.pio.cur_count = now; 3352 3353 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 3354 kvm_x86_ops->skip_emulated_instruction(vcpu); 3355 3356 vcpu->arch.pio.guest_gva = address; 3357 3358 if (!vcpu->arch.pio.in) { 3359 /* string PIO write */ 3360 ret = pio_copy_data(vcpu); 3361 if (ret == X86EMUL_PROPAGATE_FAULT) { 3362 kvm_inject_gp(vcpu, 0); 3363 return 1; 3364 } 3365 if (ret == 0 && !pio_string_write(vcpu)) { 3366 complete_pio(vcpu); 3367 if (vcpu->arch.pio.count == 0) 3368 ret = 1; 3369 } 3370 } 3371 /* no string PIO read support yet */ 3372 3373 return ret; 3374 } 3375 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3376 3377 static void bounce_off(void *info) 3378 { 3379 /* nothing */ 3380 } 3381 3382 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3383 void *data) 3384 { 3385 struct cpufreq_freqs *freq = data; 3386 struct kvm *kvm; 3387 struct kvm_vcpu *vcpu; 3388 int i, send_ipi = 0; 3389 3390 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3391 return 0; 3392 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3393 return 0; 3394 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; 3395 3396 spin_lock(&kvm_lock); 3397 list_for_each_entry(kvm, &vm_list, vm_list) { 3398 kvm_for_each_vcpu(i, vcpu, kvm) { 3399 if (vcpu->cpu != freq->cpu) 3400 continue; 3401 if (!kvm_request_guest_time_update(vcpu)) 3402 continue; 3403 if (vcpu->cpu != smp_processor_id()) 3404 send_ipi++; 3405 } 3406 } 3407 spin_unlock(&kvm_lock); 3408 3409 if (freq->old < freq->new && send_ipi) { 3410 /* 3411 * We upscale the frequency. Must make the guest 3412 * doesn't see old kvmclock values while running with 3413 * the new frequency, otherwise we risk the guest sees 3414 * time go backwards. 3415 * 3416 * In case we update the frequency for another cpu 3417 * (which might be in guest context) send an interrupt 3418 * to kick the cpu out of guest context. Next time 3419 * guest context is entered kvmclock will be updated, 3420 * so the guest will not see stale values. 3421 */ 3422 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 3423 } 3424 return 0; 3425 } 3426 3427 static struct notifier_block kvmclock_cpufreq_notifier_block = { 3428 .notifier_call = kvmclock_cpufreq_notifier 3429 }; 3430 3431 static void kvm_timer_init(void) 3432 { 3433 int cpu; 3434 3435 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 3436 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 3437 CPUFREQ_TRANSITION_NOTIFIER); 3438 for_each_online_cpu(cpu) { 3439 unsigned long khz = cpufreq_get(cpu); 3440 if (!khz) 3441 khz = tsc_khz; 3442 per_cpu(cpu_tsc_khz, cpu) = khz; 3443 } 3444 } else { 3445 for_each_possible_cpu(cpu) 3446 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 3447 } 3448 } 3449 3450 int kvm_arch_init(void *opaque) 3451 { 3452 int r; 3453 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3454 3455 if (kvm_x86_ops) { 3456 printk(KERN_ERR "kvm: already loaded the other module\n"); 3457 r = -EEXIST; 3458 goto out; 3459 } 3460 3461 if (!ops->cpu_has_kvm_support()) { 3462 printk(KERN_ERR "kvm: no hardware support\n"); 3463 r = -EOPNOTSUPP; 3464 goto out; 3465 } 3466 if (ops->disabled_by_bios()) { 3467 printk(KERN_ERR "kvm: disabled by bios\n"); 3468 r = -EOPNOTSUPP; 3469 goto out; 3470 } 3471 3472 r = kvm_mmu_module_init(); 3473 if (r) 3474 goto out; 3475 3476 kvm_init_msr_list(); 3477 3478 kvm_x86_ops = ops; 3479 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 3480 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 3481 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3482 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3483 3484 kvm_timer_init(); 3485 3486 return 0; 3487 3488 out: 3489 return r; 3490 } 3491 3492 void kvm_arch_exit(void) 3493 { 3494 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 3495 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 3496 CPUFREQ_TRANSITION_NOTIFIER); 3497 kvm_x86_ops = NULL; 3498 kvm_mmu_module_exit(); 3499 } 3500 3501 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3502 { 3503 ++vcpu->stat.halt_exits; 3504 if (irqchip_in_kernel(vcpu->kvm)) { 3505 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3506 return 1; 3507 } else { 3508 vcpu->run->exit_reason = KVM_EXIT_HLT; 3509 return 0; 3510 } 3511 } 3512 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 3513 3514 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 3515 unsigned long a1) 3516 { 3517 if (is_long_mode(vcpu)) 3518 return a0; 3519 else 3520 return a0 | ((gpa_t)a1 << 32); 3521 } 3522 3523 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3524 { 3525 unsigned long nr, a0, a1, a2, a3, ret; 3526 int r = 1; 3527 3528 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3529 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3530 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3531 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3532 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3533 3534 trace_kvm_hypercall(nr, a0, a1, a2, a3); 3535 3536 if (!is_long_mode(vcpu)) { 3537 nr &= 0xFFFFFFFF; 3538 a0 &= 0xFFFFFFFF; 3539 a1 &= 0xFFFFFFFF; 3540 a2 &= 0xFFFFFFFF; 3541 a3 &= 0xFFFFFFFF; 3542 } 3543 3544 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 3545 ret = -KVM_EPERM; 3546 goto out; 3547 } 3548 3549 switch (nr) { 3550 case KVM_HC_VAPIC_POLL_IRQ: 3551 ret = 0; 3552 break; 3553 case KVM_HC_MMU_OP: 3554 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 3555 break; 3556 default: 3557 ret = -KVM_ENOSYS; 3558 break; 3559 } 3560 out: 3561 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3562 ++vcpu->stat.hypercalls; 3563 return r; 3564 } 3565 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 3566 3567 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3568 { 3569 char instruction[3]; 3570 int ret = 0; 3571 unsigned long rip = kvm_rip_read(vcpu); 3572 3573 3574 /* 3575 * Blow out the MMU to ensure that no other VCPU has an active mapping 3576 * to ensure that the updated hypercall appears atomically across all 3577 * VCPUs. 3578 */ 3579 kvm_mmu_zap_all(vcpu->kvm); 3580 3581 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3582 if (emulator_write_emulated(rip, instruction, 3, vcpu) 3583 != X86EMUL_CONTINUE) 3584 ret = -EFAULT; 3585 3586 return ret; 3587 } 3588 3589 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3590 { 3591 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 3592 } 3593 3594 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3595 { 3596 struct descriptor_table dt = { limit, base }; 3597 3598 kvm_x86_ops->set_gdt(vcpu, &dt); 3599 } 3600 3601 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 3602 { 3603 struct descriptor_table dt = { limit, base }; 3604 3605 kvm_x86_ops->set_idt(vcpu, &dt); 3606 } 3607 3608 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 3609 unsigned long *rflags) 3610 { 3611 kvm_lmsw(vcpu, msw); 3612 *rflags = kvm_get_rflags(vcpu); 3613 } 3614 3615 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3616 { 3617 unsigned long value; 3618 3619 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3620 switch (cr) { 3621 case 0: 3622 value = vcpu->arch.cr0; 3623 break; 3624 case 2: 3625 value = vcpu->arch.cr2; 3626 break; 3627 case 3: 3628 value = vcpu->arch.cr3; 3629 break; 3630 case 4: 3631 value = vcpu->arch.cr4; 3632 break; 3633 case 8: 3634 value = kvm_get_cr8(vcpu); 3635 break; 3636 default: 3637 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3638 return 0; 3639 } 3640 3641 return value; 3642 } 3643 3644 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3645 unsigned long *rflags) 3646 { 3647 switch (cr) { 3648 case 0: 3649 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3650 *rflags = kvm_get_rflags(vcpu); 3651 break; 3652 case 2: 3653 vcpu->arch.cr2 = val; 3654 break; 3655 case 3: 3656 kvm_set_cr3(vcpu, val); 3657 break; 3658 case 4: 3659 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 3660 break; 3661 case 8: 3662 kvm_set_cr8(vcpu, val & 0xfUL); 3663 break; 3664 default: 3665 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3666 } 3667 } 3668 3669 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 3670 { 3671 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 3672 int j, nent = vcpu->arch.cpuid_nent; 3673 3674 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 3675 /* when no next entry is found, the current entry[i] is reselected */ 3676 for (j = i + 1; ; j = (j + 1) % nent) { 3677 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 3678 if (ej->function == e->function) { 3679 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 3680 return j; 3681 } 3682 } 3683 return 0; /* silence gcc, even though control never reaches here */ 3684 } 3685 3686 /* find an entry with matching function, matching index (if needed), and that 3687 * should be read next (if it's stateful) */ 3688 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3689 u32 function, u32 index) 3690 { 3691 if (e->function != function) 3692 return 0; 3693 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3694 return 0; 3695 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3696 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3697 return 0; 3698 return 1; 3699 } 3700 3701 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3702 u32 function, u32 index) 3703 { 3704 int i; 3705 struct kvm_cpuid_entry2 *best = NULL; 3706 3707 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3708 struct kvm_cpuid_entry2 *e; 3709 3710 e = &vcpu->arch.cpuid_entries[i]; 3711 if (is_matching_cpuid_entry(e, function, index)) { 3712 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 3713 move_to_next_stateful_cpuid_entry(vcpu, i); 3714 best = e; 3715 break; 3716 } 3717 /* 3718 * Both basic or both extended? 3719 */ 3720 if (((e->function ^ function) & 0x80000000) == 0) 3721 if (!best || e->function > best->function) 3722 best = e; 3723 } 3724 return best; 3725 } 3726 3727 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3728 { 3729 struct kvm_cpuid_entry2 *best; 3730 3731 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3732 if (best) 3733 return best->eax & 0xff; 3734 return 36; 3735 } 3736 3737 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3738 { 3739 u32 function, index; 3740 struct kvm_cpuid_entry2 *best; 3741 3742 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3743 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3744 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3745 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3746 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3747 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3748 best = kvm_find_cpuid_entry(vcpu, function, index); 3749 if (best) { 3750 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3751 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3752 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3753 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3754 } 3755 kvm_x86_ops->skip_emulated_instruction(vcpu); 3756 trace_kvm_cpuid(function, 3757 kvm_register_read(vcpu, VCPU_REGS_RAX), 3758 kvm_register_read(vcpu, VCPU_REGS_RBX), 3759 kvm_register_read(vcpu, VCPU_REGS_RCX), 3760 kvm_register_read(vcpu, VCPU_REGS_RDX)); 3761 } 3762 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3763 3764 /* 3765 * Check if userspace requested an interrupt window, and that the 3766 * interrupt window is open. 3767 * 3768 * No need to exit to userspace if we already have an interrupt queued. 3769 */ 3770 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 3771 { 3772 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3773 vcpu->run->request_interrupt_window && 3774 kvm_arch_interrupt_allowed(vcpu)); 3775 } 3776 3777 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 3778 { 3779 struct kvm_run *kvm_run = vcpu->run; 3780 3781 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3782 kvm_run->cr8 = kvm_get_cr8(vcpu); 3783 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3784 if (irqchip_in_kernel(vcpu->kvm)) 3785 kvm_run->ready_for_interrupt_injection = 1; 3786 else 3787 kvm_run->ready_for_interrupt_injection = 3788 kvm_arch_interrupt_allowed(vcpu) && 3789 !kvm_cpu_has_interrupt(vcpu) && 3790 !kvm_event_needs_reinjection(vcpu); 3791 } 3792 3793 static void vapic_enter(struct kvm_vcpu *vcpu) 3794 { 3795 struct kvm_lapic *apic = vcpu->arch.apic; 3796 struct page *page; 3797 3798 if (!apic || !apic->vapic_addr) 3799 return; 3800 3801 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3802 3803 vcpu->arch.apic->vapic_page = page; 3804 } 3805 3806 static void vapic_exit(struct kvm_vcpu *vcpu) 3807 { 3808 struct kvm_lapic *apic = vcpu->arch.apic; 3809 3810 if (!apic || !apic->vapic_addr) 3811 return; 3812 3813 down_read(&vcpu->kvm->slots_lock); 3814 kvm_release_page_dirty(apic->vapic_page); 3815 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3816 up_read(&vcpu->kvm->slots_lock); 3817 } 3818 3819 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3820 { 3821 int max_irr, tpr; 3822 3823 if (!kvm_x86_ops->update_cr8_intercept) 3824 return; 3825 3826 if (!vcpu->arch.apic) 3827 return; 3828 3829 if (!vcpu->arch.apic->vapic_addr) 3830 max_irr = kvm_lapic_find_highest_irr(vcpu); 3831 else 3832 max_irr = -1; 3833 3834 if (max_irr != -1) 3835 max_irr >>= 4; 3836 3837 tpr = kvm_lapic_get_cr8(vcpu); 3838 3839 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3840 } 3841 3842 static void inject_pending_event(struct kvm_vcpu *vcpu) 3843 { 3844 /* try to reinject previous events if any */ 3845 if (vcpu->arch.exception.pending) { 3846 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 3847 vcpu->arch.exception.has_error_code, 3848 vcpu->arch.exception.error_code); 3849 return; 3850 } 3851 3852 if (vcpu->arch.nmi_injected) { 3853 kvm_x86_ops->set_nmi(vcpu); 3854 return; 3855 } 3856 3857 if (vcpu->arch.interrupt.pending) { 3858 kvm_x86_ops->set_irq(vcpu); 3859 return; 3860 } 3861 3862 /* try to inject new event if pending */ 3863 if (vcpu->arch.nmi_pending) { 3864 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3865 vcpu->arch.nmi_pending = false; 3866 vcpu->arch.nmi_injected = true; 3867 kvm_x86_ops->set_nmi(vcpu); 3868 } 3869 } else if (kvm_cpu_has_interrupt(vcpu)) { 3870 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3871 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3872 false); 3873 kvm_x86_ops->set_irq(vcpu); 3874 } 3875 } 3876 } 3877 3878 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 3879 { 3880 int r; 3881 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3882 vcpu->run->request_interrupt_window; 3883 3884 if (vcpu->requests) 3885 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3886 kvm_mmu_unload(vcpu); 3887 3888 r = kvm_mmu_reload(vcpu); 3889 if (unlikely(r)) 3890 goto out; 3891 3892 if (vcpu->requests) { 3893 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3894 __kvm_migrate_timers(vcpu); 3895 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3896 kvm_write_guest_time(vcpu); 3897 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3898 kvm_mmu_sync_roots(vcpu); 3899 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3900 kvm_x86_ops->tlb_flush(vcpu); 3901 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3902 &vcpu->requests)) { 3903 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 3904 r = 0; 3905 goto out; 3906 } 3907 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3908 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 3909 r = 0; 3910 goto out; 3911 } 3912 } 3913 3914 preempt_disable(); 3915 3916 kvm_x86_ops->prepare_guest_switch(vcpu); 3917 kvm_load_guest_fpu(vcpu); 3918 3919 local_irq_disable(); 3920 3921 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3922 smp_mb__after_clear_bit(); 3923 3924 if (vcpu->requests || need_resched() || signal_pending(current)) { 3925 set_bit(KVM_REQ_KICK, &vcpu->requests); 3926 local_irq_enable(); 3927 preempt_enable(); 3928 r = 1; 3929 goto out; 3930 } 3931 3932 inject_pending_event(vcpu); 3933 3934 /* enable NMI/IRQ window open exits if needed */ 3935 if (vcpu->arch.nmi_pending) 3936 kvm_x86_ops->enable_nmi_window(vcpu); 3937 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3938 kvm_x86_ops->enable_irq_window(vcpu); 3939 3940 if (kvm_lapic_enabled(vcpu)) { 3941 update_cr8_intercept(vcpu); 3942 kvm_lapic_sync_to_vapic(vcpu); 3943 } 3944 3945 up_read(&vcpu->kvm->slots_lock); 3946 3947 kvm_guest_enter(); 3948 3949 if (unlikely(vcpu->arch.switch_db_regs)) { 3950 set_debugreg(0, 7); 3951 set_debugreg(vcpu->arch.eff_db[0], 0); 3952 set_debugreg(vcpu->arch.eff_db[1], 1); 3953 set_debugreg(vcpu->arch.eff_db[2], 2); 3954 set_debugreg(vcpu->arch.eff_db[3], 3); 3955 } 3956 3957 trace_kvm_entry(vcpu->vcpu_id); 3958 kvm_x86_ops->run(vcpu); 3959 3960 /* 3961 * If the guest has used debug registers, at least dr7 3962 * will be disabled while returning to the host. 3963 * If we don't have active breakpoints in the host, we don't 3964 * care about the messed up debug address registers. But if 3965 * we have some of them active, restore the old state. 3966 */ 3967 if (hw_breakpoint_active()) 3968 hw_breakpoint_restore(); 3969 3970 set_bit(KVM_REQ_KICK, &vcpu->requests); 3971 local_irq_enable(); 3972 3973 ++vcpu->stat.exits; 3974 3975 /* 3976 * We must have an instruction between local_irq_enable() and 3977 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3978 * the interrupt shadow. The stat.exits increment will do nicely. 3979 * But we need to prevent reordering, hence this barrier(): 3980 */ 3981 barrier(); 3982 3983 kvm_guest_exit(); 3984 3985 preempt_enable(); 3986 3987 down_read(&vcpu->kvm->slots_lock); 3988 3989 /* 3990 * Profile KVM exit RIPs: 3991 */ 3992 if (unlikely(prof_on == KVM_PROFILING)) { 3993 unsigned long rip = kvm_rip_read(vcpu); 3994 profile_hit(KVM_PROFILING, (void *)rip); 3995 } 3996 3997 3998 kvm_lapic_sync_from_vapic(vcpu); 3999 4000 r = kvm_x86_ops->handle_exit(vcpu); 4001 out: 4002 return r; 4003 } 4004 4005 4006 static int __vcpu_run(struct kvm_vcpu *vcpu) 4007 { 4008 int r; 4009 4010 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4011 pr_debug("vcpu %d received sipi with vector # %x\n", 4012 vcpu->vcpu_id, vcpu->arch.sipi_vector); 4013 kvm_lapic_reset(vcpu); 4014 r = kvm_arch_vcpu_reset(vcpu); 4015 if (r) 4016 return r; 4017 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4018 } 4019 4020 down_read(&vcpu->kvm->slots_lock); 4021 vapic_enter(vcpu); 4022 4023 r = 1; 4024 while (r > 0) { 4025 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4026 r = vcpu_enter_guest(vcpu); 4027 else { 4028 up_read(&vcpu->kvm->slots_lock); 4029 kvm_vcpu_block(vcpu); 4030 down_read(&vcpu->kvm->slots_lock); 4031 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4032 { 4033 switch(vcpu->arch.mp_state) { 4034 case KVM_MP_STATE_HALTED: 4035 vcpu->arch.mp_state = 4036 KVM_MP_STATE_RUNNABLE; 4037 case KVM_MP_STATE_RUNNABLE: 4038 break; 4039 case KVM_MP_STATE_SIPI_RECEIVED: 4040 default: 4041 r = -EINTR; 4042 break; 4043 } 4044 } 4045 } 4046 4047 if (r <= 0) 4048 break; 4049 4050 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 4051 if (kvm_cpu_has_pending_timer(vcpu)) 4052 kvm_inject_pending_timer_irqs(vcpu); 4053 4054 if (dm_request_for_irq_injection(vcpu)) { 4055 r = -EINTR; 4056 vcpu->run->exit_reason = KVM_EXIT_INTR; 4057 ++vcpu->stat.request_irq_exits; 4058 } 4059 if (signal_pending(current)) { 4060 r = -EINTR; 4061 vcpu->run->exit_reason = KVM_EXIT_INTR; 4062 ++vcpu->stat.signal_exits; 4063 } 4064 if (need_resched()) { 4065 up_read(&vcpu->kvm->slots_lock); 4066 kvm_resched(vcpu); 4067 down_read(&vcpu->kvm->slots_lock); 4068 } 4069 } 4070 4071 up_read(&vcpu->kvm->slots_lock); 4072 post_kvm_run_save(vcpu); 4073 4074 vapic_exit(vcpu); 4075 4076 return r; 4077 } 4078 4079 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4080 { 4081 int r; 4082 sigset_t sigsaved; 4083 4084 vcpu_load(vcpu); 4085 4086 if (vcpu->sigset_active) 4087 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4088 4089 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 4090 kvm_vcpu_block(vcpu); 4091 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 4092 r = -EAGAIN; 4093 goto out; 4094 } 4095 4096 /* re-sync apic's tpr */ 4097 if (!irqchip_in_kernel(vcpu->kvm)) 4098 kvm_set_cr8(vcpu, kvm_run->cr8); 4099 4100 if (vcpu->arch.pio.cur_count) { 4101 r = complete_pio(vcpu); 4102 if (r) 4103 goto out; 4104 } 4105 if (vcpu->mmio_needed) { 4106 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4107 vcpu->mmio_read_completed = 1; 4108 vcpu->mmio_needed = 0; 4109 4110 down_read(&vcpu->kvm->slots_lock); 4111 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4112 EMULTYPE_NO_DECODE); 4113 up_read(&vcpu->kvm->slots_lock); 4114 if (r == EMULATE_DO_MMIO) { 4115 /* 4116 * Read-modify-write. Back to userspace. 4117 */ 4118 r = 0; 4119 goto out; 4120 } 4121 } 4122 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4123 kvm_register_write(vcpu, VCPU_REGS_RAX, 4124 kvm_run->hypercall.ret); 4125 4126 r = __vcpu_run(vcpu); 4127 4128 out: 4129 if (vcpu->sigset_active) 4130 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4131 4132 vcpu_put(vcpu); 4133 return r; 4134 } 4135 4136 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4137 { 4138 vcpu_load(vcpu); 4139 4140 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4141 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4142 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4143 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4144 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4145 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4146 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4147 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4148 #ifdef CONFIG_X86_64 4149 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 4150 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 4151 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 4152 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 4153 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 4154 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 4155 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 4156 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 4157 #endif 4158 4159 regs->rip = kvm_rip_read(vcpu); 4160 regs->rflags = kvm_get_rflags(vcpu); 4161 4162 vcpu_put(vcpu); 4163 4164 return 0; 4165 } 4166 4167 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4168 { 4169 vcpu_load(vcpu); 4170 4171 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4172 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4173 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4174 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 4175 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 4176 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 4177 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 4178 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 4179 #ifdef CONFIG_X86_64 4180 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 4181 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 4182 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 4183 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 4184 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 4185 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4186 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4187 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4188 #endif 4189 4190 kvm_rip_write(vcpu, regs->rip); 4191 kvm_set_rflags(vcpu, regs->rflags); 4192 4193 vcpu->arch.exception.pending = false; 4194 4195 vcpu_put(vcpu); 4196 4197 return 0; 4198 } 4199 4200 void kvm_get_segment(struct kvm_vcpu *vcpu, 4201 struct kvm_segment *var, int seg) 4202 { 4203 kvm_x86_ops->get_segment(vcpu, var, seg); 4204 } 4205 4206 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4207 { 4208 struct kvm_segment cs; 4209 4210 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 4211 *db = cs.db; 4212 *l = cs.l; 4213 } 4214 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 4215 4216 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4217 struct kvm_sregs *sregs) 4218 { 4219 struct descriptor_table dt; 4220 4221 vcpu_load(vcpu); 4222 4223 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4224 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4225 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4226 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4227 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4228 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4229 4230 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4231 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4232 4233 kvm_x86_ops->get_idt(vcpu, &dt); 4234 sregs->idt.limit = dt.limit; 4235 sregs->idt.base = dt.base; 4236 kvm_x86_ops->get_gdt(vcpu, &dt); 4237 sregs->gdt.limit = dt.limit; 4238 sregs->gdt.base = dt.base; 4239 4240 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4241 sregs->cr0 = vcpu->arch.cr0; 4242 sregs->cr2 = vcpu->arch.cr2; 4243 sregs->cr3 = vcpu->arch.cr3; 4244 sregs->cr4 = vcpu->arch.cr4; 4245 sregs->cr8 = kvm_get_cr8(vcpu); 4246 sregs->efer = vcpu->arch.shadow_efer; 4247 sregs->apic_base = kvm_get_apic_base(vcpu); 4248 4249 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4250 4251 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 4252 set_bit(vcpu->arch.interrupt.nr, 4253 (unsigned long *)sregs->interrupt_bitmap); 4254 4255 vcpu_put(vcpu); 4256 4257 return 0; 4258 } 4259 4260 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 4261 struct kvm_mp_state *mp_state) 4262 { 4263 vcpu_load(vcpu); 4264 mp_state->mp_state = vcpu->arch.mp_state; 4265 vcpu_put(vcpu); 4266 return 0; 4267 } 4268 4269 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 4270 struct kvm_mp_state *mp_state) 4271 { 4272 vcpu_load(vcpu); 4273 vcpu->arch.mp_state = mp_state->mp_state; 4274 vcpu_put(vcpu); 4275 return 0; 4276 } 4277 4278 static void kvm_set_segment(struct kvm_vcpu *vcpu, 4279 struct kvm_segment *var, int seg) 4280 { 4281 kvm_x86_ops->set_segment(vcpu, var, seg); 4282 } 4283 4284 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 4285 struct kvm_segment *kvm_desct) 4286 { 4287 kvm_desct->base = get_desc_base(seg_desc); 4288 kvm_desct->limit = get_desc_limit(seg_desc); 4289 if (seg_desc->g) { 4290 kvm_desct->limit <<= 12; 4291 kvm_desct->limit |= 0xfff; 4292 } 4293 kvm_desct->selector = selector; 4294 kvm_desct->type = seg_desc->type; 4295 kvm_desct->present = seg_desc->p; 4296 kvm_desct->dpl = seg_desc->dpl; 4297 kvm_desct->db = seg_desc->d; 4298 kvm_desct->s = seg_desc->s; 4299 kvm_desct->l = seg_desc->l; 4300 kvm_desct->g = seg_desc->g; 4301 kvm_desct->avl = seg_desc->avl; 4302 if (!selector) 4303 kvm_desct->unusable = 1; 4304 else 4305 kvm_desct->unusable = 0; 4306 kvm_desct->padding = 0; 4307 } 4308 4309 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 4310 u16 selector, 4311 struct descriptor_table *dtable) 4312 { 4313 if (selector & 1 << 2) { 4314 struct kvm_segment kvm_seg; 4315 4316 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 4317 4318 if (kvm_seg.unusable) 4319 dtable->limit = 0; 4320 else 4321 dtable->limit = kvm_seg.limit; 4322 dtable->base = kvm_seg.base; 4323 } 4324 else 4325 kvm_x86_ops->get_gdt(vcpu, dtable); 4326 } 4327 4328 /* allowed just for 8 bytes segments */ 4329 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4330 struct desc_struct *seg_desc) 4331 { 4332 struct descriptor_table dtable; 4333 u16 index = selector >> 3; 4334 4335 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4336 4337 if (dtable.limit < index * 8 + 7) { 4338 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4339 return 1; 4340 } 4341 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4342 } 4343 4344 /* allowed just for 8 bytes segments */ 4345 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4346 struct desc_struct *seg_desc) 4347 { 4348 struct descriptor_table dtable; 4349 u16 index = selector >> 3; 4350 4351 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4352 4353 if (dtable.limit < index * 8 + 7) 4354 return 1; 4355 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4356 } 4357 4358 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4359 struct desc_struct *seg_desc) 4360 { 4361 u32 base_addr = get_desc_base(seg_desc); 4362 4363 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4364 } 4365 4366 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4367 { 4368 struct kvm_segment kvm_seg; 4369 4370 kvm_get_segment(vcpu, &kvm_seg, seg); 4371 return kvm_seg.selector; 4372 } 4373 4374 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 4375 u16 selector, 4376 struct kvm_segment *kvm_seg) 4377 { 4378 struct desc_struct seg_desc; 4379 4380 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 4381 return 1; 4382 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 4383 return 0; 4384 } 4385 4386 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4387 { 4388 struct kvm_segment segvar = { 4389 .base = selector << 4, 4390 .limit = 0xffff, 4391 .selector = selector, 4392 .type = 3, 4393 .present = 1, 4394 .dpl = 3, 4395 .db = 0, 4396 .s = 1, 4397 .l = 0, 4398 .g = 0, 4399 .avl = 0, 4400 .unusable = 0, 4401 }; 4402 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4403 return 0; 4404 } 4405 4406 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4407 { 4408 return (seg != VCPU_SREG_LDTR) && 4409 (seg != VCPU_SREG_TR) && 4410 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); 4411 } 4412 4413 static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg, 4414 u16 selector) 4415 { 4416 /* NULL selector is not valid for CS and SS */ 4417 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 4418 if (!selector) 4419 kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3); 4420 } 4421 4422 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4423 int type_bits, int seg) 4424 { 4425 struct kvm_segment kvm_seg; 4426 4427 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4428 return kvm_load_realmode_segment(vcpu, selector, seg); 4429 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4430 return 1; 4431 4432 kvm_check_segment_descriptor(vcpu, seg, selector); 4433 kvm_seg.type |= type_bits; 4434 4435 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4436 seg != VCPU_SREG_LDTR) 4437 if (!kvm_seg.s) 4438 kvm_seg.unusable = 1; 4439 4440 kvm_set_segment(vcpu, &kvm_seg, seg); 4441 return 0; 4442 } 4443 4444 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4445 struct tss_segment_32 *tss) 4446 { 4447 tss->cr3 = vcpu->arch.cr3; 4448 tss->eip = kvm_rip_read(vcpu); 4449 tss->eflags = kvm_get_rflags(vcpu); 4450 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4451 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4452 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4453 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4454 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4455 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4456 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4457 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4458 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4459 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4460 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4461 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4462 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 4463 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 4464 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4465 } 4466 4467 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4468 struct tss_segment_32 *tss) 4469 { 4470 kvm_set_cr3(vcpu, tss->cr3); 4471 4472 kvm_rip_write(vcpu, tss->eip); 4473 kvm_set_rflags(vcpu, tss->eflags | 2); 4474 4475 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4476 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4477 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 4478 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 4479 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 4480 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 4481 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4482 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4483 4484 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4485 return 1; 4486 4487 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4488 return 1; 4489 4490 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4491 return 1; 4492 4493 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4494 return 1; 4495 4496 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4497 return 1; 4498 4499 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4500 return 1; 4501 4502 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4503 return 1; 4504 return 0; 4505 } 4506 4507 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 4508 struct tss_segment_16 *tss) 4509 { 4510 tss->ip = kvm_rip_read(vcpu); 4511 tss->flag = kvm_get_rflags(vcpu); 4512 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4513 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4514 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4515 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4516 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4517 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4518 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 4519 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 4520 4521 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 4522 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 4523 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4524 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4525 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4526 } 4527 4528 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4529 struct tss_segment_16 *tss) 4530 { 4531 kvm_rip_write(vcpu, tss->ip); 4532 kvm_set_rflags(vcpu, tss->flag | 2); 4533 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4534 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4535 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4536 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 4537 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 4538 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 4539 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 4540 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 4541 4542 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 4543 return 1; 4544 4545 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4546 return 1; 4547 4548 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4549 return 1; 4550 4551 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4552 return 1; 4553 4554 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4555 return 1; 4556 return 0; 4557 } 4558 4559 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 4560 u16 old_tss_sel, u32 old_tss_base, 4561 struct desc_struct *nseg_desc) 4562 { 4563 struct tss_segment_16 tss_segment_16; 4564 int ret = 0; 4565 4566 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 4567 sizeof tss_segment_16)) 4568 goto out; 4569 4570 save_state_to_tss16(vcpu, &tss_segment_16); 4571 4572 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 4573 sizeof tss_segment_16)) 4574 goto out; 4575 4576 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 4577 &tss_segment_16, sizeof tss_segment_16)) 4578 goto out; 4579 4580 if (old_tss_sel != 0xffff) { 4581 tss_segment_16.prev_task_link = old_tss_sel; 4582 4583 if (kvm_write_guest(vcpu->kvm, 4584 get_tss_base_addr(vcpu, nseg_desc), 4585 &tss_segment_16.prev_task_link, 4586 sizeof tss_segment_16.prev_task_link)) 4587 goto out; 4588 } 4589 4590 if (load_state_from_tss16(vcpu, &tss_segment_16)) 4591 goto out; 4592 4593 ret = 1; 4594 out: 4595 return ret; 4596 } 4597 4598 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 4599 u16 old_tss_sel, u32 old_tss_base, 4600 struct desc_struct *nseg_desc) 4601 { 4602 struct tss_segment_32 tss_segment_32; 4603 int ret = 0; 4604 4605 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 4606 sizeof tss_segment_32)) 4607 goto out; 4608 4609 save_state_to_tss32(vcpu, &tss_segment_32); 4610 4611 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 4612 sizeof tss_segment_32)) 4613 goto out; 4614 4615 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 4616 &tss_segment_32, sizeof tss_segment_32)) 4617 goto out; 4618 4619 if (old_tss_sel != 0xffff) { 4620 tss_segment_32.prev_task_link = old_tss_sel; 4621 4622 if (kvm_write_guest(vcpu->kvm, 4623 get_tss_base_addr(vcpu, nseg_desc), 4624 &tss_segment_32.prev_task_link, 4625 sizeof tss_segment_32.prev_task_link)) 4626 goto out; 4627 } 4628 4629 if (load_state_from_tss32(vcpu, &tss_segment_32)) 4630 goto out; 4631 4632 ret = 1; 4633 out: 4634 return ret; 4635 } 4636 4637 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 4638 { 4639 struct kvm_segment tr_seg; 4640 struct desc_struct cseg_desc; 4641 struct desc_struct nseg_desc; 4642 int ret = 0; 4643 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 4644 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 4645 4646 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 4647 4648 /* FIXME: Handle errors. Failure to read either TSS or their 4649 * descriptors should generate a pagefault. 4650 */ 4651 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 4652 goto out; 4653 4654 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 4655 goto out; 4656 4657 if (reason != TASK_SWITCH_IRET) { 4658 int cpl; 4659 4660 cpl = kvm_x86_ops->get_cpl(vcpu); 4661 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 4662 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4663 return 1; 4664 } 4665 } 4666 4667 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 4668 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 4669 return 1; 4670 } 4671 4672 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 4673 cseg_desc.type &= ~(1 << 1); //clear the B flag 4674 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 4675 } 4676 4677 if (reason == TASK_SWITCH_IRET) { 4678 u32 eflags = kvm_get_rflags(vcpu); 4679 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4680 } 4681 4682 /* set back link to prev task only if NT bit is set in eflags 4683 note that old_tss_sel is not used afetr this point */ 4684 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4685 old_tss_sel = 0xffff; 4686 4687 if (nseg_desc.type & 8) 4688 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4689 old_tss_base, &nseg_desc); 4690 else 4691 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, 4692 old_tss_base, &nseg_desc); 4693 4694 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4695 u32 eflags = kvm_get_rflags(vcpu); 4696 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4697 } 4698 4699 if (reason != TASK_SWITCH_IRET) { 4700 nseg_desc.type |= (1 << 1); 4701 save_guest_segment_descriptor(vcpu, tss_selector, 4702 &nseg_desc); 4703 } 4704 4705 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 4706 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 4707 tr_seg.type = 11; 4708 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 4709 out: 4710 return ret; 4711 } 4712 EXPORT_SYMBOL_GPL(kvm_task_switch); 4713 4714 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 4715 struct kvm_sregs *sregs) 4716 { 4717 int mmu_reset_needed = 0; 4718 int pending_vec, max_bits; 4719 struct descriptor_table dt; 4720 4721 vcpu_load(vcpu); 4722 4723 dt.limit = sregs->idt.limit; 4724 dt.base = sregs->idt.base; 4725 kvm_x86_ops->set_idt(vcpu, &dt); 4726 dt.limit = sregs->gdt.limit; 4727 dt.base = sregs->gdt.base; 4728 kvm_x86_ops->set_gdt(vcpu, &dt); 4729 4730 vcpu->arch.cr2 = sregs->cr2; 4731 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4732 vcpu->arch.cr3 = sregs->cr3; 4733 4734 kvm_set_cr8(vcpu, sregs->cr8); 4735 4736 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 4737 kvm_x86_ops->set_efer(vcpu, sregs->efer); 4738 kvm_set_apic_base(vcpu, sregs->apic_base); 4739 4740 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4741 4742 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 4743 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 4744 vcpu->arch.cr0 = sregs->cr0; 4745 4746 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4747 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4748 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 4749 load_pdptrs(vcpu, vcpu->arch.cr3); 4750 mmu_reset_needed = 1; 4751 } 4752 4753 if (mmu_reset_needed) 4754 kvm_mmu_reset_context(vcpu); 4755 4756 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 4757 pending_vec = find_first_bit( 4758 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 4759 if (pending_vec < max_bits) { 4760 kvm_queue_interrupt(vcpu, pending_vec, false); 4761 pr_debug("Set back pending irq %d\n", pending_vec); 4762 if (irqchip_in_kernel(vcpu->kvm)) 4763 kvm_pic_clear_isr_ack(vcpu->kvm); 4764 } 4765 4766 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4767 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4768 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4769 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4770 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4771 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4772 4773 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4774 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4775 4776 update_cr8_intercept(vcpu); 4777 4778 /* Older userspace won't unhalt the vcpu on reset. */ 4779 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 4780 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4781 !(vcpu->arch.cr0 & X86_CR0_PE)) 4782 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4783 4784 vcpu_put(vcpu); 4785 4786 return 0; 4787 } 4788 4789 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4790 struct kvm_guest_debug *dbg) 4791 { 4792 unsigned long rflags; 4793 int i, r; 4794 4795 vcpu_load(vcpu); 4796 4797 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 4798 r = -EBUSY; 4799 if (vcpu->arch.exception.pending) 4800 goto unlock_out; 4801 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4802 kvm_queue_exception(vcpu, DB_VECTOR); 4803 else 4804 kvm_queue_exception(vcpu, BP_VECTOR); 4805 } 4806 4807 /* 4808 * Read rflags as long as potentially injected trace flags are still 4809 * filtered out. 4810 */ 4811 rflags = kvm_get_rflags(vcpu); 4812 4813 vcpu->guest_debug = dbg->control; 4814 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 4815 vcpu->guest_debug = 0; 4816 4817 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 4818 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4819 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4820 vcpu->arch.switch_db_regs = 4821 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4822 } else { 4823 for (i = 0; i < KVM_NR_DB_REGS; i++) 4824 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4825 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4826 } 4827 4828 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 4829 vcpu->arch.singlestep_cs = 4830 get_segment_selector(vcpu, VCPU_SREG_CS); 4831 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu); 4832 } 4833 4834 /* 4835 * Trigger an rflags update that will inject or remove the trace 4836 * flags. 4837 */ 4838 kvm_set_rflags(vcpu, rflags); 4839 4840 kvm_x86_ops->set_guest_debug(vcpu, dbg); 4841 4842 r = 0; 4843 4844 unlock_out: 4845 vcpu_put(vcpu); 4846 4847 return r; 4848 } 4849 4850 /* 4851 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 4852 * we have asm/x86/processor.h 4853 */ 4854 struct fxsave { 4855 u16 cwd; 4856 u16 swd; 4857 u16 twd; 4858 u16 fop; 4859 u64 rip; 4860 u64 rdp; 4861 u32 mxcsr; 4862 u32 mxcsr_mask; 4863 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4864 #ifdef CONFIG_X86_64 4865 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4866 #else 4867 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4868 #endif 4869 }; 4870 4871 /* 4872 * Translate a guest virtual address to a guest physical address. 4873 */ 4874 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 4875 struct kvm_translation *tr) 4876 { 4877 unsigned long vaddr = tr->linear_address; 4878 gpa_t gpa; 4879 4880 vcpu_load(vcpu); 4881 down_read(&vcpu->kvm->slots_lock); 4882 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 4883 up_read(&vcpu->kvm->slots_lock); 4884 tr->physical_address = gpa; 4885 tr->valid = gpa != UNMAPPED_GVA; 4886 tr->writeable = 1; 4887 tr->usermode = 0; 4888 vcpu_put(vcpu); 4889 4890 return 0; 4891 } 4892 4893 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4894 { 4895 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4896 4897 vcpu_load(vcpu); 4898 4899 memcpy(fpu->fpr, fxsave->st_space, 128); 4900 fpu->fcw = fxsave->cwd; 4901 fpu->fsw = fxsave->swd; 4902 fpu->ftwx = fxsave->twd; 4903 fpu->last_opcode = fxsave->fop; 4904 fpu->last_ip = fxsave->rip; 4905 fpu->last_dp = fxsave->rdp; 4906 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 4907 4908 vcpu_put(vcpu); 4909 4910 return 0; 4911 } 4912 4913 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4914 { 4915 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4916 4917 vcpu_load(vcpu); 4918 4919 memcpy(fxsave->st_space, fpu->fpr, 128); 4920 fxsave->cwd = fpu->fcw; 4921 fxsave->swd = fpu->fsw; 4922 fxsave->twd = fpu->ftwx; 4923 fxsave->fop = fpu->last_opcode; 4924 fxsave->rip = fpu->last_ip; 4925 fxsave->rdp = fpu->last_dp; 4926 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 4927 4928 vcpu_put(vcpu); 4929 4930 return 0; 4931 } 4932 4933 void fx_init(struct kvm_vcpu *vcpu) 4934 { 4935 unsigned after_mxcsr_mask; 4936 4937 /* 4938 * Touch the fpu the first time in non atomic context as if 4939 * this is the first fpu instruction the exception handler 4940 * will fire before the instruction returns and it'll have to 4941 * allocate ram with GFP_KERNEL. 4942 */ 4943 if (!used_math()) 4944 kvm_fx_save(&vcpu->arch.host_fx_image); 4945 4946 /* Initialize guest FPU by resetting ours and saving into guest's */ 4947 preempt_disable(); 4948 kvm_fx_save(&vcpu->arch.host_fx_image); 4949 kvm_fx_finit(); 4950 kvm_fx_save(&vcpu->arch.guest_fx_image); 4951 kvm_fx_restore(&vcpu->arch.host_fx_image); 4952 preempt_enable(); 4953 4954 vcpu->arch.cr0 |= X86_CR0_ET; 4955 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 4956 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 4957 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 4958 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 4959 } 4960 EXPORT_SYMBOL_GPL(fx_init); 4961 4962 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 4963 { 4964 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 4965 return; 4966 4967 vcpu->guest_fpu_loaded = 1; 4968 kvm_fx_save(&vcpu->arch.host_fx_image); 4969 kvm_fx_restore(&vcpu->arch.guest_fx_image); 4970 } 4971 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 4972 4973 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 4974 { 4975 if (!vcpu->guest_fpu_loaded) 4976 return; 4977 4978 vcpu->guest_fpu_loaded = 0; 4979 kvm_fx_save(&vcpu->arch.guest_fx_image); 4980 kvm_fx_restore(&vcpu->arch.host_fx_image); 4981 ++vcpu->stat.fpu_reload; 4982 } 4983 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 4984 4985 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 4986 { 4987 if (vcpu->arch.time_page) { 4988 kvm_release_page_dirty(vcpu->arch.time_page); 4989 vcpu->arch.time_page = NULL; 4990 } 4991 4992 kvm_x86_ops->vcpu_free(vcpu); 4993 } 4994 4995 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 4996 unsigned int id) 4997 { 4998 return kvm_x86_ops->vcpu_create(kvm, id); 4999 } 5000 5001 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 5002 { 5003 int r; 5004 5005 /* We do fxsave: this must be aligned. */ 5006 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 5007 5008 vcpu->arch.mtrr_state.have_fixed = 1; 5009 vcpu_load(vcpu); 5010 r = kvm_arch_vcpu_reset(vcpu); 5011 if (r == 0) 5012 r = kvm_mmu_setup(vcpu); 5013 vcpu_put(vcpu); 5014 if (r < 0) 5015 goto free_vcpu; 5016 5017 return 0; 5018 free_vcpu: 5019 kvm_x86_ops->vcpu_free(vcpu); 5020 return r; 5021 } 5022 5023 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5024 { 5025 vcpu_load(vcpu); 5026 kvm_mmu_unload(vcpu); 5027 vcpu_put(vcpu); 5028 5029 kvm_x86_ops->vcpu_free(vcpu); 5030 } 5031 5032 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 5033 { 5034 vcpu->arch.nmi_pending = false; 5035 vcpu->arch.nmi_injected = false; 5036 5037 vcpu->arch.switch_db_regs = 0; 5038 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 5039 vcpu->arch.dr6 = DR6_FIXED_1; 5040 vcpu->arch.dr7 = DR7_FIXED_1; 5041 5042 return kvm_x86_ops->vcpu_reset(vcpu); 5043 } 5044 5045 int kvm_arch_hardware_enable(void *garbage) 5046 { 5047 /* 5048 * Since this may be called from a hotplug notifcation, 5049 * we can't get the CPU frequency directly. 5050 */ 5051 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 5052 int cpu = raw_smp_processor_id(); 5053 per_cpu(cpu_tsc_khz, cpu) = 0; 5054 } 5055 5056 kvm_shared_msr_cpu_online(); 5057 5058 return kvm_x86_ops->hardware_enable(garbage); 5059 } 5060 5061 void kvm_arch_hardware_disable(void *garbage) 5062 { 5063 kvm_x86_ops->hardware_disable(garbage); 5064 drop_user_return_notifiers(garbage); 5065 } 5066 5067 int kvm_arch_hardware_setup(void) 5068 { 5069 return kvm_x86_ops->hardware_setup(); 5070 } 5071 5072 void kvm_arch_hardware_unsetup(void) 5073 { 5074 kvm_x86_ops->hardware_unsetup(); 5075 } 5076 5077 void kvm_arch_check_processor_compat(void *rtn) 5078 { 5079 kvm_x86_ops->check_processor_compatibility(rtn); 5080 } 5081 5082 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 5083 { 5084 struct page *page; 5085 struct kvm *kvm; 5086 int r; 5087 5088 BUG_ON(vcpu->kvm == NULL); 5089 kvm = vcpu->kvm; 5090 5091 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5092 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5093 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5094 else 5095 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 5096 5097 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 5098 if (!page) { 5099 r = -ENOMEM; 5100 goto fail; 5101 } 5102 vcpu->arch.pio_data = page_address(page); 5103 5104 r = kvm_mmu_create(vcpu); 5105 if (r < 0) 5106 goto fail_free_pio_data; 5107 5108 if (irqchip_in_kernel(kvm)) { 5109 r = kvm_create_lapic(vcpu); 5110 if (r < 0) 5111 goto fail_mmu_destroy; 5112 } 5113 5114 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 5115 GFP_KERNEL); 5116 if (!vcpu->arch.mce_banks) { 5117 r = -ENOMEM; 5118 goto fail_free_lapic; 5119 } 5120 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5121 5122 return 0; 5123 fail_free_lapic: 5124 kvm_free_lapic(vcpu); 5125 fail_mmu_destroy: 5126 kvm_mmu_destroy(vcpu); 5127 fail_free_pio_data: 5128 free_page((unsigned long)vcpu->arch.pio_data); 5129 fail: 5130 return r; 5131 } 5132 5133 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5134 { 5135 kfree(vcpu->arch.mce_banks); 5136 kvm_free_lapic(vcpu); 5137 down_read(&vcpu->kvm->slots_lock); 5138 kvm_mmu_destroy(vcpu); 5139 up_read(&vcpu->kvm->slots_lock); 5140 free_page((unsigned long)vcpu->arch.pio_data); 5141 } 5142 5143 struct kvm *kvm_arch_create_vm(void) 5144 { 5145 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 5146 5147 if (!kvm) 5148 return ERR_PTR(-ENOMEM); 5149 5150 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5151 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5152 5153 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 5154 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 5155 5156 rdtscll(kvm->arch.vm_init_tsc); 5157 5158 return kvm; 5159 } 5160 5161 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 5162 { 5163 vcpu_load(vcpu); 5164 kvm_mmu_unload(vcpu); 5165 vcpu_put(vcpu); 5166 } 5167 5168 static void kvm_free_vcpus(struct kvm *kvm) 5169 { 5170 unsigned int i; 5171 struct kvm_vcpu *vcpu; 5172 5173 /* 5174 * Unpin any mmu pages first. 5175 */ 5176 kvm_for_each_vcpu(i, vcpu, kvm) 5177 kvm_unload_vcpu_mmu(vcpu); 5178 kvm_for_each_vcpu(i, vcpu, kvm) 5179 kvm_arch_vcpu_free(vcpu); 5180 5181 mutex_lock(&kvm->lock); 5182 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) 5183 kvm->vcpus[i] = NULL; 5184 5185 atomic_set(&kvm->online_vcpus, 0); 5186 mutex_unlock(&kvm->lock); 5187 } 5188 5189 void kvm_arch_sync_events(struct kvm *kvm) 5190 { 5191 kvm_free_all_assigned_devices(kvm); 5192 } 5193 5194 void kvm_arch_destroy_vm(struct kvm *kvm) 5195 { 5196 kvm_iommu_unmap_guest(kvm); 5197 kvm_free_pit(kvm); 5198 kfree(kvm->arch.vpic); 5199 kfree(kvm->arch.vioapic); 5200 kvm_free_vcpus(kvm); 5201 kvm_free_physmem(kvm); 5202 if (kvm->arch.apic_access_page) 5203 put_page(kvm->arch.apic_access_page); 5204 if (kvm->arch.ept_identity_pagetable) 5205 put_page(kvm->arch.ept_identity_pagetable); 5206 kfree(kvm); 5207 } 5208 5209 int kvm_arch_set_memory_region(struct kvm *kvm, 5210 struct kvm_userspace_memory_region *mem, 5211 struct kvm_memory_slot old, 5212 int user_alloc) 5213 { 5214 int npages = mem->memory_size >> PAGE_SHIFT; 5215 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 5216 5217 /*To keep backward compatibility with older userspace, 5218 *x86 needs to hanlde !user_alloc case. 5219 */ 5220 if (!user_alloc) { 5221 if (npages && !old.rmap) { 5222 unsigned long userspace_addr; 5223 5224 down_write(¤t->mm->mmap_sem); 5225 userspace_addr = do_mmap(NULL, 0, 5226 npages * PAGE_SIZE, 5227 PROT_READ | PROT_WRITE, 5228 MAP_PRIVATE | MAP_ANONYMOUS, 5229 0); 5230 up_write(¤t->mm->mmap_sem); 5231 5232 if (IS_ERR((void *)userspace_addr)) 5233 return PTR_ERR((void *)userspace_addr); 5234 5235 /* set userspace_addr atomically for kvm_hva_to_rmapp */ 5236 spin_lock(&kvm->mmu_lock); 5237 memslot->userspace_addr = userspace_addr; 5238 spin_unlock(&kvm->mmu_lock); 5239 } else { 5240 if (!old.user_alloc && old.rmap) { 5241 int ret; 5242 5243 down_write(¤t->mm->mmap_sem); 5244 ret = do_munmap(current->mm, old.userspace_addr, 5245 old.npages * PAGE_SIZE); 5246 up_write(¤t->mm->mmap_sem); 5247 if (ret < 0) 5248 printk(KERN_WARNING 5249 "kvm_vm_ioctl_set_memory_region: " 5250 "failed to munmap memory\n"); 5251 } 5252 } 5253 } 5254 5255 spin_lock(&kvm->mmu_lock); 5256 if (!kvm->arch.n_requested_mmu_pages) { 5257 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5258 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 5259 } 5260 5261 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5262 spin_unlock(&kvm->mmu_lock); 5263 5264 return 0; 5265 } 5266 5267 void kvm_arch_flush_shadow(struct kvm *kvm) 5268 { 5269 kvm_mmu_zap_all(kvm); 5270 kvm_reload_remote_mmus(kvm); 5271 } 5272 5273 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 5274 { 5275 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 5276 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 5277 || vcpu->arch.nmi_pending || 5278 (kvm_arch_interrupt_allowed(vcpu) && 5279 kvm_cpu_has_interrupt(vcpu)); 5280 } 5281 5282 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 5283 { 5284 int me; 5285 int cpu = vcpu->cpu; 5286 5287 if (waitqueue_active(&vcpu->wq)) { 5288 wake_up_interruptible(&vcpu->wq); 5289 ++vcpu->stat.halt_wakeup; 5290 } 5291 5292 me = get_cpu(); 5293 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5294 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5295 smp_send_reschedule(cpu); 5296 put_cpu(); 5297 } 5298 5299 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 5300 { 5301 return kvm_x86_ops->interrupt_allowed(vcpu); 5302 } 5303 5304 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5305 { 5306 unsigned long rflags; 5307 5308 rflags = kvm_x86_ops->get_rflags(vcpu); 5309 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5310 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5311 return rflags; 5312 } 5313 EXPORT_SYMBOL_GPL(kvm_get_rflags); 5314 5315 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5316 { 5317 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5318 vcpu->arch.singlestep_cs == 5319 get_segment_selector(vcpu, VCPU_SREG_CS) && 5320 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) 5321 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; 5322 kvm_x86_ops->set_rflags(vcpu, rflags); 5323 } 5324 EXPORT_SYMBOL_GPL(kvm_set_rflags); 5325 5326 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5327 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5328 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5329 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5330 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5331 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 5332 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 5333 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 5334 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5335 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5336 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5337