/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"

#include <linux/clocksource.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);

struct kvm_x86_ops *kvm_x86_ops;

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};


unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector;

		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = d->base0 | ((unsigned long)d->base1 << 16) |
		((unsigned long)d->base2 << 24);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;
	if (vcpu->arch.exception.pending) {
		if (vcpu->arch.exception.nr == PF_VECTOR) {
			printk(KERN_DEBUG "kvm: inject_page_fault:"
			       " double fault 0x%lx\n", addr);
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
		}
		return;
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

static void __queue_exception(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
				     vcpu->arch.exception.has_error_code,
				     vcpu->arch.exception.error_code);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
195 */ 196 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 197 { 198 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 199 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 200 int i; 201 int ret; 202 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 203 204 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 205 offset * sizeof(u64), sizeof(pdpte)); 206 if (ret < 0) { 207 ret = 0; 208 goto out; 209 } 210 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 211 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 212 ret = 0; 213 goto out; 214 } 215 } 216 ret = 1; 217 218 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 219 out: 220 221 return ret; 222 } 223 EXPORT_SYMBOL_GPL(load_pdptrs); 224 225 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 226 { 227 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 228 bool changed = true; 229 int r; 230 231 if (is_long_mode(vcpu) || !is_pae(vcpu)) 232 return false; 233 234 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 235 if (r < 0) 236 goto out; 237 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 238 out: 239 240 return changed; 241 } 242 243 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 244 { 245 if (cr0 & CR0_RESERVED_BITS) { 246 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 247 cr0, vcpu->arch.cr0); 248 kvm_inject_gp(vcpu, 0); 249 return; 250 } 251 252 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 253 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 254 kvm_inject_gp(vcpu, 0); 255 return; 256 } 257 258 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 259 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 260 "and a clear PE flag\n"); 261 kvm_inject_gp(vcpu, 0); 262 return; 263 } 264 265 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 266 #ifdef CONFIG_X86_64 267 if ((vcpu->arch.shadow_efer & EFER_LME)) { 268 int cs_db, cs_l; 269 270 if (!is_pae(vcpu)) { 271 printk(KERN_DEBUG "set_cr0: #GP, start paging " 272 "in long mode while PAE is disabled\n"); 273 kvm_inject_gp(vcpu, 0); 274 return; 275 } 276 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 277 if (cs_l) { 278 printk(KERN_DEBUG "set_cr0: #GP, start paging " 279 "in long mode while CS.L == 1\n"); 280 kvm_inject_gp(vcpu, 0); 281 return; 282 283 } 284 } else 285 #endif 286 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 287 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 288 "reserved bits\n"); 289 kvm_inject_gp(vcpu, 0); 290 return; 291 } 292 293 } 294 295 kvm_x86_ops->set_cr0(vcpu, cr0); 296 vcpu->arch.cr0 = cr0; 297 298 kvm_mmu_reset_context(vcpu); 299 return; 300 } 301 EXPORT_SYMBOL_GPL(kvm_set_cr0); 302 303 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 304 { 305 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 306 KVMTRACE_1D(LMSW, vcpu, 307 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 308 handler); 309 } 310 EXPORT_SYMBOL_GPL(kvm_lmsw); 311 312 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 313 { 314 if (cr4 & CR4_RESERVED_BITS) { 315 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 316 kvm_inject_gp(vcpu, 0); 317 return; 318 } 319 320 if (is_long_mode(vcpu)) { 321 if (!(cr4 & X86_CR4_PAE)) { 322 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 323 "in long mode\n"); 324 kvm_inject_gp(vcpu, 0); 325 return; 326 } 327 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 328 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 329 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 330 kvm_inject_gp(vcpu, 0); 331 return; 332 } 333 334 if (cr4 & 
X86_CR4_VMXE) { 335 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 336 kvm_inject_gp(vcpu, 0); 337 return; 338 } 339 kvm_x86_ops->set_cr4(vcpu, cr4); 340 vcpu->arch.cr4 = cr4; 341 kvm_mmu_reset_context(vcpu); 342 } 343 EXPORT_SYMBOL_GPL(kvm_set_cr4); 344 345 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 346 { 347 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 348 kvm_mmu_flush_tlb(vcpu); 349 return; 350 } 351 352 if (is_long_mode(vcpu)) { 353 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 354 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 355 kvm_inject_gp(vcpu, 0); 356 return; 357 } 358 } else { 359 if (is_pae(vcpu)) { 360 if (cr3 & CR3_PAE_RESERVED_BITS) { 361 printk(KERN_DEBUG 362 "set_cr3: #GP, reserved bits\n"); 363 kvm_inject_gp(vcpu, 0); 364 return; 365 } 366 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 367 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 368 "reserved bits\n"); 369 kvm_inject_gp(vcpu, 0); 370 return; 371 } 372 } 373 /* 374 * We don't check reserved bits in nonpae mode, because 375 * this isn't enforced, and VMware depends on this. 376 */ 377 } 378 379 /* 380 * Does the new cr3 value map to physical memory? (Note, we 381 * catch an invalid cr3 even in real-mode, because it would 382 * cause trouble later on when we turn on paging anyway.) 383 * 384 * A real CPU would silently accept an invalid cr3 and would 385 * attempt to use it - with largely undefined (and often hard 386 * to debug) behavior on the guest side. 387 */ 388 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 389 kvm_inject_gp(vcpu, 0); 390 else { 391 vcpu->arch.cr3 = cr3; 392 vcpu->arch.mmu.new_cr3(vcpu); 393 } 394 } 395 EXPORT_SYMBOL_GPL(kvm_set_cr3); 396 397 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 398 { 399 if (cr8 & CR8_RESERVED_BITS) { 400 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 401 kvm_inject_gp(vcpu, 0); 402 return; 403 } 404 if (irqchip_in_kernel(vcpu->kvm)) 405 kvm_lapic_set_tpr(vcpu, cr8); 406 else 407 vcpu->arch.cr8 = cr8; 408 } 409 EXPORT_SYMBOL_GPL(kvm_set_cr8); 410 411 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 412 { 413 if (irqchip_in_kernel(vcpu->kvm)) 414 return kvm_lapic_get_cr8(vcpu); 415 else 416 return vcpu->arch.cr8; 417 } 418 EXPORT_SYMBOL_GPL(kvm_get_cr8); 419 420 /* 421 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 422 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 423 * 424 * This list is modified at module load time to reflect the 425 * capabilities of the host cpu. 
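 * (kvm_init_msr_list() below drops any entry the host itself cannot read
 * with rdmsr_safe().)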
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_PERF_STATUS,
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	static int version;
	struct kvm_wall_clock wc;
	struct timespec wc_ts;

	if (!wall_clock)
		return;

	version++;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	wc_ts = current_kernel_time();
	wc.wc_sec = wc_ts.tv_sec;
	wc.wc_nsec = wc_ts.tv_nsec;
	wc.wc_version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}

static void kvm_write_guest_time(struct kvm_vcpu *v)
{
	struct timespec ts;
	unsigned long flags;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;

	if ((!vcpu->time_page))
		return;

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
		    &vcpu->hv_clock.tsc_timestamp);
	ktime_get_ts(&ts);
	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	vcpu->hv_clock.system_time = ts.tv_nsec +
				     (NSEC_PER_SEC * (u64)ts.tv_sec);
	/*
	 * The interface expects us to write an even number signaling that the
	 * update is finished.
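	 * (An odd version would mark an update in progress; the guest re-reads
	 * the structure until it sees the same even version before and after.)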
	 * Since the guest won't see the intermediate state, we just write
	 * "2" at the end.
	 */
	vcpu->hv_clock.version = 2;

	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
	       sizeof(vcpu->hv_clock));

	kunmap_atomic(shared_kaddr, KM_USER0);

	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}


int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
	case MSR_EFER:
		set_efer(vcpu, data);
		break;
	case MSR_IA32_MC0_STATUS:
		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
			__func__, data);
		break;
	case MSR_IA32_MCG_STATUS:
		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
			__func__, data);
		break;
	case MSR_IA32_MCG_CTL:
		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
			__func__, data);
		break;
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case 0x200 ... 0x2ff: /* MTRRs */
		break;
	case MSR_IA32_APICBASE:
		kvm_set_apic_base(vcpu, data);
		break;
	case MSR_IA32_MISC_ENABLE:
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
	case MSR_KVM_WALL_CLOCK:
		vcpu->kvm->arch.wall_clock = data;
		kvm_write_wall_clock(vcpu->kvm, data);
		break;
	case MSR_KVM_SYSTEM_TIME: {
		if (vcpu->arch.time_page) {
			kvm_release_page_dirty(vcpu->arch.time_page);
			vcpu->arch.time_page = NULL;
		}

		vcpu->arch.time = data;

		/* we verify if the enable bit is set... */
		if (!(data & 1))
			break;

		/* ...but clean it before doing the actual write */
		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);

		vcpu->arch.hv_clock.tsc_to_system_mul =
			clocksource_khz2mult(tsc_khz, 22);
		vcpu->arch.hv_clock.tsc_shift = 22;

		down_read(&current->mm->mmap_sem);
		vcpu->arch.time_page =
				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
		up_read(&current->mm->mmap_sem);

		if (is_error_page(vcpu->arch.time_page)) {
			kvm_release_page_clean(vcpu->arch.time_page);
			vcpu->arch.time_page = NULL;
		}

		kvm_write_guest_time(vcpu);
		break;
	}
	default:
		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);


/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}

int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;

	switch (msr) {
	case 0xc0010010: /* SYSCFG */
	case 0xc0010015: /* HWCR */
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MC0_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MC0_MISC:
	case MSR_IA32_MC0_MISC+4:
	case MSR_IA32_MC0_MISC+8:
	case MSR_IA32_MC0_MISC+12:
	case MSR_IA32_MC0_MISC+16:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_EBL_CR_POWERON:
		/* MTRR registers */
	case 0xfe:
	case 0x200 ...
0x2ff: 662 data = 0; 663 break; 664 case 0xcd: /* fsb frequency */ 665 data = 3; 666 break; 667 case MSR_IA32_APICBASE: 668 data = kvm_get_apic_base(vcpu); 669 break; 670 case MSR_IA32_MISC_ENABLE: 671 data = vcpu->arch.ia32_misc_enable_msr; 672 break; 673 case MSR_IA32_PERF_STATUS: 674 /* TSC increment by tick */ 675 data = 1000ULL; 676 /* CPU multiplier */ 677 data |= (((uint64_t)4ULL) << 40); 678 break; 679 case MSR_EFER: 680 data = vcpu->arch.shadow_efer; 681 break; 682 case MSR_KVM_WALL_CLOCK: 683 data = vcpu->kvm->arch.wall_clock; 684 break; 685 case MSR_KVM_SYSTEM_TIME: 686 data = vcpu->arch.time; 687 break; 688 default: 689 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 690 return 1; 691 } 692 *pdata = data; 693 return 0; 694 } 695 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 696 697 /* 698 * Read or write a bunch of msrs. All parameters are kernel addresses. 699 * 700 * @return number of msrs set successfully. 701 */ 702 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 703 struct kvm_msr_entry *entries, 704 int (*do_msr)(struct kvm_vcpu *vcpu, 705 unsigned index, u64 *data)) 706 { 707 int i; 708 709 vcpu_load(vcpu); 710 711 down_read(&vcpu->kvm->slots_lock); 712 for (i = 0; i < msrs->nmsrs; ++i) 713 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 714 break; 715 up_read(&vcpu->kvm->slots_lock); 716 717 vcpu_put(vcpu); 718 719 return i; 720 } 721 722 /* 723 * Read or write a bunch of msrs. Parameters are user addresses. 724 * 725 * @return number of msrs set successfully. 726 */ 727 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 728 int (*do_msr)(struct kvm_vcpu *vcpu, 729 unsigned index, u64 *data), 730 int writeback) 731 { 732 struct kvm_msrs msrs; 733 struct kvm_msr_entry *entries; 734 int r, n; 735 unsigned size; 736 737 r = -EFAULT; 738 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 739 goto out; 740 741 r = -E2BIG; 742 if (msrs.nmsrs >= MAX_IO_MSRS) 743 goto out; 744 745 r = -ENOMEM; 746 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 747 entries = vmalloc(size); 748 if (!entries) 749 goto out; 750 751 r = -EFAULT; 752 if (copy_from_user(entries, user_msrs->entries, size)) 753 goto out_free; 754 755 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 756 if (r < 0) 757 goto out_free; 758 759 r = -EFAULT; 760 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 761 goto out_free; 762 763 r = n; 764 765 out_free: 766 vfree(entries); 767 out: 768 return r; 769 } 770 771 /* 772 * Make sure that a cpu that is being hot-unplugged does not have any vcpus 773 * cached on it. 774 */ 775 void decache_vcpus_on_cpu(int cpu) 776 { 777 struct kvm *vm; 778 struct kvm_vcpu *vcpu; 779 int i; 780 781 spin_lock(&kvm_lock); 782 list_for_each_entry(vm, &vm_list, vm_list) 783 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 784 vcpu = vm->vcpus[i]; 785 if (!vcpu) 786 continue; 787 /* 788 * If the vcpu is locked, then it is running on some 789 * other cpu and therefore it is not cached on the 790 * cpu in question. 791 * 792 * If it's not locked, check the last cpu it executed 793 * on. 
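			 * If that was the cpu being unplugged, have the
			 * backend decache the vcpu and clear its cpu binding.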
794 */ 795 if (mutex_trylock(&vcpu->mutex)) { 796 if (vcpu->cpu == cpu) { 797 kvm_x86_ops->vcpu_decache(vcpu); 798 vcpu->cpu = -1; 799 } 800 mutex_unlock(&vcpu->mutex); 801 } 802 } 803 spin_unlock(&kvm_lock); 804 } 805 806 int kvm_dev_ioctl_check_extension(long ext) 807 { 808 int r; 809 810 switch (ext) { 811 case KVM_CAP_IRQCHIP: 812 case KVM_CAP_HLT: 813 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 814 case KVM_CAP_USER_MEMORY: 815 case KVM_CAP_SET_TSS_ADDR: 816 case KVM_CAP_EXT_CPUID: 817 case KVM_CAP_CLOCKSOURCE: 818 case KVM_CAP_PIT: 819 case KVM_CAP_NOP_IO_DELAY: 820 case KVM_CAP_MP_STATE: 821 r = 1; 822 break; 823 case KVM_CAP_VAPIC: 824 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 825 break; 826 case KVM_CAP_NR_VCPUS: 827 r = KVM_MAX_VCPUS; 828 break; 829 case KVM_CAP_NR_MEMSLOTS: 830 r = KVM_MEMORY_SLOTS; 831 break; 832 case KVM_CAP_PV_MMU: 833 r = !tdp_enabled; 834 break; 835 default: 836 r = 0; 837 break; 838 } 839 return r; 840 841 } 842 843 long kvm_arch_dev_ioctl(struct file *filp, 844 unsigned int ioctl, unsigned long arg) 845 { 846 void __user *argp = (void __user *)arg; 847 long r; 848 849 switch (ioctl) { 850 case KVM_GET_MSR_INDEX_LIST: { 851 struct kvm_msr_list __user *user_msr_list = argp; 852 struct kvm_msr_list msr_list; 853 unsigned n; 854 855 r = -EFAULT; 856 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 857 goto out; 858 n = msr_list.nmsrs; 859 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 860 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 861 goto out; 862 r = -E2BIG; 863 if (n < num_msrs_to_save) 864 goto out; 865 r = -EFAULT; 866 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 867 num_msrs_to_save * sizeof(u32))) 868 goto out; 869 if (copy_to_user(user_msr_list->indices 870 + num_msrs_to_save * sizeof(u32), 871 &emulated_msrs, 872 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 873 goto out; 874 r = 0; 875 break; 876 } 877 case KVM_GET_SUPPORTED_CPUID: { 878 struct kvm_cpuid2 __user *cpuid_arg = argp; 879 struct kvm_cpuid2 cpuid; 880 881 r = -EFAULT; 882 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 883 goto out; 884 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 885 cpuid_arg->entries); 886 if (r) 887 goto out; 888 889 r = -EFAULT; 890 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 891 goto out; 892 r = 0; 893 break; 894 } 895 default: 896 r = -EINVAL; 897 } 898 out: 899 return r; 900 } 901 902 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 903 { 904 kvm_x86_ops->vcpu_load(vcpu, cpu); 905 kvm_write_guest_time(vcpu); 906 } 907 908 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 909 { 910 kvm_x86_ops->vcpu_put(vcpu); 911 kvm_put_guest_fpu(vcpu); 912 } 913 914 static int is_efer_nx(void) 915 { 916 u64 efer; 917 918 rdmsrl(MSR_EFER, efer); 919 return efer & EFER_NX; 920 } 921 922 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 923 { 924 int i; 925 struct kvm_cpuid_entry2 *e, *entry; 926 927 entry = NULL; 928 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 929 e = &vcpu->arch.cpuid_entries[i]; 930 if (e->function == 0x80000001) { 931 entry = e; 932 break; 933 } 934 } 935 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 936 entry->edx &= ~(1 << 20); 937 printk(KERN_INFO "kvm: guest NX capability removed\n"); 938 } 939 } 940 941 /* when an old userspace process fills a new kernel module */ 942 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 943 struct kvm_cpuid *cpuid, 944 struct kvm_cpuid_entry __user *entries) 945 { 946 int r, i; 947 struct kvm_cpuid_entry *cpuid_entries; 948 949 r = 
-E2BIG; 950 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 951 goto out; 952 r = -ENOMEM; 953 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 954 if (!cpuid_entries) 955 goto out; 956 r = -EFAULT; 957 if (copy_from_user(cpuid_entries, entries, 958 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 959 goto out_free; 960 for (i = 0; i < cpuid->nent; i++) { 961 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 962 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 963 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 964 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 965 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 966 vcpu->arch.cpuid_entries[i].index = 0; 967 vcpu->arch.cpuid_entries[i].flags = 0; 968 vcpu->arch.cpuid_entries[i].padding[0] = 0; 969 vcpu->arch.cpuid_entries[i].padding[1] = 0; 970 vcpu->arch.cpuid_entries[i].padding[2] = 0; 971 } 972 vcpu->arch.cpuid_nent = cpuid->nent; 973 cpuid_fix_nx_cap(vcpu); 974 r = 0; 975 976 out_free: 977 vfree(cpuid_entries); 978 out: 979 return r; 980 } 981 982 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 983 struct kvm_cpuid2 *cpuid, 984 struct kvm_cpuid_entry2 __user *entries) 985 { 986 int r; 987 988 r = -E2BIG; 989 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 990 goto out; 991 r = -EFAULT; 992 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 993 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 994 goto out; 995 vcpu->arch.cpuid_nent = cpuid->nent; 996 return 0; 997 998 out: 999 return r; 1000 } 1001 1002 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1003 struct kvm_cpuid2 *cpuid, 1004 struct kvm_cpuid_entry2 __user *entries) 1005 { 1006 int r; 1007 1008 r = -E2BIG; 1009 if (cpuid->nent < vcpu->arch.cpuid_nent) 1010 goto out; 1011 r = -EFAULT; 1012 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1013 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1014 goto out; 1015 return 0; 1016 1017 out: 1018 cpuid->nent = vcpu->arch.cpuid_nent; 1019 return r; 1020 } 1021 1022 static inline u32 bit(int bitno) 1023 { 1024 return 1 << (bitno & 31); 1025 } 1026 1027 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1028 u32 index) 1029 { 1030 entry->function = function; 1031 entry->index = index; 1032 cpuid_count(entry->function, entry->index, 1033 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1034 entry->flags = 0; 1035 } 1036 1037 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1038 u32 index, int *nent, int maxnent) 1039 { 1040 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1041 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1042 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1043 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1044 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1045 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | 1046 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1047 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | 1048 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | 1049 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); 1050 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | 1051 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1052 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1053 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1054 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1055 bit(X86_FEATURE_PGE) | 1056 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1057 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | 1058 bit(X86_FEATURE_SYSCALL) | 1059 
(bit(X86_FEATURE_NX) && is_efer_nx()) | 1060 #ifdef CONFIG_X86_64 1061 bit(X86_FEATURE_LM) | 1062 #endif 1063 bit(X86_FEATURE_MMXEXT) | 1064 bit(X86_FEATURE_3DNOWEXT) | 1065 bit(X86_FEATURE_3DNOW); 1066 const u32 kvm_supported_word3_x86_features = 1067 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1068 const u32 kvm_supported_word6_x86_features = 1069 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); 1070 1071 /* all func 2 cpuid_count() should be called on the same cpu */ 1072 get_cpu(); 1073 do_cpuid_1_ent(entry, function, index); 1074 ++*nent; 1075 1076 switch (function) { 1077 case 0: 1078 entry->eax = min(entry->eax, (u32)0xb); 1079 break; 1080 case 1: 1081 entry->edx &= kvm_supported_word0_x86_features; 1082 entry->ecx &= kvm_supported_word3_x86_features; 1083 break; 1084 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1085 * may return different values. This forces us to get_cpu() before 1086 * issuing the first command, and also to emulate this annoying behavior 1087 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1088 case 2: { 1089 int t, times = entry->eax & 0xff; 1090 1091 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1092 for (t = 1; t < times && *nent < maxnent; ++t) { 1093 do_cpuid_1_ent(&entry[t], function, 0); 1094 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1095 ++*nent; 1096 } 1097 break; 1098 } 1099 /* function 4 and 0xb have additional index. */ 1100 case 4: { 1101 int i, cache_type; 1102 1103 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1104 /* read more entries until cache_type is zero */ 1105 for (i = 1; *nent < maxnent; ++i) { 1106 cache_type = entry[i - 1].eax & 0x1f; 1107 if (!cache_type) 1108 break; 1109 do_cpuid_1_ent(&entry[i], function, i); 1110 entry[i].flags |= 1111 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1112 ++*nent; 1113 } 1114 break; 1115 } 1116 case 0xb: { 1117 int i, level_type; 1118 1119 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1120 /* read more entries until level_type is zero */ 1121 for (i = 1; *nent < maxnent; ++i) { 1122 level_type = entry[i - 1].ecx & 0xff; 1123 if (!level_type) 1124 break; 1125 do_cpuid_1_ent(&entry[i], function, i); 1126 entry[i].flags |= 1127 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1128 ++*nent; 1129 } 1130 break; 1131 } 1132 case 0x80000000: 1133 entry->eax = min(entry->eax, 0x8000001a); 1134 break; 1135 case 0x80000001: 1136 entry->edx &= kvm_supported_word1_x86_features; 1137 entry->ecx &= kvm_supported_word6_x86_features; 1138 break; 1139 } 1140 put_cpu(); 1141 } 1142 1143 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1144 struct kvm_cpuid_entry2 __user *entries) 1145 { 1146 struct kvm_cpuid_entry2 *cpuid_entries; 1147 int limit, nent = 0, r = -E2BIG; 1148 u32 func; 1149 1150 if (cpuid->nent < 1) 1151 goto out; 1152 r = -ENOMEM; 1153 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1154 if (!cpuid_entries) 1155 goto out; 1156 1157 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1158 limit = cpuid_entries[0].eax; 1159 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1160 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1161 &nent, cpuid->nent); 1162 r = -E2BIG; 1163 if (nent >= cpuid->nent) 1164 goto out_free; 1165 1166 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1167 limit = cpuid_entries[nent - 1].eax; 1168 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1169 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1170 &nent, cpuid->nent); 1171 r = -EFAULT; 1172 if 
(copy_to_user(entries, cpuid_entries, 1173 nent * sizeof(struct kvm_cpuid_entry2))) 1174 goto out_free; 1175 cpuid->nent = nent; 1176 r = 0; 1177 1178 out_free: 1179 vfree(cpuid_entries); 1180 out: 1181 return r; 1182 } 1183 1184 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1185 struct kvm_lapic_state *s) 1186 { 1187 vcpu_load(vcpu); 1188 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1189 vcpu_put(vcpu); 1190 1191 return 0; 1192 } 1193 1194 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1195 struct kvm_lapic_state *s) 1196 { 1197 vcpu_load(vcpu); 1198 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1199 kvm_apic_post_state_restore(vcpu); 1200 vcpu_put(vcpu); 1201 1202 return 0; 1203 } 1204 1205 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1206 struct kvm_interrupt *irq) 1207 { 1208 if (irq->irq < 0 || irq->irq >= 256) 1209 return -EINVAL; 1210 if (irqchip_in_kernel(vcpu->kvm)) 1211 return -ENXIO; 1212 vcpu_load(vcpu); 1213 1214 set_bit(irq->irq, vcpu->arch.irq_pending); 1215 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 1216 1217 vcpu_put(vcpu); 1218 1219 return 0; 1220 } 1221 1222 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1223 struct kvm_tpr_access_ctl *tac) 1224 { 1225 if (tac->flags) 1226 return -EINVAL; 1227 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1228 return 0; 1229 } 1230 1231 long kvm_arch_vcpu_ioctl(struct file *filp, 1232 unsigned int ioctl, unsigned long arg) 1233 { 1234 struct kvm_vcpu *vcpu = filp->private_data; 1235 void __user *argp = (void __user *)arg; 1236 int r; 1237 1238 switch (ioctl) { 1239 case KVM_GET_LAPIC: { 1240 struct kvm_lapic_state lapic; 1241 1242 memset(&lapic, 0, sizeof lapic); 1243 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); 1244 if (r) 1245 goto out; 1246 r = -EFAULT; 1247 if (copy_to_user(argp, &lapic, sizeof lapic)) 1248 goto out; 1249 r = 0; 1250 break; 1251 } 1252 case KVM_SET_LAPIC: { 1253 struct kvm_lapic_state lapic; 1254 1255 r = -EFAULT; 1256 if (copy_from_user(&lapic, argp, sizeof lapic)) 1257 goto out; 1258 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; 1259 if (r) 1260 goto out; 1261 r = 0; 1262 break; 1263 } 1264 case KVM_INTERRUPT: { 1265 struct kvm_interrupt irq; 1266 1267 r = -EFAULT; 1268 if (copy_from_user(&irq, argp, sizeof irq)) 1269 goto out; 1270 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1271 if (r) 1272 goto out; 1273 r = 0; 1274 break; 1275 } 1276 case KVM_SET_CPUID: { 1277 struct kvm_cpuid __user *cpuid_arg = argp; 1278 struct kvm_cpuid cpuid; 1279 1280 r = -EFAULT; 1281 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1282 goto out; 1283 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1284 if (r) 1285 goto out; 1286 break; 1287 } 1288 case KVM_SET_CPUID2: { 1289 struct kvm_cpuid2 __user *cpuid_arg = argp; 1290 struct kvm_cpuid2 cpuid; 1291 1292 r = -EFAULT; 1293 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1294 goto out; 1295 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1296 cpuid_arg->entries); 1297 if (r) 1298 goto out; 1299 break; 1300 } 1301 case KVM_GET_CPUID2: { 1302 struct kvm_cpuid2 __user *cpuid_arg = argp; 1303 struct kvm_cpuid2 cpuid; 1304 1305 r = -EFAULT; 1306 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1307 goto out; 1308 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1309 cpuid_arg->entries); 1310 if (r) 1311 goto out; 1312 r = -EFAULT; 1313 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1314 goto out; 1315 r = 0; 1316 break; 1317 } 1318 case KVM_GET_MSRS: 1319 r = msr_io(vcpu, argp, 
kvm_get_msr, 1); 1320 break; 1321 case KVM_SET_MSRS: 1322 r = msr_io(vcpu, argp, do_set_msr, 0); 1323 break; 1324 case KVM_TPR_ACCESS_REPORTING: { 1325 struct kvm_tpr_access_ctl tac; 1326 1327 r = -EFAULT; 1328 if (copy_from_user(&tac, argp, sizeof tac)) 1329 goto out; 1330 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1331 if (r) 1332 goto out; 1333 r = -EFAULT; 1334 if (copy_to_user(argp, &tac, sizeof tac)) 1335 goto out; 1336 r = 0; 1337 break; 1338 }; 1339 case KVM_SET_VAPIC_ADDR: { 1340 struct kvm_vapic_addr va; 1341 1342 r = -EINVAL; 1343 if (!irqchip_in_kernel(vcpu->kvm)) 1344 goto out; 1345 r = -EFAULT; 1346 if (copy_from_user(&va, argp, sizeof va)) 1347 goto out; 1348 r = 0; 1349 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1350 break; 1351 } 1352 default: 1353 r = -EINVAL; 1354 } 1355 out: 1356 return r; 1357 } 1358 1359 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1360 { 1361 int ret; 1362 1363 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1364 return -1; 1365 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1366 return ret; 1367 } 1368 1369 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1370 u32 kvm_nr_mmu_pages) 1371 { 1372 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1373 return -EINVAL; 1374 1375 down_write(&kvm->slots_lock); 1376 1377 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1378 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1379 1380 up_write(&kvm->slots_lock); 1381 return 0; 1382 } 1383 1384 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1385 { 1386 return kvm->arch.n_alloc_mmu_pages; 1387 } 1388 1389 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1390 { 1391 int i; 1392 struct kvm_mem_alias *alias; 1393 1394 for (i = 0; i < kvm->arch.naliases; ++i) { 1395 alias = &kvm->arch.aliases[i]; 1396 if (gfn >= alias->base_gfn 1397 && gfn < alias->base_gfn + alias->npages) 1398 return alias->target_gfn + gfn - alias->base_gfn; 1399 } 1400 return gfn; 1401 } 1402 1403 /* 1404 * Set a new alias region. Aliases map a portion of physical memory into 1405 * another portion. This is useful for memory windows, for example the PC 1406 * VGA region. 
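 * unalias_gfn() above folds a guest frame that falls inside an alias window
 * back to the corresponding target frame.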
1407 */ 1408 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1409 struct kvm_memory_alias *alias) 1410 { 1411 int r, n; 1412 struct kvm_mem_alias *p; 1413 1414 r = -EINVAL; 1415 /* General sanity checks */ 1416 if (alias->memory_size & (PAGE_SIZE - 1)) 1417 goto out; 1418 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1419 goto out; 1420 if (alias->slot >= KVM_ALIAS_SLOTS) 1421 goto out; 1422 if (alias->guest_phys_addr + alias->memory_size 1423 < alias->guest_phys_addr) 1424 goto out; 1425 if (alias->target_phys_addr + alias->memory_size 1426 < alias->target_phys_addr) 1427 goto out; 1428 1429 down_write(&kvm->slots_lock); 1430 1431 p = &kvm->arch.aliases[alias->slot]; 1432 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1433 p->npages = alias->memory_size >> PAGE_SHIFT; 1434 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1435 1436 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1437 if (kvm->arch.aliases[n - 1].npages) 1438 break; 1439 kvm->arch.naliases = n; 1440 1441 kvm_mmu_zap_all(kvm); 1442 1443 up_write(&kvm->slots_lock); 1444 1445 return 0; 1446 1447 out: 1448 return r; 1449 } 1450 1451 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1452 { 1453 int r; 1454 1455 r = 0; 1456 switch (chip->chip_id) { 1457 case KVM_IRQCHIP_PIC_MASTER: 1458 memcpy(&chip->chip.pic, 1459 &pic_irqchip(kvm)->pics[0], 1460 sizeof(struct kvm_pic_state)); 1461 break; 1462 case KVM_IRQCHIP_PIC_SLAVE: 1463 memcpy(&chip->chip.pic, 1464 &pic_irqchip(kvm)->pics[1], 1465 sizeof(struct kvm_pic_state)); 1466 break; 1467 case KVM_IRQCHIP_IOAPIC: 1468 memcpy(&chip->chip.ioapic, 1469 ioapic_irqchip(kvm), 1470 sizeof(struct kvm_ioapic_state)); 1471 break; 1472 default: 1473 r = -EINVAL; 1474 break; 1475 } 1476 return r; 1477 } 1478 1479 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1480 { 1481 int r; 1482 1483 r = 0; 1484 switch (chip->chip_id) { 1485 case KVM_IRQCHIP_PIC_MASTER: 1486 memcpy(&pic_irqchip(kvm)->pics[0], 1487 &chip->chip.pic, 1488 sizeof(struct kvm_pic_state)); 1489 break; 1490 case KVM_IRQCHIP_PIC_SLAVE: 1491 memcpy(&pic_irqchip(kvm)->pics[1], 1492 &chip->chip.pic, 1493 sizeof(struct kvm_pic_state)); 1494 break; 1495 case KVM_IRQCHIP_IOAPIC: 1496 memcpy(ioapic_irqchip(kvm), 1497 &chip->chip.ioapic, 1498 sizeof(struct kvm_ioapic_state)); 1499 break; 1500 default: 1501 r = -EINVAL; 1502 break; 1503 } 1504 kvm_pic_update_irq(pic_irqchip(kvm)); 1505 return r; 1506 } 1507 1508 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1509 { 1510 int r = 0; 1511 1512 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1513 return r; 1514 } 1515 1516 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1517 { 1518 int r = 0; 1519 1520 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1521 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1522 return r; 1523 } 1524 1525 /* 1526 * Get (and clear) the dirty memory log for a memory slot. 1527 */ 1528 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1529 struct kvm_dirty_log *log) 1530 { 1531 int r; 1532 int n; 1533 struct kvm_memory_slot *memslot; 1534 int is_dirty = 0; 1535 1536 down_write(&kvm->slots_lock); 1537 1538 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1539 if (r) 1540 goto out; 1541 1542 /* If nothing is dirty, don't bother messing with page tables. 
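	 * Otherwise write access is removed from the slot and remote TLBs are
	 * flushed, so later guest writes fault and mark their pages dirty again.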
*/ 1543 if (is_dirty) { 1544 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1545 kvm_flush_remote_tlbs(kvm); 1546 memslot = &kvm->memslots[log->slot]; 1547 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1548 memset(memslot->dirty_bitmap, 0, n); 1549 } 1550 r = 0; 1551 out: 1552 up_write(&kvm->slots_lock); 1553 return r; 1554 } 1555 1556 long kvm_arch_vm_ioctl(struct file *filp, 1557 unsigned int ioctl, unsigned long arg) 1558 { 1559 struct kvm *kvm = filp->private_data; 1560 void __user *argp = (void __user *)arg; 1561 int r = -EINVAL; 1562 1563 switch (ioctl) { 1564 case KVM_SET_TSS_ADDR: 1565 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1566 if (r < 0) 1567 goto out; 1568 break; 1569 case KVM_SET_MEMORY_REGION: { 1570 struct kvm_memory_region kvm_mem; 1571 struct kvm_userspace_memory_region kvm_userspace_mem; 1572 1573 r = -EFAULT; 1574 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1575 goto out; 1576 kvm_userspace_mem.slot = kvm_mem.slot; 1577 kvm_userspace_mem.flags = kvm_mem.flags; 1578 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1579 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1580 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1581 if (r) 1582 goto out; 1583 break; 1584 } 1585 case KVM_SET_NR_MMU_PAGES: 1586 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1587 if (r) 1588 goto out; 1589 break; 1590 case KVM_GET_NR_MMU_PAGES: 1591 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1592 break; 1593 case KVM_SET_MEMORY_ALIAS: { 1594 struct kvm_memory_alias alias; 1595 1596 r = -EFAULT; 1597 if (copy_from_user(&alias, argp, sizeof alias)) 1598 goto out; 1599 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 1600 if (r) 1601 goto out; 1602 break; 1603 } 1604 case KVM_CREATE_IRQCHIP: 1605 r = -ENOMEM; 1606 kvm->arch.vpic = kvm_create_pic(kvm); 1607 if (kvm->arch.vpic) { 1608 r = kvm_ioapic_init(kvm); 1609 if (r) { 1610 kfree(kvm->arch.vpic); 1611 kvm->arch.vpic = NULL; 1612 goto out; 1613 } 1614 } else 1615 goto out; 1616 break; 1617 case KVM_CREATE_PIT: 1618 r = -ENOMEM; 1619 kvm->arch.vpit = kvm_create_pit(kvm); 1620 if (kvm->arch.vpit) 1621 r = 0; 1622 break; 1623 case KVM_IRQ_LINE: { 1624 struct kvm_irq_level irq_event; 1625 1626 r = -EFAULT; 1627 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1628 goto out; 1629 if (irqchip_in_kernel(kvm)) { 1630 mutex_lock(&kvm->lock); 1631 if (irq_event.irq < 16) 1632 kvm_pic_set_irq(pic_irqchip(kvm), 1633 irq_event.irq, 1634 irq_event.level); 1635 kvm_ioapic_set_irq(kvm->arch.vioapic, 1636 irq_event.irq, 1637 irq_event.level); 1638 mutex_unlock(&kvm->lock); 1639 r = 0; 1640 } 1641 break; 1642 } 1643 case KVM_GET_IRQCHIP: { 1644 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1645 struct kvm_irqchip chip; 1646 1647 r = -EFAULT; 1648 if (copy_from_user(&chip, argp, sizeof chip)) 1649 goto out; 1650 r = -ENXIO; 1651 if (!irqchip_in_kernel(kvm)) 1652 goto out; 1653 r = kvm_vm_ioctl_get_irqchip(kvm, &chip); 1654 if (r) 1655 goto out; 1656 r = -EFAULT; 1657 if (copy_to_user(argp, &chip, sizeof chip)) 1658 goto out; 1659 r = 0; 1660 break; 1661 } 1662 case KVM_SET_IRQCHIP: { 1663 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1664 struct kvm_irqchip chip; 1665 1666 r = -EFAULT; 1667 if (copy_from_user(&chip, argp, sizeof chip)) 1668 goto out; 1669 r = -ENXIO; 1670 if (!irqchip_in_kernel(kvm)) 1671 goto out; 1672 r = kvm_vm_ioctl_set_irqchip(kvm, &chip); 1673 if (r) 1674 goto out; 1675 r = 0; 1676 break; 1677 } 1678 case KVM_GET_PIT: { 1679 struct kvm_pit_state ps; 1680 r = -EFAULT; 1681 if (copy_from_user(&ps, argp, 
sizeof ps))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_get_pit(kvm, &ps);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &ps, sizeof ps))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_PIT: {
		struct kvm_pit_state ps;
		r = -EFAULT;
		if (copy_from_user(&ps, argp, sizeof ps))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_set_pit(kvm, &ps);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		;
	}
out:
	return r;
}

static void kvm_init_msr_list(void)
{
	u32 dummy[2];
	unsigned i, j;

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}

/*
 * Only the APIC needs an MMIO device hook, so take this shortcut for now.
 */
static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
						   gpa_t addr)
{
	struct kvm_io_device *dev;

	if (vcpu->arch.apic) {
		dev = &vcpu->arch.apic->dev;
		if (dev->in_range(dev, addr))
			return dev;
	}
	return NULL;
}


static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
						gpa_t addr)
{
	struct kvm_io_device *dev;

	dev = vcpu_find_pervcpu_dev(vcpu, addr);
	if (dev == NULL)
		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
	return dev;
}

int emulator_read_std(unsigned long addr,
		      void *val,
		      unsigned int bytes,
		      struct kvm_vcpu *vcpu)
{
	void *data = val;
	int r = X86EMUL_CONTINUE;

	while (bytes) {
		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
		unsigned offset = addr & (PAGE_SIZE-1);
		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
		int ret;

		if (gpa == UNMAPPED_GVA) {
			r = X86EMUL_PROPAGATE_FAULT;
			goto out;
		}
		ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
		if (ret < 0) {
			r = X86EMUL_UNHANDLEABLE;
			goto out;
		}

		bytes -= tocopy;
		data += tocopy;
		addr += tocopy;
	}
out:
	return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);

static int emulator_read_emulated(unsigned long addr,
				  void *val,
				  unsigned int bytes,
				  struct kvm_vcpu *vcpu)
{
	struct kvm_io_device *mmio_dev;
	gpa_t gpa;

	if (vcpu->mmio_read_completed) {
		memcpy(val, vcpu->mmio_data, bytes);
		vcpu->mmio_read_completed = 0;
		return X86EMUL_CONTINUE;
	}

	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

	/* For APIC access vmexit */
	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto mmio;

	if (emulator_read_std(addr, val, bytes, vcpu)
			== X86EMUL_CONTINUE)
		return X86EMUL_CONTINUE;
	if (gpa == UNMAPPED_GVA)
		return X86EMUL_PROPAGATE_FAULT;

mmio:
	/*
	 * Is this MMIO handled locally?
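	 * The in-kernel APIC and the devices registered on the mmio bus are
	 * tried first; anything unclaimed is forwarded to userspace as a
	 * KVM_EXIT_MMIO exit.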
1821 */ 1822 mutex_lock(&vcpu->kvm->lock); 1823 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1824 if (mmio_dev) { 1825 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1826 mutex_unlock(&vcpu->kvm->lock); 1827 return X86EMUL_CONTINUE; 1828 } 1829 mutex_unlock(&vcpu->kvm->lock); 1830 1831 vcpu->mmio_needed = 1; 1832 vcpu->mmio_phys_addr = gpa; 1833 vcpu->mmio_size = bytes; 1834 vcpu->mmio_is_write = 0; 1835 1836 return X86EMUL_UNHANDLEABLE; 1837 } 1838 1839 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1840 const void *val, int bytes) 1841 { 1842 int ret; 1843 1844 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 1845 if (ret < 0) 1846 return 0; 1847 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1848 return 1; 1849 } 1850 1851 static int emulator_write_emulated_onepage(unsigned long addr, 1852 const void *val, 1853 unsigned int bytes, 1854 struct kvm_vcpu *vcpu) 1855 { 1856 struct kvm_io_device *mmio_dev; 1857 gpa_t gpa; 1858 1859 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1860 1861 if (gpa == UNMAPPED_GVA) { 1862 kvm_inject_page_fault(vcpu, addr, 2); 1863 return X86EMUL_PROPAGATE_FAULT; 1864 } 1865 1866 /* For APIC access vmexit */ 1867 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 1868 goto mmio; 1869 1870 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1871 return X86EMUL_CONTINUE; 1872 1873 mmio: 1874 /* 1875 * Is this MMIO handled locally? 1876 */ 1877 mutex_lock(&vcpu->kvm->lock); 1878 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1879 if (mmio_dev) { 1880 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1881 mutex_unlock(&vcpu->kvm->lock); 1882 return X86EMUL_CONTINUE; 1883 } 1884 mutex_unlock(&vcpu->kvm->lock); 1885 1886 vcpu->mmio_needed = 1; 1887 vcpu->mmio_phys_addr = gpa; 1888 vcpu->mmio_size = bytes; 1889 vcpu->mmio_is_write = 1; 1890 memcpy(vcpu->mmio_data, val, bytes); 1891 1892 return X86EMUL_CONTINUE; 1893 } 1894 1895 int emulator_write_emulated(unsigned long addr, 1896 const void *val, 1897 unsigned int bytes, 1898 struct kvm_vcpu *vcpu) 1899 { 1900 /* Crossing a page boundary? 
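	 * If so, split the access: emulate the piece that fits in the first
	 * page, then the remainder.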
	 */
	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		int rc, now;

		now = -addr & ~PAGE_MASK;
		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
		if (rc != X86EMUL_CONTINUE)
			return rc;
		addr += now;
		val += now;
		bytes -= now;
	}
	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);

static int emulator_cmpxchg_emulated(unsigned long addr,
				     const void *old,
				     const void *new,
				     unsigned int bytes,
				     struct kvm_vcpu *vcpu)
{
	static int reported;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange as write\n");
	}
#ifndef CONFIG_X86_64
	/* guests cmpxchg8b have to be emulated atomically */
	if (bytes == 8) {
		gpa_t gpa;
		struct page *page;
		char *kaddr;
		u64 val;

		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

		if (gpa == UNMAPPED_GVA ||
		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
			goto emul_write;

		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
			goto emul_write;

		val = *(u64 *)new;

		down_read(&current->mm->mmap_sem);
		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
		up_read(&current->mm->mmap_sem);

		kaddr = kmap_atomic(page, KM_USER0);
		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
		kunmap_atomic(kaddr, KM_USER0);
		kvm_release_page_dirty(page);
	}
emul_write:
#endif

	return emulator_write_emulated(addr, new, bytes, vcpu);
}

static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	return kvm_x86_ops->get_segment_base(vcpu, seg);
}

int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
	return X86EMUL_CONTINUE;
}

int emulate_clts(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
	return X86EMUL_CONTINUE;
}

int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	switch (dr) {
	case 0 ... 3:
		*dest = kvm_x86_ops->get_dr(vcpu, dr);
		return X86EMUL_CONTINUE;
	default:
		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
		return X86EMUL_UNHANDLEABLE;
	}
}

int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ?
~0ULL : ~0U; 1995 int exception; 1996 1997 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 1998 if (exception) { 1999 /* FIXME: better handling */ 2000 return X86EMUL_UNHANDLEABLE; 2001 } 2002 return X86EMUL_CONTINUE; 2003 } 2004 2005 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2006 { 2007 static int reported; 2008 u8 opcodes[4]; 2009 unsigned long rip = vcpu->arch.rip; 2010 unsigned long rip_linear; 2011 2012 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2013 2014 if (reported) 2015 return; 2016 2017 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2018 2019 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2020 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2021 reported = 1; 2022 } 2023 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2024 2025 static struct x86_emulate_ops emulate_ops = { 2026 .read_std = emulator_read_std, 2027 .read_emulated = emulator_read_emulated, 2028 .write_emulated = emulator_write_emulated, 2029 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2030 }; 2031 2032 int emulate_instruction(struct kvm_vcpu *vcpu, 2033 struct kvm_run *run, 2034 unsigned long cr2, 2035 u16 error_code, 2036 int emulation_type) 2037 { 2038 int r; 2039 struct decode_cache *c; 2040 2041 vcpu->arch.mmio_fault_cr2 = cr2; 2042 kvm_x86_ops->cache_regs(vcpu); 2043 2044 vcpu->mmio_is_write = 0; 2045 vcpu->arch.pio.string = 0; 2046 2047 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 2048 int cs_db, cs_l; 2049 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 2050 2051 vcpu->arch.emulate_ctxt.vcpu = vcpu; 2052 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 2053 vcpu->arch.emulate_ctxt.mode = 2054 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 2055 ? X86EMUL_MODE_REAL : cs_l 2056 ? X86EMUL_MODE_PROT64 : cs_db 2057 ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2058 2059 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { 2060 vcpu->arch.emulate_ctxt.cs_base = 0; 2061 vcpu->arch.emulate_ctxt.ds_base = 0; 2062 vcpu->arch.emulate_ctxt.es_base = 0; 2063 vcpu->arch.emulate_ctxt.ss_base = 0; 2064 } else { 2065 vcpu->arch.emulate_ctxt.cs_base = 2066 get_segment_base(vcpu, VCPU_SREG_CS); 2067 vcpu->arch.emulate_ctxt.ds_base = 2068 get_segment_base(vcpu, VCPU_SREG_DS); 2069 vcpu->arch.emulate_ctxt.es_base = 2070 get_segment_base(vcpu, VCPU_SREG_ES); 2071 vcpu->arch.emulate_ctxt.ss_base = 2072 get_segment_base(vcpu, VCPU_SREG_SS); 2073 } 2074 2075 vcpu->arch.emulate_ctxt.gs_base = 2076 get_segment_base(vcpu, VCPU_SREG_GS); 2077 vcpu->arch.emulate_ctxt.fs_base = 2078 get_segment_base(vcpu, VCPU_SREG_FS); 2079 2080 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2081 2082 /* Reject the instructions other than VMCALL/VMMCALL when 2083 * try to emulate invalid opcode */ 2084 c = &vcpu->arch.emulate_ctxt.decode; 2085 if ((emulation_type & EMULTYPE_TRAP_UD) && 2086 (!(c->twobyte && c->b == 0x01 && 2087 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2088 c->modrm_mod == 3 && c->modrm_rm == 1))) 2089 return EMULATE_FAIL; 2090 2091 ++vcpu->stat.insn_emulation; 2092 if (r) { 2093 ++vcpu->stat.insn_emulation_fail; 2094 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2095 return EMULATE_DONE; 2096 return EMULATE_FAIL; 2097 } 2098 } 2099 2100 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2101 2102 if (vcpu->arch.pio.string) 2103 return EMULATE_DO_MMIO; 2104 2105 if ((r || vcpu->mmio_is_write) && run) { 2106 run->exit_reason = KVM_EXIT_MMIO; 2107 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2108 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2109 run->mmio.len = vcpu->mmio_size; 2110 run->mmio.is_write = vcpu->mmio_is_write; 2111 } 2112 2113 if (r) { 2114 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2115 return EMULATE_DONE; 2116 if (!vcpu->mmio_needed) { 2117 kvm_report_emulation_failure(vcpu, "mmio"); 2118 return EMULATE_FAIL; 2119 } 2120 return EMULATE_DO_MMIO; 2121 } 2122 2123 kvm_x86_ops->decache_regs(vcpu); 2124 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2125 2126 if (vcpu->mmio_is_write) { 2127 vcpu->mmio_needed = 0; 2128 return EMULATE_DO_MMIO; 2129 } 2130 2131 return EMULATE_DONE; 2132 } 2133 EXPORT_SYMBOL_GPL(emulate_instruction); 2134 2135 static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 2136 { 2137 int i; 2138 2139 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) 2140 if (vcpu->arch.pio.guest_pages[i]) { 2141 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); 2142 vcpu->arch.pio.guest_pages[i] = NULL; 2143 } 2144 } 2145 2146 static int pio_copy_data(struct kvm_vcpu *vcpu) 2147 { 2148 void *p = vcpu->arch.pio_data; 2149 void *q; 2150 unsigned bytes; 2151 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 
2 : 1; 2152 2153 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 2154 PAGE_KERNEL); 2155 if (!q) { 2156 free_pio_guest_pages(vcpu); 2157 return -ENOMEM; 2158 } 2159 q += vcpu->arch.pio.guest_page_offset; 2160 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2161 if (vcpu->arch.pio.in) 2162 memcpy(q, p, bytes); 2163 else 2164 memcpy(p, q, bytes); 2165 q -= vcpu->arch.pio.guest_page_offset; 2166 vunmap(q); 2167 free_pio_guest_pages(vcpu); 2168 return 0; 2169 } 2170 2171 int complete_pio(struct kvm_vcpu *vcpu) 2172 { 2173 struct kvm_pio_request *io = &vcpu->arch.pio; 2174 long delta; 2175 int r; 2176 2177 kvm_x86_ops->cache_regs(vcpu); 2178 2179 if (!io->string) { 2180 if (io->in) 2181 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, 2182 io->size); 2183 } else { 2184 if (io->in) { 2185 r = pio_copy_data(vcpu); 2186 if (r) { 2187 kvm_x86_ops->cache_regs(vcpu); 2188 return r; 2189 } 2190 } 2191 2192 delta = 1; 2193 if (io->rep) { 2194 delta *= io->cur_count; 2195 /* 2196 * The size of the register should really depend on 2197 * current address size. 2198 */ 2199 vcpu->arch.regs[VCPU_REGS_RCX] -= delta; 2200 } 2201 if (io->down) 2202 delta = -delta; 2203 delta *= io->size; 2204 if (io->in) 2205 vcpu->arch.regs[VCPU_REGS_RDI] += delta; 2206 else 2207 vcpu->arch.regs[VCPU_REGS_RSI] += delta; 2208 } 2209 2210 kvm_x86_ops->decache_regs(vcpu); 2211 2212 io->count -= io->cur_count; 2213 io->cur_count = 0; 2214 2215 return 0; 2216 } 2217 2218 static void kernel_pio(struct kvm_io_device *pio_dev, 2219 struct kvm_vcpu *vcpu, 2220 void *pd) 2221 { 2222 /* TODO: String I/O for in kernel device */ 2223 2224 mutex_lock(&vcpu->kvm->lock); 2225 if (vcpu->arch.pio.in) 2226 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2227 vcpu->arch.pio.size, 2228 pd); 2229 else 2230 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2231 vcpu->arch.pio.size, 2232 pd); 2233 mutex_unlock(&vcpu->kvm->lock); 2234 } 2235 2236 static void pio_string_write(struct kvm_io_device *pio_dev, 2237 struct kvm_vcpu *vcpu) 2238 { 2239 struct kvm_pio_request *io = &vcpu->arch.pio; 2240 void *pd = vcpu->arch.pio_data; 2241 int i; 2242 2243 mutex_lock(&vcpu->kvm->lock); 2244 for (i = 0; i < io->cur_count; i++) { 2245 kvm_iodevice_write(pio_dev, io->port, 2246 io->size, 2247 pd); 2248 pd += io->size; 2249 } 2250 mutex_unlock(&vcpu->kvm->lock); 2251 } 2252 2253 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2254 gpa_t addr) 2255 { 2256 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 2257 } 2258 2259 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2260 int size, unsigned port) 2261 { 2262 struct kvm_io_device *pio_dev; 2263 2264 vcpu->run->exit_reason = KVM_EXIT_IO; 2265 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2266 vcpu->run->io.size = vcpu->arch.pio.size = size; 2267 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2268 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2269 vcpu->run->io.port = vcpu->arch.pio.port = port; 2270 vcpu->arch.pio.in = in; 2271 vcpu->arch.pio.string = 0; 2272 vcpu->arch.pio.down = 0; 2273 vcpu->arch.pio.guest_page_offset = 0; 2274 vcpu->arch.pio.rep = 0; 2275 2276 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2277 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2278 handler); 2279 else 2280 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2281 handler); 2282 2283 kvm_x86_ops->cache_regs(vcpu); 2284 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2285 kvm_x86_ops->decache_regs(vcpu); 2286 2287 kvm_x86_ops->skip_emulated_instruction(vcpu); 2288 2289 pio_dev = vcpu_find_pio_dev(vcpu, port); 2290 if (pio_dev) { 2291 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2292 complete_pio(vcpu); 2293 return 1; 2294 } 2295 return 0; 2296 } 2297 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2298 2299 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2300 int size, unsigned long count, int down, 2301 gva_t address, int rep, unsigned port) 2302 { 2303 unsigned now, in_page; 2304 int i, ret = 0; 2305 int nr_pages = 1; 2306 struct page *page; 2307 struct kvm_io_device *pio_dev; 2308 2309 vcpu->run->exit_reason = KVM_EXIT_IO; 2310 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2311 vcpu->run->io.size = vcpu->arch.pio.size = size; 2312 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2313 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2314 vcpu->run->io.port = vcpu->arch.pio.port = port; 2315 vcpu->arch.pio.in = in; 2316 vcpu->arch.pio.string = 1; 2317 vcpu->arch.pio.down = down; 2318 vcpu->arch.pio.guest_page_offset = offset_in_page(address); 2319 vcpu->arch.pio.rep = rep; 2320 2321 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2322 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2323 handler); 2324 else 2325 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2326 handler); 2327 2328 if (!count) { 2329 kvm_x86_ops->skip_emulated_instruction(vcpu); 2330 return 1; 2331 } 2332 2333 if (!down) 2334 in_page = PAGE_SIZE - offset_in_page(address); 2335 else 2336 in_page = offset_in_page(address) + size; 2337 now = min(count, (unsigned long)in_page / size); 2338 if (!now) { 2339 /* 2340 * String I/O straddles page boundary. Pin two guest pages 2341 * so that we satisfy atomicity constraints. Do just one 2342 * transaction to avoid complexity. 2343 */ 2344 nr_pages = 2; 2345 now = 1; 2346 } 2347 if (down) { 2348 /* 2349 * String I/O in reverse. Yuck. Kill the guest, fix later. 
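* ("down" means the guest ran the string instruction with the direction flag set, so addresses decrease on each iteration).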
2350 */ 2351 pr_unimpl(vcpu, "guest string pio down\n"); 2352 kvm_inject_gp(vcpu, 0); 2353 return 1; 2354 } 2355 vcpu->run->io.count = now; 2356 vcpu->arch.pio.cur_count = now; 2357 2358 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2359 kvm_x86_ops->skip_emulated_instruction(vcpu); 2360 2361 for (i = 0; i < nr_pages; ++i) { 2362 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2363 vcpu->arch.pio.guest_pages[i] = page; 2364 if (!page) { 2365 kvm_inject_gp(vcpu, 0); 2366 free_pio_guest_pages(vcpu); 2367 return 1; 2368 } 2369 } 2370 2371 pio_dev = vcpu_find_pio_dev(vcpu, port); 2372 if (!vcpu->arch.pio.in) { 2373 /* string PIO write */ 2374 ret = pio_copy_data(vcpu); 2375 if (ret >= 0 && pio_dev) { 2376 pio_string_write(pio_dev, vcpu); 2377 complete_pio(vcpu); 2378 if (vcpu->arch.pio.count == 0) 2379 ret = 1; 2380 } 2381 } else if (pio_dev) 2382 pr_unimpl(vcpu, "no string pio read support yet, " 2383 "port %x size %d count %ld\n", 2384 port, size, count); 2385 2386 return ret; 2387 } 2388 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2389 2390 int kvm_arch_init(void *opaque) 2391 { 2392 int r; 2393 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2394 2395 if (kvm_x86_ops) { 2396 printk(KERN_ERR "kvm: already loaded the other module\n"); 2397 r = -EEXIST; 2398 goto out; 2399 } 2400 2401 if (!ops->cpu_has_kvm_support()) { 2402 printk(KERN_ERR "kvm: no hardware support\n"); 2403 r = -EOPNOTSUPP; 2404 goto out; 2405 } 2406 if (ops->disabled_by_bios()) { 2407 printk(KERN_ERR "kvm: disabled by bios\n"); 2408 r = -EOPNOTSUPP; 2409 goto out; 2410 } 2411 2412 r = kvm_mmu_module_init(); 2413 if (r) 2414 goto out; 2415 2416 kvm_init_msr_list(); 2417 2418 kvm_x86_ops = ops; 2419 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2420 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2421 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2422 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2423 return 0; 2424 2425 out: 2426 return r; 2427 } 2428 2429 void kvm_arch_exit(void) 2430 { 2431 kvm_x86_ops = NULL; 2432 kvm_mmu_module_exit(); 2433 } 2434 2435 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2436 { 2437 ++vcpu->stat.halt_exits; 2438 KVMTRACE_0D(HLT, vcpu, handler); 2439 if (irqchip_in_kernel(vcpu->kvm)) { 2440 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2441 up_read(&vcpu->kvm->slots_lock); 2442 kvm_vcpu_block(vcpu); 2443 down_read(&vcpu->kvm->slots_lock); 2444 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 2445 return -EINTR; 2446 return 1; 2447 } else { 2448 vcpu->run->exit_reason = KVM_EXIT_HLT; 2449 return 0; 2450 } 2451 } 2452 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2453 2454 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2455 unsigned long a1) 2456 { 2457 if (is_long_mode(vcpu)) 2458 return a0; 2459 else 2460 return a0 | ((gpa_t)a1 << 32); 2461 } 2462 2463 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2464 { 2465 unsigned long nr, a0, a1, a2, a3, ret; 2466 int r = 1; 2467 2468 kvm_x86_ops->cache_regs(vcpu); 2469 2470 nr = vcpu->arch.regs[VCPU_REGS_RAX]; 2471 a0 = vcpu->arch.regs[VCPU_REGS_RBX]; 2472 a1 = vcpu->arch.regs[VCPU_REGS_RCX]; 2473 a2 = vcpu->arch.regs[VCPU_REGS_RDX]; 2474 a3 = vcpu->arch.regs[VCPU_REGS_RSI]; 2475 2476 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2477 2478 if (!is_long_mode(vcpu)) { 2479 nr &= 0xFFFFFFFF; 2480 a0 &= 0xFFFFFFFF; 2481 a1 &= 0xFFFFFFFF; 2482 a2 &= 0xFFFFFFFF; 2483 a3 &= 0xFFFFFFFF; 2484 } 2485 2486 switch (nr) { 2487 case KVM_HC_VAPIC_POLL_IRQ: 2488 ret = 0; 2489 break; 2490 case KVM_HC_MMU_OP: 2491 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), 
&ret); 2492 break; 2493 default: 2494 ret = -KVM_ENOSYS; 2495 break; 2496 } 2497 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2498 kvm_x86_ops->decache_regs(vcpu); 2499 ++vcpu->stat.hypercalls; 2500 return r; 2501 } 2502 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2503 2504 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2505 { 2506 char instruction[3]; 2507 int ret = 0; 2508 2509 2510 /* 2511 * Blow out the MMU so that no other VCPU keeps an active mapping; 2512 * this ensures that the updated hypercall appears atomically across all 2513 * VCPUs. 2514 */ 2515 kvm_mmu_zap_all(vcpu->kvm); 2516 2517 kvm_x86_ops->cache_regs(vcpu); 2518 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2519 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) 2520 != X86EMUL_CONTINUE) 2521 ret = -EFAULT; 2522 2523 return ret; 2524 } 2525 2526 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2527 { 2528 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2529 } 2530 2531 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2532 { 2533 struct descriptor_table dt = { limit, base }; 2534 2535 kvm_x86_ops->set_gdt(vcpu, &dt); 2536 } 2537 2538 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2539 { 2540 struct descriptor_table dt = { limit, base }; 2541 2542 kvm_x86_ops->set_idt(vcpu, &dt); 2543 } 2544 2545 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2546 unsigned long *rflags) 2547 { 2548 kvm_lmsw(vcpu, msw); 2549 *rflags = kvm_x86_ops->get_rflags(vcpu); 2550 } 2551 2552 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2553 { 2554 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2555 switch (cr) { 2556 case 0: 2557 return vcpu->arch.cr0; 2558 case 2: 2559 return vcpu->arch.cr2; 2560 case 3: 2561 return vcpu->arch.cr3; 2562 case 4: 2563 return vcpu->arch.cr4; 2564 case 8: 2565 return kvm_get_cr8(vcpu); 2566 default: 2567 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2568 return 0; 2569 } 2570 } 2571 2572 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2573 unsigned long *rflags) 2574 { 2575 switch (cr) { 2576 case 0: 2577 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2578 *rflags = kvm_x86_ops->get_rflags(vcpu); 2579 break; 2580 case 2: 2581 vcpu->arch.cr2 = val; 2582 break; 2583 case 3: 2584 kvm_set_cr3(vcpu, val); 2585 break; 2586 case 4: 2587 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2588 break; 2589 case 8: 2590 kvm_set_cr8(vcpu, val & 0xfUL); 2591 break; 2592 default: 2593 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2594 } 2595 } 2596 2597 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 2598 { 2599 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 2600 int j, nent = vcpu->arch.cpuid_nent; 2601 2602 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2603 /* when no next entry is found, the current entry[i] is reselected */ 2604 for (j = (i + 1) % nent; ; j = (j + 1) % nent) { 2605 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2606 if (ej->function == e->function) { 2607 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2608 return j; 2609 } 2610 } 2611 return 0; /* silence gcc, even though control never reaches here */ 2612 } 2613 2614 /* find an entry with matching function, matching index (if needed), and that 2615 * should be read next (if it's stateful) */ 2616 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 2617 u32 function, u32 index) 2618 { 2619 if (e->function != function) 2620 return 0; 2621 if ((e->flags & 
KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 2622 return 0; 2623 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 2624 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 2625 return 0; 2626 return 1; 2627 } 2628 2629 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 2630 { 2631 int i; 2632 u32 function, index; 2633 struct kvm_cpuid_entry2 *e, *best; 2634 2635 kvm_x86_ops->cache_regs(vcpu); 2636 function = vcpu->arch.regs[VCPU_REGS_RAX]; 2637 index = vcpu->arch.regs[VCPU_REGS_RCX]; 2638 vcpu->arch.regs[VCPU_REGS_RAX] = 0; 2639 vcpu->arch.regs[VCPU_REGS_RBX] = 0; 2640 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 2641 vcpu->arch.regs[VCPU_REGS_RDX] = 0; 2642 best = NULL; 2643 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2644 e = &vcpu->arch.cpuid_entries[i]; 2645 if (is_matching_cpuid_entry(e, function, index)) { 2646 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 2647 move_to_next_stateful_cpuid_entry(vcpu, i); 2648 best = e; 2649 break; 2650 } 2651 /* 2652 * Both basic or both extended? 2653 */ 2654 if (((e->function ^ function) & 0x80000000) == 0) 2655 if (!best || e->function > best->function) 2656 best = e; 2657 } 2658 if (best) { 2659 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; 2660 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; 2661 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; 2662 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; 2663 } 2664 kvm_x86_ops->decache_regs(vcpu); 2665 kvm_x86_ops->skip_emulated_instruction(vcpu); 2666 KVMTRACE_5D(CPUID, vcpu, function, 2667 (u32)vcpu->arch.regs[VCPU_REGS_RAX], 2668 (u32)vcpu->arch.regs[VCPU_REGS_RBX], 2669 (u32)vcpu->arch.regs[VCPU_REGS_RCX], 2670 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); 2671 } 2672 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2673 2674 /* 2675 * Check if userspace requested an interrupt window, and that the 2676 * interrupt window is open. 2677 * 2678 * No need to exit to userspace if we already have an interrupt queued. 
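* (vcpu->arch.irq_summary being non-zero means an interrupt is already queued for injection).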
2679 */ 2680 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 2681 struct kvm_run *kvm_run) 2682 { 2683 return (!vcpu->arch.irq_summary && 2684 kvm_run->request_interrupt_window && 2685 vcpu->arch.interrupt_window_open && 2686 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 2687 } 2688 2689 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 2690 struct kvm_run *kvm_run) 2691 { 2692 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 2693 kvm_run->cr8 = kvm_get_cr8(vcpu); 2694 kvm_run->apic_base = kvm_get_apic_base(vcpu); 2695 if (irqchip_in_kernel(vcpu->kvm)) 2696 kvm_run->ready_for_interrupt_injection = 1; 2697 else 2698 kvm_run->ready_for_interrupt_injection = 2699 (vcpu->arch.interrupt_window_open && 2700 vcpu->arch.irq_summary == 0); 2701 } 2702 2703 static void vapic_enter(struct kvm_vcpu *vcpu) 2704 { 2705 struct kvm_lapic *apic = vcpu->arch.apic; 2706 struct page *page; 2707 2708 if (!apic || !apic->vapic_addr) 2709 return; 2710 2711 down_read(&current->mm->mmap_sem); 2712 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2713 up_read(&current->mm->mmap_sem); 2714 2715 vcpu->arch.apic->vapic_page = page; 2716 } 2717 2718 static void vapic_exit(struct kvm_vcpu *vcpu) 2719 { 2720 struct kvm_lapic *apic = vcpu->arch.apic; 2721 2722 if (!apic || !apic->vapic_addr) 2723 return; 2724 2725 kvm_release_page_dirty(apic->vapic_page); 2726 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2727 } 2728 2729 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2730 { 2731 int r; 2732 2733 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 2734 pr_debug("vcpu %d received sipi with vector # %x\n", 2735 vcpu->vcpu_id, vcpu->arch.sipi_vector); 2736 kvm_lapic_reset(vcpu); 2737 r = kvm_x86_ops->vcpu_reset(vcpu); 2738 if (r) 2739 return r; 2740 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 2741 } 2742 2743 down_read(&vcpu->kvm->slots_lock); 2744 vapic_enter(vcpu); 2745 2746 preempted: 2747 if (vcpu->guest_debug.enabled) 2748 kvm_x86_ops->guest_debug_pre(vcpu); 2749 2750 again: 2751 if (vcpu->requests) 2752 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2753 kvm_mmu_unload(vcpu); 2754 2755 r = kvm_mmu_reload(vcpu); 2756 if (unlikely(r)) 2757 goto out; 2758 2759 if (vcpu->requests) { 2760 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2761 __kvm_migrate_apic_timer(vcpu); 2762 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2763 &vcpu->requests)) { 2764 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2765 r = 0; 2766 goto out; 2767 } 2768 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 2769 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2770 r = 0; 2771 goto out; 2772 } 2773 } 2774 2775 kvm_inject_pending_timer_irqs(vcpu); 2776 2777 preempt_disable(); 2778 2779 kvm_x86_ops->prepare_guest_switch(vcpu); 2780 kvm_load_guest_fpu(vcpu); 2781 2782 local_irq_disable(); 2783 2784 if (need_resched()) { 2785 local_irq_enable(); 2786 preempt_enable(); 2787 r = 1; 2788 goto out; 2789 } 2790 2791 if (vcpu->requests) 2792 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { 2793 local_irq_enable(); 2794 preempt_enable(); 2795 r = 1; 2796 goto out; 2797 } 2798 2799 if (signal_pending(current)) { 2800 local_irq_enable(); 2801 preempt_enable(); 2802 r = -EINTR; 2803 kvm_run->exit_reason = KVM_EXIT_INTR; 2804 ++vcpu->stat.signal_exits; 2805 goto out; 2806 } 2807 2808 vcpu->guest_mode = 1; 2809 /* 2810 * Make sure that guest_mode assignment won't happen after 2811 * testing the pending IRQ vector bitmap. 
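* (kvm_vcpu_kick() checks guest_mode from other CPUs and only sends an IPI when it is set).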
2812 */ 2813 smp_wmb(); 2814 2815 if (vcpu->arch.exception.pending) 2816 __queue_exception(vcpu); 2817 else if (irqchip_in_kernel(vcpu->kvm)) 2818 kvm_x86_ops->inject_pending_irq(vcpu); 2819 else 2820 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2821 2822 kvm_lapic_sync_to_vapic(vcpu); 2823 2824 up_read(&vcpu->kvm->slots_lock); 2825 2826 kvm_guest_enter(); 2827 2828 if (vcpu->requests) 2829 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2830 kvm_x86_ops->tlb_flush(vcpu); 2831 2832 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2833 kvm_x86_ops->run(vcpu, kvm_run); 2834 2835 vcpu->guest_mode = 0; 2836 local_irq_enable(); 2837 2838 ++vcpu->stat.exits; 2839 2840 /* 2841 * We must have an instruction between local_irq_enable() and 2842 * kvm_guest_exit(), so the timer interrupt isn't delayed by 2843 * the interrupt shadow. The stat.exits increment will do nicely. 2844 * But we need to prevent reordering, hence this barrier(): 2845 */ 2846 barrier(); 2847 2848 kvm_guest_exit(); 2849 2850 preempt_enable(); 2851 2852 down_read(&vcpu->kvm->slots_lock); 2853 2854 /* 2855 * Profile KVM exit RIPs: 2856 */ 2857 if (unlikely(prof_on == KVM_PROFILING)) { 2858 kvm_x86_ops->cache_regs(vcpu); 2859 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); 2860 } 2861 2862 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2863 vcpu->arch.exception.pending = false; 2864 2865 kvm_lapic_sync_from_vapic(vcpu); 2866 2867 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2868 2869 if (r > 0) { 2870 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2871 r = -EINTR; 2872 kvm_run->exit_reason = KVM_EXIT_INTR; 2873 ++vcpu->stat.request_irq_exits; 2874 goto out; 2875 } 2876 if (!need_resched()) 2877 goto again; 2878 } 2879 2880 out: 2881 up_read(&vcpu->kvm->slots_lock); 2882 if (r > 0) { 2883 kvm_resched(vcpu); 2884 down_read(&vcpu->kvm->slots_lock); 2885 goto preempted; 2886 } 2887 2888 post_kvm_run_save(vcpu, kvm_run); 2889 2890 down_read(&vcpu->kvm->slots_lock); 2891 vapic_exit(vcpu); 2892 up_read(&vcpu->kvm->slots_lock); 2893 2894 return r; 2895 } 2896 2897 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2898 { 2899 int r; 2900 sigset_t sigsaved; 2901 2902 vcpu_load(vcpu); 2903 2904 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 2905 kvm_vcpu_block(vcpu); 2906 vcpu_put(vcpu); 2907 return -EAGAIN; 2908 } 2909 2910 if (vcpu->sigset_active) 2911 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 2912 2913 /* re-sync apic's tpr */ 2914 if (!irqchip_in_kernel(vcpu->kvm)) 2915 kvm_set_cr8(vcpu, kvm_run->cr8); 2916 2917 if (vcpu->arch.pio.cur_count) { 2918 r = complete_pio(vcpu); 2919 if (r) 2920 goto out; 2921 } 2922 #if CONFIG_HAS_IOMEM 2923 if (vcpu->mmio_needed) { 2924 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 2925 vcpu->mmio_read_completed = 1; 2926 vcpu->mmio_needed = 0; 2927 2928 down_read(&vcpu->kvm->slots_lock); 2929 r = emulate_instruction(vcpu, kvm_run, 2930 vcpu->arch.mmio_fault_cr2, 0, 2931 EMULTYPE_NO_DECODE); 2932 up_read(&vcpu->kvm->slots_lock); 2933 if (r == EMULATE_DO_MMIO) { 2934 /* 2935 * Read-modify-write. Back to userspace. 
2936 */ 2937 r = 0; 2938 goto out; 2939 } 2940 } 2941 #endif 2942 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 2943 kvm_x86_ops->cache_regs(vcpu); 2944 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 2945 kvm_x86_ops->decache_regs(vcpu); 2946 } 2947 2948 r = __vcpu_run(vcpu, kvm_run); 2949 2950 out: 2951 if (vcpu->sigset_active) 2952 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 2953 2954 vcpu_put(vcpu); 2955 return r; 2956 } 2957 2958 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 2959 { 2960 vcpu_load(vcpu); 2961 2962 kvm_x86_ops->cache_regs(vcpu); 2963 2964 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2965 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; 2966 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; 2967 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; 2968 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; 2969 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; 2970 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2971 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; 2972 #ifdef CONFIG_X86_64 2973 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; 2974 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; 2975 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; 2976 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; 2977 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; 2978 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; 2979 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; 2980 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; 2981 #endif 2982 2983 regs->rip = vcpu->arch.rip; 2984 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 2985 2986 /* 2987 * Don't leak debug flags in case they were set for guest debugging 2988 */ 2989 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) 2990 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 2991 2992 vcpu_put(vcpu); 2993 2994 return 0; 2995 } 2996 2997 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 2998 { 2999 vcpu_load(vcpu); 3000 3001 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; 3002 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; 3003 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; 3004 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; 3005 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; 3006 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; 3007 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; 3008 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; 3009 #ifdef CONFIG_X86_64 3010 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; 3011 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; 3012 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; 3013 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; 3014 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; 3015 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; 3016 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; 3017 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; 3018 #endif 3019 3020 vcpu->arch.rip = regs->rip; 3021 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3022 3023 kvm_x86_ops->decache_regs(vcpu); 3024 3025 vcpu->arch.exception.pending = false; 3026 3027 vcpu_put(vcpu); 3028 3029 return 0; 3030 } 3031 3032 static void get_segment(struct kvm_vcpu *vcpu, 3033 struct kvm_segment *var, int seg) 3034 { 3035 kvm_x86_ops->get_segment(vcpu, var, seg); 3036 } 3037 3038 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3039 { 3040 struct kvm_segment cs; 3041 3042 get_segment(vcpu, &cs, VCPU_SREG_CS); 3043 *db = cs.db; 3044 *l = cs.l; 3045 } 3046 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3047 3048 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3049 struct kvm_sregs *sregs) 3050 { 3051 struct descriptor_table dt; 3052 int pending_vec; 3053 3054 vcpu_load(vcpu); 3055 3056 
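/* Copy the segment registers, descriptor tables, control registers and pending interrupt state into the userspace-visible sregs layout. */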
get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3057 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3058 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3059 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3060 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3061 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3062 3063 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3064 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3065 3066 kvm_x86_ops->get_idt(vcpu, &dt); 3067 sregs->idt.limit = dt.limit; 3068 sregs->idt.base = dt.base; 3069 kvm_x86_ops->get_gdt(vcpu, &dt); 3070 sregs->gdt.limit = dt.limit; 3071 sregs->gdt.base = dt.base; 3072 3073 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3074 sregs->cr0 = vcpu->arch.cr0; 3075 sregs->cr2 = vcpu->arch.cr2; 3076 sregs->cr3 = vcpu->arch.cr3; 3077 sregs->cr4 = vcpu->arch.cr4; 3078 sregs->cr8 = kvm_get_cr8(vcpu); 3079 sregs->efer = vcpu->arch.shadow_efer; 3080 sregs->apic_base = kvm_get_apic_base(vcpu); 3081 3082 if (irqchip_in_kernel(vcpu->kvm)) { 3083 memset(sregs->interrupt_bitmap, 0, 3084 sizeof sregs->interrupt_bitmap); 3085 pending_vec = kvm_x86_ops->get_irq(vcpu); 3086 if (pending_vec >= 0) 3087 set_bit(pending_vec, 3088 (unsigned long *)sregs->interrupt_bitmap); 3089 } else 3090 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, 3091 sizeof sregs->interrupt_bitmap); 3092 3093 vcpu_put(vcpu); 3094 3095 return 0; 3096 } 3097 3098 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3099 struct kvm_mp_state *mp_state) 3100 { 3101 vcpu_load(vcpu); 3102 mp_state->mp_state = vcpu->arch.mp_state; 3103 vcpu_put(vcpu); 3104 return 0; 3105 } 3106 3107 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3108 struct kvm_mp_state *mp_state) 3109 { 3110 vcpu_load(vcpu); 3111 vcpu->arch.mp_state = mp_state->mp_state; 3112 vcpu_put(vcpu); 3113 return 0; 3114 } 3115 3116 static void set_segment(struct kvm_vcpu *vcpu, 3117 struct kvm_segment *var, int seg) 3118 { 3119 kvm_x86_ops->set_segment(vcpu, var, seg); 3120 } 3121 3122 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3123 struct kvm_segment *kvm_desct) 3124 { 3125 kvm_desct->base = seg_desc->base0; 3126 kvm_desct->base |= seg_desc->base1 << 16; 3127 kvm_desct->base |= seg_desc->base2 << 24; 3128 kvm_desct->limit = seg_desc->limit0; 3129 kvm_desct->limit |= seg_desc->limit << 16; 3130 kvm_desct->selector = selector; 3131 kvm_desct->type = seg_desc->type; 3132 kvm_desct->present = seg_desc->p; 3133 kvm_desct->dpl = seg_desc->dpl; 3134 kvm_desct->db = seg_desc->d; 3135 kvm_desct->s = seg_desc->s; 3136 kvm_desct->l = seg_desc->l; 3137 kvm_desct->g = seg_desc->g; 3138 kvm_desct->avl = seg_desc->avl; 3139 if (!selector) 3140 kvm_desct->unusable = 1; 3141 else 3142 kvm_desct->unusable = 0; 3143 kvm_desct->padding = 0; 3144 } 3145 3146 static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, 3147 u16 selector, 3148 struct descriptor_table *dtable) 3149 { 3150 if (selector & 1 << 2) { 3151 struct kvm_segment kvm_seg; 3152 3153 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3154 3155 if (kvm_seg.unusable) 3156 dtable->limit = 0; 3157 else 3158 dtable->limit = kvm_seg.limit; 3159 dtable->base = kvm_seg.base; 3160 } 3161 else 3162 kvm_x86_ops->get_gdt(vcpu, dtable); 3163 } 3164 3165 /* allowed just for 8 bytes segments */ 3166 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3167 struct desc_struct *seg_desc) 3168 { 3169 struct descriptor_table dtable; 3170 u16 index = selector >> 3; 3171 3172 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3173 
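/* each descriptor entry is 8 bytes; the selected entry must lie entirely within the table limit */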
3174 if (dtable.limit < index * 8 + 7) { 3175 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3176 return 1; 3177 } 3178 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3179 } 3180 3181 /* allowed just for 8 bytes segments */ 3182 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3183 struct desc_struct *seg_desc) 3184 { 3185 struct descriptor_table dtable; 3186 u16 index = selector >> 3; 3187 3188 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3189 3190 if (dtable.limit < index * 8 + 7) 3191 return 1; 3192 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3193 } 3194 3195 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3196 struct desc_struct *seg_desc) 3197 { 3198 u32 base_addr; 3199 3200 base_addr = seg_desc->base0; 3201 base_addr |= (seg_desc->base1 << 16); 3202 base_addr |= (seg_desc->base2 << 24); 3203 3204 return base_addr; 3205 } 3206 3207 static int load_tss_segment32(struct kvm_vcpu *vcpu, 3208 struct desc_struct *seg_desc, 3209 struct tss_segment_32 *tss) 3210 { 3211 u32 base_addr; 3212 3213 base_addr = get_tss_base_addr(vcpu, seg_desc); 3214 3215 return kvm_read_guest(vcpu->kvm, base_addr, tss, 3216 sizeof(struct tss_segment_32)); 3217 } 3218 3219 static int save_tss_segment32(struct kvm_vcpu *vcpu, 3220 struct desc_struct *seg_desc, 3221 struct tss_segment_32 *tss) 3222 { 3223 u32 base_addr; 3224 3225 base_addr = get_tss_base_addr(vcpu, seg_desc); 3226 3227 return kvm_write_guest(vcpu->kvm, base_addr, tss, 3228 sizeof(struct tss_segment_32)); 3229 } 3230 3231 static int load_tss_segment16(struct kvm_vcpu *vcpu, 3232 struct desc_struct *seg_desc, 3233 struct tss_segment_16 *tss) 3234 { 3235 u32 base_addr; 3236 3237 base_addr = get_tss_base_addr(vcpu, seg_desc); 3238 3239 return kvm_read_guest(vcpu->kvm, base_addr, tss, 3240 sizeof(struct tss_segment_16)); 3241 } 3242 3243 static int save_tss_segment16(struct kvm_vcpu *vcpu, 3244 struct desc_struct *seg_desc, 3245 struct tss_segment_16 *tss) 3246 { 3247 u32 base_addr; 3248 3249 base_addr = get_tss_base_addr(vcpu, seg_desc); 3250 3251 return kvm_write_guest(vcpu->kvm, base_addr, tss, 3252 sizeof(struct tss_segment_16)); 3253 } 3254 3255 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3256 { 3257 struct kvm_segment kvm_seg; 3258 3259 get_segment(vcpu, &kvm_seg, seg); 3260 return kvm_seg.selector; 3261 } 3262 3263 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3264 u16 selector, 3265 struct kvm_segment *kvm_seg) 3266 { 3267 struct desc_struct seg_desc; 3268 3269 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3270 return 1; 3271 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3272 return 0; 3273 } 3274 3275 static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3276 int type_bits, int seg) 3277 { 3278 struct kvm_segment kvm_seg; 3279 3280 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3281 return 1; 3282 kvm_seg.type |= type_bits; 3283 3284 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3285 seg != VCPU_SREG_LDTR) 3286 if (!kvm_seg.s) 3287 kvm_seg.unusable = 1; 3288 3289 set_segment(vcpu, &kvm_seg, seg); 3290 return 0; 3291 } 3292 3293 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 3294 struct tss_segment_32 *tss) 3295 { 3296 tss->cr3 = vcpu->arch.cr3; 3297 tss->eip = vcpu->arch.rip; 3298 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3299 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; 3300 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 
3301 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; 3302 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; 3303 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; 3304 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; 3305 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; 3306 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; 3307 3308 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3309 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3310 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3311 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3312 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3313 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3314 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3315 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3316 } 3317 3318 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3319 struct tss_segment_32 *tss) 3320 { 3321 kvm_set_cr3(vcpu, tss->cr3); 3322 3323 vcpu->arch.rip = tss->eip; 3324 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3325 3326 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; 3327 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; 3328 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; 3329 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; 3330 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; 3331 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; 3332 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3333 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3334 3335 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3336 return 1; 3337 3338 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3339 return 1; 3340 3341 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3342 return 1; 3343 3344 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3345 return 1; 3346 3347 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3348 return 1; 3349 3350 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3351 return 1; 3352 3353 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3354 return 1; 3355 return 0; 3356 } 3357 3358 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3359 struct tss_segment_16 *tss) 3360 { 3361 tss->ip = vcpu->arch.rip; 3362 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3363 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; 3364 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; 3365 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; 3366 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; 3367 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; 3368 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; 3369 tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; 3370 tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; 3371 3372 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3373 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3374 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3375 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3376 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3377 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3378 } 3379 3380 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3381 struct tss_segment_16 *tss) 3382 { 3383 vcpu->arch.rip = tss->ip; 3384 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3385 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; 3386 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; 3387 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; 3388 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; 3389 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; 3390 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; 3391 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3392 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3393 3394 if 
(load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3395 return 1; 3396 3397 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3398 return 1; 3399 3400 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3401 return 1; 3402 3403 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3404 return 1; 3405 3406 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3407 return 1; 3408 return 0; 3409 } 3410 3411 int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3412 struct desc_struct *cseg_desc, 3413 struct desc_struct *nseg_desc) 3414 { 3415 struct tss_segment_16 tss_segment_16; 3416 int ret = 0; 3417 3418 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3419 goto out; 3420 3421 save_state_to_tss16(vcpu, &tss_segment_16); 3422 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16); 3423 3424 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3425 goto out; 3426 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3427 goto out; 3428 3429 ret = 1; 3430 out: 3431 return ret; 3432 } 3433 3434 int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3435 struct desc_struct *cseg_desc, 3436 struct desc_struct *nseg_desc) 3437 { 3438 struct tss_segment_32 tss_segment_32; 3439 int ret = 0; 3440 3441 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3442 goto out; 3443 3444 save_state_to_tss32(vcpu, &tss_segment_32); 3445 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32); 3446 3447 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3448 goto out; 3449 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3450 goto out; 3451 3452 ret = 1; 3453 out: 3454 return ret; 3455 } 3456 3457 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3458 { 3459 struct kvm_segment tr_seg; 3460 struct desc_struct cseg_desc; 3461 struct desc_struct nseg_desc; 3462 int ret = 0; 3463 3464 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3465 3466 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3467 goto out; 3468 3469 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3470 goto out; 3471 3472 3473 if (reason != TASK_SWITCH_IRET) { 3474 int cpl; 3475 3476 cpl = kvm_x86_ops->get_cpl(vcpu); 3477 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3478 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3479 return 1; 3480 } 3481 } 3482 3483 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3484 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3485 return 1; 3486 } 3487 3488 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3489 cseg_desc.type &= ~(1 << 1); //clear the B flag 3490 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3491 &cseg_desc); 3492 } 3493 3494 if (reason == TASK_SWITCH_IRET) { 3495 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3496 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 3497 } 3498 3499 kvm_x86_ops->skip_emulated_instruction(vcpu); 3500 kvm_x86_ops->cache_regs(vcpu); 3501 3502 if (nseg_desc.type & 8) 3503 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3504 &nseg_desc); 3505 else 3506 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3507 &nseg_desc); 3508 3509 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3510 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3511 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 3512 } 3513 3514 if (reason != TASK_SWITCH_IRET) { 3515 nseg_desc.type |= (1 << 1); 3516 save_guest_segment_descriptor(vcpu, tss_selector, 3517 
&nseg_desc); 3518 } 3519 3520 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3521 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3522 tr_seg.type = 11; 3523 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3524 out: 3525 kvm_x86_ops->decache_regs(vcpu); 3526 return ret; 3527 } 3528 EXPORT_SYMBOL_GPL(kvm_task_switch); 3529 3530 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 3531 struct kvm_sregs *sregs) 3532 { 3533 int mmu_reset_needed = 0; 3534 int i, pending_vec, max_bits; 3535 struct descriptor_table dt; 3536 3537 vcpu_load(vcpu); 3538 3539 dt.limit = sregs->idt.limit; 3540 dt.base = sregs->idt.base; 3541 kvm_x86_ops->set_idt(vcpu, &dt); 3542 dt.limit = sregs->gdt.limit; 3543 dt.base = sregs->gdt.base; 3544 kvm_x86_ops->set_gdt(vcpu, &dt); 3545 3546 vcpu->arch.cr2 = sregs->cr2; 3547 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 3548 vcpu->arch.cr3 = sregs->cr3; 3549 3550 kvm_set_cr8(vcpu, sregs->cr8); 3551 3552 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 3553 kvm_x86_ops->set_efer(vcpu, sregs->efer); 3554 kvm_set_apic_base(vcpu, sregs->apic_base); 3555 3556 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3557 3558 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 3559 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 3560 vcpu->arch.cr0 = sregs->cr0; 3561 3562 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 3563 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 3564 if (!is_long_mode(vcpu) && is_pae(vcpu)) 3565 load_pdptrs(vcpu, vcpu->arch.cr3); 3566 3567 if (mmu_reset_needed) 3568 kvm_mmu_reset_context(vcpu); 3569 3570 if (!irqchip_in_kernel(vcpu->kvm)) { 3571 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 3572 sizeof vcpu->arch.irq_pending); 3573 vcpu->arch.irq_summary = 0; 3574 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 3575 if (vcpu->arch.irq_pending[i]) 3576 __set_bit(i, &vcpu->arch.irq_summary); 3577 } else { 3578 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 3579 pending_vec = find_first_bit( 3580 (const unsigned long *)sregs->interrupt_bitmap, 3581 max_bits); 3582 /* Only pending external irq is handled here */ 3583 if (pending_vec < max_bits) { 3584 kvm_x86_ops->set_irq(vcpu, pending_vec); 3585 pr_debug("Set back pending irq %d\n", 3586 pending_vec); 3587 } 3588 } 3589 3590 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3591 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3592 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3593 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3594 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3595 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3596 3597 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3598 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3599 3600 vcpu_put(vcpu); 3601 3602 return 0; 3603 } 3604 3605 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 3606 struct kvm_debug_guest *dbg) 3607 { 3608 int r; 3609 3610 vcpu_load(vcpu); 3611 3612 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 3613 3614 vcpu_put(vcpu); 3615 3616 return r; 3617 } 3618 3619 /* 3620 * fxsave fpu state. Taken from x86_64/processor.h. 
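(The layout matches the hardware FXSAVE area; the get/set fpu ioctls alias vcpu->arch.guest_fx_image through it.)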
To be killed when 3621 * we have asm/x86/processor.h 3622 */ 3623 struct fxsave { 3624 u16 cwd; 3625 u16 swd; 3626 u16 twd; 3627 u16 fop; 3628 u64 rip; 3629 u64 rdp; 3630 u32 mxcsr; 3631 u32 mxcsr_mask; 3632 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 3633 #ifdef CONFIG_X86_64 3634 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 3635 #else 3636 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 3637 #endif 3638 }; 3639 3640 /* 3641 * Translate a guest virtual address to a guest physical address. 3642 */ 3643 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 3644 struct kvm_translation *tr) 3645 { 3646 unsigned long vaddr = tr->linear_address; 3647 gpa_t gpa; 3648 3649 vcpu_load(vcpu); 3650 down_read(&vcpu->kvm->slots_lock); 3651 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 3652 up_read(&vcpu->kvm->slots_lock); 3653 tr->physical_address = gpa; 3654 tr->valid = gpa != UNMAPPED_GVA; 3655 tr->writeable = 1; 3656 tr->usermode = 0; 3657 vcpu_put(vcpu); 3658 3659 return 0; 3660 } 3661 3662 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 3663 { 3664 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 3665 3666 vcpu_load(vcpu); 3667 3668 memcpy(fpu->fpr, fxsave->st_space, 128); 3669 fpu->fcw = fxsave->cwd; 3670 fpu->fsw = fxsave->swd; 3671 fpu->ftwx = fxsave->twd; 3672 fpu->last_opcode = fxsave->fop; 3673 fpu->last_ip = fxsave->rip; 3674 fpu->last_dp = fxsave->rdp; 3675 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 3676 3677 vcpu_put(vcpu); 3678 3679 return 0; 3680 } 3681 3682 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 3683 { 3684 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 3685 3686 vcpu_load(vcpu); 3687 3688 memcpy(fxsave->st_space, fpu->fpr, 128); 3689 fxsave->cwd = fpu->fcw; 3690 fxsave->swd = fpu->fsw; 3691 fxsave->twd = fpu->ftwx; 3692 fxsave->fop = fpu->last_opcode; 3693 fxsave->rip = fpu->last_ip; 3694 fxsave->rdp = fpu->last_dp; 3695 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 3696 3697 vcpu_put(vcpu); 3698 3699 return 0; 3700 } 3701 3702 void fx_init(struct kvm_vcpu *vcpu) 3703 { 3704 unsigned after_mxcsr_mask; 3705 3706 /* 3707 * Touch the fpu the first time in non atomic context as if 3708 * this is the first fpu instruction the exception handler 3709 * will fire before the instruction returns and it'll have to 3710 * allocate ram with GFP_KERNEL. 
3711 */ 3712 if (!used_math()) 3713 fx_save(&vcpu->arch.host_fx_image); 3714 3715 /* Initialize guest FPU by resetting ours and saving into guest's */ 3716 preempt_disable(); 3717 fx_save(&vcpu->arch.host_fx_image); 3718 fx_finit(); 3719 fx_save(&vcpu->arch.guest_fx_image); 3720 fx_restore(&vcpu->arch.host_fx_image); 3721 preempt_enable(); 3722 3723 vcpu->arch.cr0 |= X86_CR0_ET; 3724 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 3725 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 3726 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 3727 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 3728 } 3729 EXPORT_SYMBOL_GPL(fx_init); 3730 3731 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 3732 { 3733 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 3734 return; 3735 3736 vcpu->guest_fpu_loaded = 1; 3737 fx_save(&vcpu->arch.host_fx_image); 3738 fx_restore(&vcpu->arch.guest_fx_image); 3739 } 3740 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3741 3742 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 3743 { 3744 if (!vcpu->guest_fpu_loaded) 3745 return; 3746 3747 vcpu->guest_fpu_loaded = 0; 3748 fx_save(&vcpu->arch.guest_fx_image); 3749 fx_restore(&vcpu->arch.host_fx_image); 3750 ++vcpu->stat.fpu_reload; 3751 } 3752 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3753 3754 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 3755 { 3756 kvm_x86_ops->vcpu_free(vcpu); 3757 } 3758 3759 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 3760 unsigned int id) 3761 { 3762 return kvm_x86_ops->vcpu_create(kvm, id); 3763 } 3764 3765 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 3766 { 3767 int r; 3768 3769 /* We do fxsave: this must be aligned. */ 3770 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 3771 3772 vcpu_load(vcpu); 3773 r = kvm_arch_vcpu_reset(vcpu); 3774 if (r == 0) 3775 r = kvm_mmu_setup(vcpu); 3776 vcpu_put(vcpu); 3777 if (r < 0) 3778 goto free_vcpu; 3779 3780 return 0; 3781 free_vcpu: 3782 kvm_x86_ops->vcpu_free(vcpu); 3783 return r; 3784 } 3785 3786 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 3787 { 3788 vcpu_load(vcpu); 3789 kvm_mmu_unload(vcpu); 3790 vcpu_put(vcpu); 3791 3792 kvm_x86_ops->vcpu_free(vcpu); 3793 } 3794 3795 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 3796 { 3797 return kvm_x86_ops->vcpu_reset(vcpu); 3798 } 3799 3800 void kvm_arch_hardware_enable(void *garbage) 3801 { 3802 kvm_x86_ops->hardware_enable(garbage); 3803 } 3804 3805 void kvm_arch_hardware_disable(void *garbage) 3806 { 3807 kvm_x86_ops->hardware_disable(garbage); 3808 } 3809 3810 int kvm_arch_hardware_setup(void) 3811 { 3812 return kvm_x86_ops->hardware_setup(); 3813 } 3814 3815 void kvm_arch_hardware_unsetup(void) 3816 { 3817 kvm_x86_ops->hardware_unsetup(); 3818 } 3819 3820 void kvm_arch_check_processor_compat(void *rtn) 3821 { 3822 kvm_x86_ops->check_processor_compatibility(rtn); 3823 } 3824 3825 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 3826 { 3827 struct page *page; 3828 struct kvm *kvm; 3829 int r; 3830 3831 BUG_ON(vcpu->kvm == NULL); 3832 kvm = vcpu->kvm; 3833 3834 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3835 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 3836 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3837 else 3838 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 3839 3840 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3841 if (!page) { 3842 r = -ENOMEM; 3843 goto fail; 3844 } 3845 vcpu->arch.pio_data = page_address(page); 3846 3847 r = kvm_mmu_create(vcpu); 3848 if (r < 0) 3849 goto fail_free_pio_data; 3850 3851 if (irqchip_in_kernel(kvm)) { 3852 r = 
kvm_create_lapic(vcpu); 3853 if (r < 0) 3854 goto fail_mmu_destroy; 3855 } 3856 3857 return 0; 3858 3859 fail_mmu_destroy: 3860 kvm_mmu_destroy(vcpu); 3861 fail_free_pio_data: 3862 free_page((unsigned long)vcpu->arch.pio_data); 3863 fail: 3864 return r; 3865 } 3866 3867 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 3868 { 3869 kvm_free_lapic(vcpu); 3870 down_read(&vcpu->kvm->slots_lock); 3871 kvm_mmu_destroy(vcpu); 3872 up_read(&vcpu->kvm->slots_lock); 3873 free_page((unsigned long)vcpu->arch.pio_data); 3874 } 3875 3876 struct kvm *kvm_arch_create_vm(void) 3877 { 3878 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 3879 3880 if (!kvm) 3881 return ERR_PTR(-ENOMEM); 3882 3883 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 3884 3885 return kvm; 3886 } 3887 3888 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 3889 { 3890 vcpu_load(vcpu); 3891 kvm_mmu_unload(vcpu); 3892 vcpu_put(vcpu); 3893 } 3894 3895 static void kvm_free_vcpus(struct kvm *kvm) 3896 { 3897 unsigned int i; 3898 3899 /* 3900 * Unpin any mmu pages first. 3901 */ 3902 for (i = 0; i < KVM_MAX_VCPUS; ++i) 3903 if (kvm->vcpus[i]) 3904 kvm_unload_vcpu_mmu(kvm->vcpus[i]); 3905 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 3906 if (kvm->vcpus[i]) { 3907 kvm_arch_vcpu_free(kvm->vcpus[i]); 3908 kvm->vcpus[i] = NULL; 3909 } 3910 } 3911 3912 } 3913 3914 void kvm_arch_destroy_vm(struct kvm *kvm) 3915 { 3916 kvm_free_pit(kvm); 3917 kfree(kvm->arch.vpic); 3918 kfree(kvm->arch.vioapic); 3919 kvm_free_vcpus(kvm); 3920 kvm_free_physmem(kvm); 3921 if (kvm->arch.apic_access_page) 3922 put_page(kvm->arch.apic_access_page); 3923 if (kvm->arch.ept_identity_pagetable) 3924 put_page(kvm->arch.ept_identity_pagetable); 3925 kfree(kvm); 3926 } 3927 3928 int kvm_arch_set_memory_region(struct kvm *kvm, 3929 struct kvm_userspace_memory_region *mem, 3930 struct kvm_memory_slot old, 3931 int user_alloc) 3932 { 3933 int npages = mem->memory_size >> PAGE_SHIFT; 3934 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 3935 3936 /* To keep backward compatibility with older userspace, 3937 * x86 needs to handle the !user_alloc case. 
3938 */ 3939 if (!user_alloc) { 3940 if (npages && !old.rmap) { 3941 down_write(&current->mm->mmap_sem); 3942 memslot->userspace_addr = do_mmap(NULL, 0, 3943 npages * PAGE_SIZE, 3944 PROT_READ | PROT_WRITE, 3945 MAP_SHARED | MAP_ANONYMOUS, 3946 0); 3947 up_write(&current->mm->mmap_sem); 3948 3949 if (IS_ERR((void *)memslot->userspace_addr)) 3950 return PTR_ERR((void *)memslot->userspace_addr); 3951 } else { 3952 if (!old.user_alloc && old.rmap) { 3953 int ret; 3954 3955 down_write(&current->mm->mmap_sem); 3956 ret = do_munmap(current->mm, old.userspace_addr, 3957 old.npages * PAGE_SIZE); 3958 up_write(&current->mm->mmap_sem); 3959 if (ret < 0) 3960 printk(KERN_WARNING 3961 "kvm_vm_ioctl_set_memory_region: " 3962 "failed to munmap memory\n"); 3963 } 3964 } 3965 } 3966 3967 if (!kvm->arch.n_requested_mmu_pages) { 3968 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 3969 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 3970 } 3971 3972 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 3973 kvm_flush_remote_tlbs(kvm); 3974 3975 return 0; 3976 } 3977 3978 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 3979 { 3980 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 3981 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; 3982 } 3983 3984 static void vcpu_kick_intr(void *info) 3985 { 3986 #ifdef DEBUG 3987 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; 3988 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); 3989 #endif 3990 } 3991 3992 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 3993 { 3994 int ipi_pcpu = vcpu->cpu; 3995 int cpu = get_cpu(); 3996 3997 if (waitqueue_active(&vcpu->wq)) { 3998 wake_up_interruptible(&vcpu->wq); 3999 ++vcpu->stat.halt_wakeup; 4000 } 4001 /* 4002 * We may be called synchronously with irqs disabled in guest mode, 4003 * so there is no need to call smp_call_function_single() in that case. 4004 */ 4005 if (vcpu->guest_mode && vcpu->cpu != cpu) 4006 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 4007 put_cpu(); 4008 } 4009