/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity <avi@qumranet.com>
 *   Yaniv Kamay <yaniv@qumranet.com>
 *   Amit Shah <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/intel-iommu.h>

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS \
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE \
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/* EFER defaults:
 * - enable syscall per default because it is emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {		/* from ldt */
		u16 ldt_selector;

		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = d->base0 | ((unsigned long)d->base1 << 16) |
		((unsigned long)d->base2 << 24);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	/*
	 * The same field backs the APIC base whether or not the irqchip is
	 * emulated in the kernel, so no irqchip_in_kernel() check is needed.
	 */
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;
	if (vcpu->arch.exception.pending) {
		if (vcpu->arch.exception.nr == PF_VECTOR) {
			printk(KERN_DEBUG "kvm: inject_page_fault:"
			       " double fault 0x%lx\n", addr);
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
		}
		return;
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

static void __queue_exception(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
				     vcpu->arch.exception.has_error_code,
				     vcpu->arch.exception.error_code);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
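 *
 * cr3 bits 5..11 give the 32-byte-aligned offset of the group of four
 * PDPTEs within the page; a present entry is rejected if any of the
 * reserved bits in mask 0xfffffff0000001e6 are set.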
212 */ 213 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 214 { 215 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 216 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 217 int i; 218 int ret; 219 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 220 221 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 222 offset * sizeof(u64), sizeof(pdpte)); 223 if (ret < 0) { 224 ret = 0; 225 goto out; 226 } 227 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 228 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 229 ret = 0; 230 goto out; 231 } 232 } 233 ret = 1; 234 235 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 236 out: 237 238 return ret; 239 } 240 EXPORT_SYMBOL_GPL(load_pdptrs); 241 242 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 243 { 244 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 245 bool changed = true; 246 int r; 247 248 if (is_long_mode(vcpu) || !is_pae(vcpu)) 249 return false; 250 251 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 252 if (r < 0) 253 goto out; 254 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 255 out: 256 257 return changed; 258 } 259 260 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 261 { 262 if (cr0 & CR0_RESERVED_BITS) { 263 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 264 cr0, vcpu->arch.cr0); 265 kvm_inject_gp(vcpu, 0); 266 return; 267 } 268 269 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 270 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 271 kvm_inject_gp(vcpu, 0); 272 return; 273 } 274 275 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 276 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 277 "and a clear PE flag\n"); 278 kvm_inject_gp(vcpu, 0); 279 return; 280 } 281 282 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 283 #ifdef CONFIG_X86_64 284 if ((vcpu->arch.shadow_efer & EFER_LME)) { 285 int cs_db, cs_l; 286 287 if (!is_pae(vcpu)) { 288 printk(KERN_DEBUG "set_cr0: #GP, start paging " 289 "in long mode while PAE is disabled\n"); 290 kvm_inject_gp(vcpu, 0); 291 return; 292 } 293 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 294 if (cs_l) { 295 printk(KERN_DEBUG "set_cr0: #GP, start paging " 296 "in long mode while CS.L == 1\n"); 297 kvm_inject_gp(vcpu, 0); 298 return; 299 300 } 301 } else 302 #endif 303 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 304 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 305 "reserved bits\n"); 306 kvm_inject_gp(vcpu, 0); 307 return; 308 } 309 310 } 311 312 kvm_x86_ops->set_cr0(vcpu, cr0); 313 vcpu->arch.cr0 = cr0; 314 315 kvm_mmu_reset_context(vcpu); 316 return; 317 } 318 EXPORT_SYMBOL_GPL(kvm_set_cr0); 319 320 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 321 { 322 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 323 KVMTRACE_1D(LMSW, vcpu, 324 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 325 handler); 326 } 327 EXPORT_SYMBOL_GPL(kvm_lmsw); 328 329 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 330 { 331 if (cr4 & CR4_RESERVED_BITS) { 332 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 333 kvm_inject_gp(vcpu, 0); 334 return; 335 } 336 337 if (is_long_mode(vcpu)) { 338 if (!(cr4 & X86_CR4_PAE)) { 339 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 340 "in long mode\n"); 341 kvm_inject_gp(vcpu, 0); 342 return; 343 } 344 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 345 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 346 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 347 kvm_inject_gp(vcpu, 0); 348 return; 349 } 350 351 if (cr4 & 
X86_CR4_VMXE) { 352 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 353 kvm_inject_gp(vcpu, 0); 354 return; 355 } 356 kvm_x86_ops->set_cr4(vcpu, cr4); 357 vcpu->arch.cr4 = cr4; 358 kvm_mmu_reset_context(vcpu); 359 } 360 EXPORT_SYMBOL_GPL(kvm_set_cr4); 361 362 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 363 { 364 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 365 kvm_mmu_sync_roots(vcpu); 366 kvm_mmu_flush_tlb(vcpu); 367 return; 368 } 369 370 if (is_long_mode(vcpu)) { 371 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 372 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 373 kvm_inject_gp(vcpu, 0); 374 return; 375 } 376 } else { 377 if (is_pae(vcpu)) { 378 if (cr3 & CR3_PAE_RESERVED_BITS) { 379 printk(KERN_DEBUG 380 "set_cr3: #GP, reserved bits\n"); 381 kvm_inject_gp(vcpu, 0); 382 return; 383 } 384 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 385 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 386 "reserved bits\n"); 387 kvm_inject_gp(vcpu, 0); 388 return; 389 } 390 } 391 /* 392 * We don't check reserved bits in nonpae mode, because 393 * this isn't enforced, and VMware depends on this. 394 */ 395 } 396 397 /* 398 * Does the new cr3 value map to physical memory? (Note, we 399 * catch an invalid cr3 even in real-mode, because it would 400 * cause trouble later on when we turn on paging anyway.) 401 * 402 * A real CPU would silently accept an invalid cr3 and would 403 * attempt to use it - with largely undefined (and often hard 404 * to debug) behavior on the guest side. 405 */ 406 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 407 kvm_inject_gp(vcpu, 0); 408 else { 409 vcpu->arch.cr3 = cr3; 410 vcpu->arch.mmu.new_cr3(vcpu); 411 } 412 } 413 EXPORT_SYMBOL_GPL(kvm_set_cr3); 414 415 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 416 { 417 if (cr8 & CR8_RESERVED_BITS) { 418 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 419 kvm_inject_gp(vcpu, 0); 420 return; 421 } 422 if (irqchip_in_kernel(vcpu->kvm)) 423 kvm_lapic_set_tpr(vcpu, cr8); 424 else 425 vcpu->arch.cr8 = cr8; 426 } 427 EXPORT_SYMBOL_GPL(kvm_set_cr8); 428 429 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 430 { 431 if (irqchip_in_kernel(vcpu->kvm)) 432 return kvm_lapic_get_cr8(vcpu); 433 else 434 return vcpu->arch.cr8; 435 } 436 EXPORT_SYMBOL_GPL(kvm_get_cr8); 437 438 /* 439 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 440 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 441 * 442 * This list is modified at module load time to reflect the 443 * capabilities of the host cpu. 
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_PERF_STATUS,
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	static int version;
	struct pvclock_wall_clock wc;
	struct timespec now, sys, boot;

	if (!wall_clock)
		return;

	version++;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_write_guest_time below) to the
	 * wall clock specified here.  Guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
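	 *
	 * The version field is used like a seqcount: it is bumped to an
	 * odd value before the wall clock data is written and to an even
	 * value afterwards, so the guest can detect and retry a torn read.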
528 */ 529 now = current_kernel_time(); 530 ktime_get_ts(&sys); 531 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 532 533 wc.sec = boot.tv_sec; 534 wc.nsec = boot.tv_nsec; 535 wc.version = version; 536 537 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 538 539 version++; 540 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 541 } 542 543 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 544 { 545 uint32_t quotient, remainder; 546 547 /* Don't try to replace with do_div(), this one calculates 548 * "(dividend << 32) / divisor" */ 549 __asm__ ( "divl %4" 550 : "=a" (quotient), "=d" (remainder) 551 : "0" (0), "1" (dividend), "r" (divisor) ); 552 return quotient; 553 } 554 555 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 556 { 557 uint64_t nsecs = 1000000000LL; 558 int32_t shift = 0; 559 uint64_t tps64; 560 uint32_t tps32; 561 562 tps64 = tsc_khz * 1000LL; 563 while (tps64 > nsecs*2) { 564 tps64 >>= 1; 565 shift--; 566 } 567 568 tps32 = (uint32_t)tps64; 569 while (tps32 <= (uint32_t)nsecs) { 570 tps32 <<= 1; 571 shift++; 572 } 573 574 hv_clock->tsc_shift = shift; 575 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 576 577 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 578 __func__, tsc_khz, hv_clock->tsc_shift, 579 hv_clock->tsc_to_system_mul); 580 } 581 582 static void kvm_write_guest_time(struct kvm_vcpu *v) 583 { 584 struct timespec ts; 585 unsigned long flags; 586 struct kvm_vcpu_arch *vcpu = &v->arch; 587 void *shared_kaddr; 588 589 if ((!vcpu->time_page)) 590 return; 591 592 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { 593 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); 594 vcpu->hv_clock_tsc_khz = tsc_khz; 595 } 596 597 /* Keep irq disabled to prevent changes to the clock */ 598 local_irq_save(flags); 599 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 600 &vcpu->hv_clock.tsc_timestamp); 601 ktime_get_ts(&ts); 602 local_irq_restore(flags); 603 604 /* With all the info we got, fill in the values */ 605 606 vcpu->hv_clock.system_time = ts.tv_nsec + 607 (NSEC_PER_SEC * (u64)ts.tv_sec); 608 /* 609 * The interface expects us to write an even number signaling that the 610 * update is finished. Since the guest won't see the intermediate 611 * state, we just increase by 2 at the end. 612 */ 613 vcpu->hv_clock.version += 2; 614 615 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 616 617 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 618 sizeof(vcpu->hv_clock)); 619 620 kunmap_atomic(shared_kaddr, KM_USER0); 621 622 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 623 } 624 625 static bool msr_mtrr_valid(unsigned msr) 626 { 627 switch (msr) { 628 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 629 case MSR_MTRRfix64K_00000: 630 case MSR_MTRRfix16K_80000: 631 case MSR_MTRRfix16K_A0000: 632 case MSR_MTRRfix4K_C0000: 633 case MSR_MTRRfix4K_C8000: 634 case MSR_MTRRfix4K_D0000: 635 case MSR_MTRRfix4K_D8000: 636 case MSR_MTRRfix4K_E0000: 637 case MSR_MTRRfix4K_E8000: 638 case MSR_MTRRfix4K_F0000: 639 case MSR_MTRRfix4K_F8000: 640 case MSR_MTRRdefType: 641 case MSR_IA32_CR_PAT: 642 return true; 643 case 0x2f8: 644 return true; 645 } 646 return false; 647 } 648 649 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 650 { 651 if (!msr_mtrr_valid(msr)) 652 return 1; 653 654 vcpu->arch.mtrr[msr - 0x200] = data; 655 return 0; 656 } 657 658 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 659 { 660 switch (msr) { 661 case MSR_EFER: 662 set_efer(vcpu, data); 663 break; 664 case MSR_IA32_MC0_STATUS: 665 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 666 __func__, data); 667 break; 668 case MSR_IA32_MCG_STATUS: 669 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 670 __func__, data); 671 break; 672 case MSR_IA32_MCG_CTL: 673 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 674 __func__, data); 675 break; 676 case MSR_IA32_DEBUGCTLMSR: 677 if (!data) { 678 /* We support the non-activated case already */ 679 break; 680 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 681 /* Values other than LBR and BTF are vendor-specific, 682 thus reserved and should throw a #GP */ 683 return 1; 684 } 685 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 686 __func__, data); 687 break; 688 case MSR_IA32_UCODE_REV: 689 case MSR_IA32_UCODE_WRITE: 690 break; 691 case 0x200 ... 0x2ff: 692 return set_msr_mtrr(vcpu, msr, data); 693 case MSR_IA32_APICBASE: 694 kvm_set_apic_base(vcpu, data); 695 break; 696 case MSR_IA32_MISC_ENABLE: 697 vcpu->arch.ia32_misc_enable_msr = data; 698 break; 699 case MSR_KVM_WALL_CLOCK: 700 vcpu->kvm->arch.wall_clock = data; 701 kvm_write_wall_clock(vcpu->kvm, data); 702 break; 703 case MSR_KVM_SYSTEM_TIME: { 704 if (vcpu->arch.time_page) { 705 kvm_release_page_dirty(vcpu->arch.time_page); 706 vcpu->arch.time_page = NULL; 707 } 708 709 vcpu->arch.time = data; 710 711 /* we verify if the enable bit is set... */ 712 if (!(data & 1)) 713 break; 714 715 /* ...but clean it before doing the actual write */ 716 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 717 718 vcpu->arch.time_page = 719 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 720 721 if (is_error_page(vcpu->arch.time_page)) { 722 kvm_release_page_clean(vcpu->arch.time_page); 723 vcpu->arch.time_page = NULL; 724 } 725 726 kvm_write_guest_time(vcpu); 727 break; 728 } 729 default: 730 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 731 return 1; 732 } 733 return 0; 734 } 735 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 736 737 738 /* 739 * Reads an msr value (of 'msr_index') into 'pdata'. 740 * Returns 0 on success, non-0 otherwise. 741 * Assumes vcpu_load() was already called. 
742 */ 743 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 744 { 745 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 746 } 747 748 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 749 { 750 if (!msr_mtrr_valid(msr)) 751 return 1; 752 753 *pdata = vcpu->arch.mtrr[msr - 0x200]; 754 return 0; 755 } 756 757 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 758 { 759 u64 data; 760 761 switch (msr) { 762 case 0xc0010010: /* SYSCFG */ 763 case 0xc0010015: /* HWCR */ 764 case MSR_IA32_PLATFORM_ID: 765 case MSR_IA32_P5_MC_ADDR: 766 case MSR_IA32_P5_MC_TYPE: 767 case MSR_IA32_MC0_CTL: 768 case MSR_IA32_MCG_STATUS: 769 case MSR_IA32_MCG_CAP: 770 case MSR_IA32_MCG_CTL: 771 case MSR_IA32_MC0_MISC: 772 case MSR_IA32_MC0_MISC+4: 773 case MSR_IA32_MC0_MISC+8: 774 case MSR_IA32_MC0_MISC+12: 775 case MSR_IA32_MC0_MISC+16: 776 case MSR_IA32_MC0_MISC+20: 777 case MSR_IA32_UCODE_REV: 778 case MSR_IA32_EBL_CR_POWERON: 779 case MSR_IA32_DEBUGCTLMSR: 780 case MSR_IA32_LASTBRANCHFROMIP: 781 case MSR_IA32_LASTBRANCHTOIP: 782 case MSR_IA32_LASTINTFROMIP: 783 case MSR_IA32_LASTINTTOIP: 784 data = 0; 785 break; 786 case MSR_MTRRcap: 787 data = 0x500 | KVM_NR_VAR_MTRR; 788 break; 789 case 0x200 ... 0x2ff: 790 return get_msr_mtrr(vcpu, msr, pdata); 791 case 0xcd: /* fsb frequency */ 792 data = 3; 793 break; 794 case MSR_IA32_APICBASE: 795 data = kvm_get_apic_base(vcpu); 796 break; 797 case MSR_IA32_MISC_ENABLE: 798 data = vcpu->arch.ia32_misc_enable_msr; 799 break; 800 case MSR_IA32_PERF_STATUS: 801 /* TSC increment by tick */ 802 data = 1000ULL; 803 /* CPU multiplier */ 804 data |= (((uint64_t)4ULL) << 40); 805 break; 806 case MSR_EFER: 807 data = vcpu->arch.shadow_efer; 808 break; 809 case MSR_KVM_WALL_CLOCK: 810 data = vcpu->kvm->arch.wall_clock; 811 break; 812 case MSR_KVM_SYSTEM_TIME: 813 data = vcpu->arch.time; 814 break; 815 default: 816 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 817 return 1; 818 } 819 *pdata = data; 820 return 0; 821 } 822 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 823 824 /* 825 * Read or write a bunch of msrs. All parameters are kernel addresses. 826 * 827 * @return number of msrs set successfully. 828 */ 829 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 830 struct kvm_msr_entry *entries, 831 int (*do_msr)(struct kvm_vcpu *vcpu, 832 unsigned index, u64 *data)) 833 { 834 int i; 835 836 vcpu_load(vcpu); 837 838 down_read(&vcpu->kvm->slots_lock); 839 for (i = 0; i < msrs->nmsrs; ++i) 840 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 841 break; 842 up_read(&vcpu->kvm->slots_lock); 843 844 vcpu_put(vcpu); 845 846 return i; 847 } 848 849 /* 850 * Read or write a bunch of msrs. Parameters are user addresses. 851 * 852 * @return number of msrs set successfully. 
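 * A negative return value is an errno from the copy/allocation steps; a
 * short count means the do_msr callback rejected an entry, and the count
 * is the number of MSRs handled successfully before that point.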
853 */ 854 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 855 int (*do_msr)(struct kvm_vcpu *vcpu, 856 unsigned index, u64 *data), 857 int writeback) 858 { 859 struct kvm_msrs msrs; 860 struct kvm_msr_entry *entries; 861 int r, n; 862 unsigned size; 863 864 r = -EFAULT; 865 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 866 goto out; 867 868 r = -E2BIG; 869 if (msrs.nmsrs >= MAX_IO_MSRS) 870 goto out; 871 872 r = -ENOMEM; 873 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 874 entries = vmalloc(size); 875 if (!entries) 876 goto out; 877 878 r = -EFAULT; 879 if (copy_from_user(entries, user_msrs->entries, size)) 880 goto out_free; 881 882 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 883 if (r < 0) 884 goto out_free; 885 886 r = -EFAULT; 887 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 888 goto out_free; 889 890 r = n; 891 892 out_free: 893 vfree(entries); 894 out: 895 return r; 896 } 897 898 int kvm_dev_ioctl_check_extension(long ext) 899 { 900 int r; 901 902 switch (ext) { 903 case KVM_CAP_IRQCHIP: 904 case KVM_CAP_HLT: 905 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 906 case KVM_CAP_USER_MEMORY: 907 case KVM_CAP_SET_TSS_ADDR: 908 case KVM_CAP_EXT_CPUID: 909 case KVM_CAP_CLOCKSOURCE: 910 case KVM_CAP_PIT: 911 case KVM_CAP_NOP_IO_DELAY: 912 case KVM_CAP_MP_STATE: 913 case KVM_CAP_SYNC_MMU: 914 r = 1; 915 break; 916 case KVM_CAP_COALESCED_MMIO: 917 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 918 break; 919 case KVM_CAP_VAPIC: 920 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 921 break; 922 case KVM_CAP_NR_VCPUS: 923 r = KVM_MAX_VCPUS; 924 break; 925 case KVM_CAP_NR_MEMSLOTS: 926 r = KVM_MEMORY_SLOTS; 927 break; 928 case KVM_CAP_PV_MMU: 929 r = !tdp_enabled; 930 break; 931 case KVM_CAP_IOMMU: 932 r = intel_iommu_found(); 933 break; 934 default: 935 r = 0; 936 break; 937 } 938 return r; 939 940 } 941 942 long kvm_arch_dev_ioctl(struct file *filp, 943 unsigned int ioctl, unsigned long arg) 944 { 945 void __user *argp = (void __user *)arg; 946 long r; 947 948 switch (ioctl) { 949 case KVM_GET_MSR_INDEX_LIST: { 950 struct kvm_msr_list __user *user_msr_list = argp; 951 struct kvm_msr_list msr_list; 952 unsigned n; 953 954 r = -EFAULT; 955 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 956 goto out; 957 n = msr_list.nmsrs; 958 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 959 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 960 goto out; 961 r = -E2BIG; 962 if (n < num_msrs_to_save) 963 goto out; 964 r = -EFAULT; 965 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 966 num_msrs_to_save * sizeof(u32))) 967 goto out; 968 if (copy_to_user(user_msr_list->indices 969 + num_msrs_to_save * sizeof(u32), 970 &emulated_msrs, 971 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 972 goto out; 973 r = 0; 974 break; 975 } 976 case KVM_GET_SUPPORTED_CPUID: { 977 struct kvm_cpuid2 __user *cpuid_arg = argp; 978 struct kvm_cpuid2 cpuid; 979 980 r = -EFAULT; 981 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 982 goto out; 983 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 984 cpuid_arg->entries); 985 if (r) 986 goto out; 987 988 r = -EFAULT; 989 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 990 goto out; 991 r = 0; 992 break; 993 } 994 default: 995 r = -EINVAL; 996 } 997 out: 998 return r; 999 } 1000 1001 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1002 { 1003 kvm_x86_ops->vcpu_load(vcpu, cpu); 1004 kvm_write_guest_time(vcpu); 1005 } 1006 1007 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1008 { 1009 
kvm_x86_ops->vcpu_put(vcpu); 1010 kvm_put_guest_fpu(vcpu); 1011 } 1012 1013 static int is_efer_nx(void) 1014 { 1015 u64 efer; 1016 1017 rdmsrl(MSR_EFER, efer); 1018 return efer & EFER_NX; 1019 } 1020 1021 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1022 { 1023 int i; 1024 struct kvm_cpuid_entry2 *e, *entry; 1025 1026 entry = NULL; 1027 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1028 e = &vcpu->arch.cpuid_entries[i]; 1029 if (e->function == 0x80000001) { 1030 entry = e; 1031 break; 1032 } 1033 } 1034 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1035 entry->edx &= ~(1 << 20); 1036 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1037 } 1038 } 1039 1040 /* when an old userspace process fills a new kernel module */ 1041 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1042 struct kvm_cpuid *cpuid, 1043 struct kvm_cpuid_entry __user *entries) 1044 { 1045 int r, i; 1046 struct kvm_cpuid_entry *cpuid_entries; 1047 1048 r = -E2BIG; 1049 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1050 goto out; 1051 r = -ENOMEM; 1052 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1053 if (!cpuid_entries) 1054 goto out; 1055 r = -EFAULT; 1056 if (copy_from_user(cpuid_entries, entries, 1057 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1058 goto out_free; 1059 for (i = 0; i < cpuid->nent; i++) { 1060 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1061 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1062 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1063 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1064 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1065 vcpu->arch.cpuid_entries[i].index = 0; 1066 vcpu->arch.cpuid_entries[i].flags = 0; 1067 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1068 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1069 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1070 } 1071 vcpu->arch.cpuid_nent = cpuid->nent; 1072 cpuid_fix_nx_cap(vcpu); 1073 r = 0; 1074 1075 out_free: 1076 vfree(cpuid_entries); 1077 out: 1078 return r; 1079 } 1080 1081 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1082 struct kvm_cpuid2 *cpuid, 1083 struct kvm_cpuid_entry2 __user *entries) 1084 { 1085 int r; 1086 1087 r = -E2BIG; 1088 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1089 goto out; 1090 r = -EFAULT; 1091 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1092 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1093 goto out; 1094 vcpu->arch.cpuid_nent = cpuid->nent; 1095 return 0; 1096 1097 out: 1098 return r; 1099 } 1100 1101 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1102 struct kvm_cpuid2 *cpuid, 1103 struct kvm_cpuid_entry2 __user *entries) 1104 { 1105 int r; 1106 1107 r = -E2BIG; 1108 if (cpuid->nent < vcpu->arch.cpuid_nent) 1109 goto out; 1110 r = -EFAULT; 1111 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1112 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1113 goto out; 1114 return 0; 1115 1116 out: 1117 cpuid->nent = vcpu->arch.cpuid_nent; 1118 return r; 1119 } 1120 1121 static inline u32 bit(int bitno) 1122 { 1123 return 1 << (bitno & 31); 1124 } 1125 1126 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1127 u32 index) 1128 { 1129 entry->function = function; 1130 entry->index = index; 1131 cpuid_count(entry->function, entry->index, 1132 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1133 entry->flags = 0; 1134 } 1135 1136 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1137 u32 index, int 
*nent, int maxnent) 1138 { 1139 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1140 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1141 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1142 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1143 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1144 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | 1145 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1146 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | 1147 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | 1148 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); 1149 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | 1150 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1151 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1152 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1153 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1154 bit(X86_FEATURE_PGE) | 1155 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1156 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | 1157 bit(X86_FEATURE_SYSCALL) | 1158 (bit(X86_FEATURE_NX) && is_efer_nx()) | 1159 #ifdef CONFIG_X86_64 1160 bit(X86_FEATURE_LM) | 1161 #endif 1162 bit(X86_FEATURE_MMXEXT) | 1163 bit(X86_FEATURE_3DNOWEXT) | 1164 bit(X86_FEATURE_3DNOW); 1165 const u32 kvm_supported_word3_x86_features = 1166 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1167 const u32 kvm_supported_word6_x86_features = 1168 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); 1169 1170 /* all func 2 cpuid_count() should be called on the same cpu */ 1171 get_cpu(); 1172 do_cpuid_1_ent(entry, function, index); 1173 ++*nent; 1174 1175 switch (function) { 1176 case 0: 1177 entry->eax = min(entry->eax, (u32)0xb); 1178 break; 1179 case 1: 1180 entry->edx &= kvm_supported_word0_x86_features; 1181 entry->ecx &= kvm_supported_word3_x86_features; 1182 break; 1183 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1184 * may return different values. This forces us to get_cpu() before 1185 * issuing the first command, and also to emulate this annoying behavior 1186 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1187 case 2: { 1188 int t, times = entry->eax & 0xff; 1189 1190 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1191 for (t = 1; t < times && *nent < maxnent; ++t) { 1192 do_cpuid_1_ent(&entry[t], function, 0); 1193 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1194 ++*nent; 1195 } 1196 break; 1197 } 1198 /* function 4 and 0xb have additional index. 
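	 * Such leaves are marked KVM_CPUID_FLAG_SIGNIFCANT_INDEX and are
	 * enumerated one sub-leaf at a time until the terminating value is
	 * read back (cache type 0 for leaf 4, level type 0 for leaf 0xb).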
*/ 1199 case 4: { 1200 int i, cache_type; 1201 1202 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1203 /* read more entries until cache_type is zero */ 1204 for (i = 1; *nent < maxnent; ++i) { 1205 cache_type = entry[i - 1].eax & 0x1f; 1206 if (!cache_type) 1207 break; 1208 do_cpuid_1_ent(&entry[i], function, i); 1209 entry[i].flags |= 1210 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1211 ++*nent; 1212 } 1213 break; 1214 } 1215 case 0xb: { 1216 int i, level_type; 1217 1218 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1219 /* read more entries until level_type is zero */ 1220 for (i = 1; *nent < maxnent; ++i) { 1221 level_type = entry[i - 1].ecx & 0xff; 1222 if (!level_type) 1223 break; 1224 do_cpuid_1_ent(&entry[i], function, i); 1225 entry[i].flags |= 1226 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1227 ++*nent; 1228 } 1229 break; 1230 } 1231 case 0x80000000: 1232 entry->eax = min(entry->eax, 0x8000001a); 1233 break; 1234 case 0x80000001: 1235 entry->edx &= kvm_supported_word1_x86_features; 1236 entry->ecx &= kvm_supported_word6_x86_features; 1237 break; 1238 } 1239 put_cpu(); 1240 } 1241 1242 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1243 struct kvm_cpuid_entry2 __user *entries) 1244 { 1245 struct kvm_cpuid_entry2 *cpuid_entries; 1246 int limit, nent = 0, r = -E2BIG; 1247 u32 func; 1248 1249 if (cpuid->nent < 1) 1250 goto out; 1251 r = -ENOMEM; 1252 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1253 if (!cpuid_entries) 1254 goto out; 1255 1256 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1257 limit = cpuid_entries[0].eax; 1258 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1259 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1260 &nent, cpuid->nent); 1261 r = -E2BIG; 1262 if (nent >= cpuid->nent) 1263 goto out_free; 1264 1265 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1266 limit = cpuid_entries[nent - 1].eax; 1267 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1268 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1269 &nent, cpuid->nent); 1270 r = -EFAULT; 1271 if (copy_to_user(entries, cpuid_entries, 1272 nent * sizeof(struct kvm_cpuid_entry2))) 1273 goto out_free; 1274 cpuid->nent = nent; 1275 r = 0; 1276 1277 out_free: 1278 vfree(cpuid_entries); 1279 out: 1280 return r; 1281 } 1282 1283 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1284 struct kvm_lapic_state *s) 1285 { 1286 vcpu_load(vcpu); 1287 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1288 vcpu_put(vcpu); 1289 1290 return 0; 1291 } 1292 1293 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1294 struct kvm_lapic_state *s) 1295 { 1296 vcpu_load(vcpu); 1297 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1298 kvm_apic_post_state_restore(vcpu); 1299 vcpu_put(vcpu); 1300 1301 return 0; 1302 } 1303 1304 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1305 struct kvm_interrupt *irq) 1306 { 1307 if (irq->irq < 0 || irq->irq >= 256) 1308 return -EINVAL; 1309 if (irqchip_in_kernel(vcpu->kvm)) 1310 return -ENXIO; 1311 vcpu_load(vcpu); 1312 1313 set_bit(irq->irq, vcpu->arch.irq_pending); 1314 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 1315 1316 vcpu_put(vcpu); 1317 1318 return 0; 1319 } 1320 1321 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1322 struct kvm_tpr_access_ctl *tac) 1323 { 1324 if (tac->flags) 1325 return -EINVAL; 1326 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1327 return 0; 1328 } 1329 1330 long kvm_arch_vcpu_ioctl(struct file *filp, 
1331 unsigned int ioctl, unsigned long arg) 1332 { 1333 struct kvm_vcpu *vcpu = filp->private_data; 1334 void __user *argp = (void __user *)arg; 1335 int r; 1336 struct kvm_lapic_state *lapic = NULL; 1337 1338 switch (ioctl) { 1339 case KVM_GET_LAPIC: { 1340 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1341 1342 r = -ENOMEM; 1343 if (!lapic) 1344 goto out; 1345 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1346 if (r) 1347 goto out; 1348 r = -EFAULT; 1349 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1350 goto out; 1351 r = 0; 1352 break; 1353 } 1354 case KVM_SET_LAPIC: { 1355 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1356 r = -ENOMEM; 1357 if (!lapic) 1358 goto out; 1359 r = -EFAULT; 1360 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1361 goto out; 1362 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1363 if (r) 1364 goto out; 1365 r = 0; 1366 break; 1367 } 1368 case KVM_INTERRUPT: { 1369 struct kvm_interrupt irq; 1370 1371 r = -EFAULT; 1372 if (copy_from_user(&irq, argp, sizeof irq)) 1373 goto out; 1374 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1375 if (r) 1376 goto out; 1377 r = 0; 1378 break; 1379 } 1380 case KVM_SET_CPUID: { 1381 struct kvm_cpuid __user *cpuid_arg = argp; 1382 struct kvm_cpuid cpuid; 1383 1384 r = -EFAULT; 1385 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1386 goto out; 1387 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1388 if (r) 1389 goto out; 1390 break; 1391 } 1392 case KVM_SET_CPUID2: { 1393 struct kvm_cpuid2 __user *cpuid_arg = argp; 1394 struct kvm_cpuid2 cpuid; 1395 1396 r = -EFAULT; 1397 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1398 goto out; 1399 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1400 cpuid_arg->entries); 1401 if (r) 1402 goto out; 1403 break; 1404 } 1405 case KVM_GET_CPUID2: { 1406 struct kvm_cpuid2 __user *cpuid_arg = argp; 1407 struct kvm_cpuid2 cpuid; 1408 1409 r = -EFAULT; 1410 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1411 goto out; 1412 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1413 cpuid_arg->entries); 1414 if (r) 1415 goto out; 1416 r = -EFAULT; 1417 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1418 goto out; 1419 r = 0; 1420 break; 1421 } 1422 case KVM_GET_MSRS: 1423 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1424 break; 1425 case KVM_SET_MSRS: 1426 r = msr_io(vcpu, argp, do_set_msr, 0); 1427 break; 1428 case KVM_TPR_ACCESS_REPORTING: { 1429 struct kvm_tpr_access_ctl tac; 1430 1431 r = -EFAULT; 1432 if (copy_from_user(&tac, argp, sizeof tac)) 1433 goto out; 1434 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1435 if (r) 1436 goto out; 1437 r = -EFAULT; 1438 if (copy_to_user(argp, &tac, sizeof tac)) 1439 goto out; 1440 r = 0; 1441 break; 1442 }; 1443 case KVM_SET_VAPIC_ADDR: { 1444 struct kvm_vapic_addr va; 1445 1446 r = -EINVAL; 1447 if (!irqchip_in_kernel(vcpu->kvm)) 1448 goto out; 1449 r = -EFAULT; 1450 if (copy_from_user(&va, argp, sizeof va)) 1451 goto out; 1452 r = 0; 1453 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1454 break; 1455 } 1456 default: 1457 r = -EINVAL; 1458 } 1459 out: 1460 if (lapic) 1461 kfree(lapic); 1462 return r; 1463 } 1464 1465 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1466 { 1467 int ret; 1468 1469 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1470 return -1; 1471 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1472 return ret; 1473 } 1474 1475 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1476 u32 kvm_nr_mmu_pages) 1477 { 1478 if (kvm_nr_mmu_pages < 
KVM_MIN_ALLOC_MMU_PAGES) 1479 return -EINVAL; 1480 1481 down_write(&kvm->slots_lock); 1482 1483 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1484 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1485 1486 up_write(&kvm->slots_lock); 1487 return 0; 1488 } 1489 1490 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1491 { 1492 return kvm->arch.n_alloc_mmu_pages; 1493 } 1494 1495 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1496 { 1497 int i; 1498 struct kvm_mem_alias *alias; 1499 1500 for (i = 0; i < kvm->arch.naliases; ++i) { 1501 alias = &kvm->arch.aliases[i]; 1502 if (gfn >= alias->base_gfn 1503 && gfn < alias->base_gfn + alias->npages) 1504 return alias->target_gfn + gfn - alias->base_gfn; 1505 } 1506 return gfn; 1507 } 1508 1509 /* 1510 * Set a new alias region. Aliases map a portion of physical memory into 1511 * another portion. This is useful for memory windows, for example the PC 1512 * VGA region. 1513 */ 1514 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1515 struct kvm_memory_alias *alias) 1516 { 1517 int r, n; 1518 struct kvm_mem_alias *p; 1519 1520 r = -EINVAL; 1521 /* General sanity checks */ 1522 if (alias->memory_size & (PAGE_SIZE - 1)) 1523 goto out; 1524 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1525 goto out; 1526 if (alias->slot >= KVM_ALIAS_SLOTS) 1527 goto out; 1528 if (alias->guest_phys_addr + alias->memory_size 1529 < alias->guest_phys_addr) 1530 goto out; 1531 if (alias->target_phys_addr + alias->memory_size 1532 < alias->target_phys_addr) 1533 goto out; 1534 1535 down_write(&kvm->slots_lock); 1536 spin_lock(&kvm->mmu_lock); 1537 1538 p = &kvm->arch.aliases[alias->slot]; 1539 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1540 p->npages = alias->memory_size >> PAGE_SHIFT; 1541 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1542 1543 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1544 if (kvm->arch.aliases[n - 1].npages) 1545 break; 1546 kvm->arch.naliases = n; 1547 1548 spin_unlock(&kvm->mmu_lock); 1549 kvm_mmu_zap_all(kvm); 1550 1551 up_write(&kvm->slots_lock); 1552 1553 return 0; 1554 1555 out: 1556 return r; 1557 } 1558 1559 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1560 { 1561 int r; 1562 1563 r = 0; 1564 switch (chip->chip_id) { 1565 case KVM_IRQCHIP_PIC_MASTER: 1566 memcpy(&chip->chip.pic, 1567 &pic_irqchip(kvm)->pics[0], 1568 sizeof(struct kvm_pic_state)); 1569 break; 1570 case KVM_IRQCHIP_PIC_SLAVE: 1571 memcpy(&chip->chip.pic, 1572 &pic_irqchip(kvm)->pics[1], 1573 sizeof(struct kvm_pic_state)); 1574 break; 1575 case KVM_IRQCHIP_IOAPIC: 1576 memcpy(&chip->chip.ioapic, 1577 ioapic_irqchip(kvm), 1578 sizeof(struct kvm_ioapic_state)); 1579 break; 1580 default: 1581 r = -EINVAL; 1582 break; 1583 } 1584 return r; 1585 } 1586 1587 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1588 { 1589 int r; 1590 1591 r = 0; 1592 switch (chip->chip_id) { 1593 case KVM_IRQCHIP_PIC_MASTER: 1594 memcpy(&pic_irqchip(kvm)->pics[0], 1595 &chip->chip.pic, 1596 sizeof(struct kvm_pic_state)); 1597 break; 1598 case KVM_IRQCHIP_PIC_SLAVE: 1599 memcpy(&pic_irqchip(kvm)->pics[1], 1600 &chip->chip.pic, 1601 sizeof(struct kvm_pic_state)); 1602 break; 1603 case KVM_IRQCHIP_IOAPIC: 1604 memcpy(ioapic_irqchip(kvm), 1605 &chip->chip.ioapic, 1606 sizeof(struct kvm_ioapic_state)); 1607 break; 1608 default: 1609 r = -EINVAL; 1610 break; 1611 } 1612 kvm_pic_update_irq(pic_irqchip(kvm)); 1613 return r; 1614 } 1615 1616 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1617 
{ 1618 int r = 0; 1619 1620 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1621 return r; 1622 } 1623 1624 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1625 { 1626 int r = 0; 1627 1628 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1629 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1630 return r; 1631 } 1632 1633 /* 1634 * Get (and clear) the dirty memory log for a memory slot. 1635 */ 1636 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1637 struct kvm_dirty_log *log) 1638 { 1639 int r; 1640 int n; 1641 struct kvm_memory_slot *memslot; 1642 int is_dirty = 0; 1643 1644 down_write(&kvm->slots_lock); 1645 1646 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1647 if (r) 1648 goto out; 1649 1650 /* If nothing is dirty, don't bother messing with page tables. */ 1651 if (is_dirty) { 1652 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1653 kvm_flush_remote_tlbs(kvm); 1654 memslot = &kvm->memslots[log->slot]; 1655 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1656 memset(memslot->dirty_bitmap, 0, n); 1657 } 1658 r = 0; 1659 out: 1660 up_write(&kvm->slots_lock); 1661 return r; 1662 } 1663 1664 long kvm_arch_vm_ioctl(struct file *filp, 1665 unsigned int ioctl, unsigned long arg) 1666 { 1667 struct kvm *kvm = filp->private_data; 1668 void __user *argp = (void __user *)arg; 1669 int r = -EINVAL; 1670 /* 1671 * This union makes it completely explicit to gcc-3.x 1672 * that these two variables' stack usage should be 1673 * combined, not added together. 1674 */ 1675 union { 1676 struct kvm_pit_state ps; 1677 struct kvm_memory_alias alias; 1678 } u; 1679 1680 switch (ioctl) { 1681 case KVM_SET_TSS_ADDR: 1682 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1683 if (r < 0) 1684 goto out; 1685 break; 1686 case KVM_SET_MEMORY_REGION: { 1687 struct kvm_memory_region kvm_mem; 1688 struct kvm_userspace_memory_region kvm_userspace_mem; 1689 1690 r = -EFAULT; 1691 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1692 goto out; 1693 kvm_userspace_mem.slot = kvm_mem.slot; 1694 kvm_userspace_mem.flags = kvm_mem.flags; 1695 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1696 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1697 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1698 if (r) 1699 goto out; 1700 break; 1701 } 1702 case KVM_SET_NR_MMU_PAGES: 1703 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1704 if (r) 1705 goto out; 1706 break; 1707 case KVM_GET_NR_MMU_PAGES: 1708 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1709 break; 1710 case KVM_SET_MEMORY_ALIAS: 1711 r = -EFAULT; 1712 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 1713 goto out; 1714 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 1715 if (r) 1716 goto out; 1717 break; 1718 case KVM_CREATE_IRQCHIP: 1719 r = -ENOMEM; 1720 kvm->arch.vpic = kvm_create_pic(kvm); 1721 if (kvm->arch.vpic) { 1722 r = kvm_ioapic_init(kvm); 1723 if (r) { 1724 kfree(kvm->arch.vpic); 1725 kvm->arch.vpic = NULL; 1726 goto out; 1727 } 1728 } else 1729 goto out; 1730 break; 1731 case KVM_CREATE_PIT: 1732 r = -ENOMEM; 1733 kvm->arch.vpit = kvm_create_pit(kvm); 1734 if (kvm->arch.vpit) 1735 r = 0; 1736 break; 1737 case KVM_IRQ_LINE: { 1738 struct kvm_irq_level irq_event; 1739 1740 r = -EFAULT; 1741 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1742 goto out; 1743 if (irqchip_in_kernel(kvm)) { 1744 mutex_lock(&kvm->lock); 1745 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1746 irq_event.irq, irq_event.level); 1747 mutex_unlock(&kvm->lock); 1748 r 
= 0;
		}
		break;
	}
	case KVM_GET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);

		r = -ENOMEM;
		if (!chip)
			goto out;
		r = -EFAULT;
		if (copy_from_user(chip, argp, sizeof *chip))
			goto get_irqchip_out;
		r = -ENXIO;
		if (!irqchip_in_kernel(kvm))
			goto get_irqchip_out;
		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
		if (r)
			goto get_irqchip_out;
		r = -EFAULT;
		if (copy_to_user(argp, chip, sizeof *chip))
			goto get_irqchip_out;
		r = 0;
	get_irqchip_out:
		kfree(chip);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);

		r = -ENOMEM;
		if (!chip)
			goto out;
		r = -EFAULT;
		if (copy_from_user(chip, argp, sizeof *chip))
			goto set_irqchip_out;
		r = -ENXIO;
		if (!irqchip_in_kernel(kvm))
			goto set_irqchip_out;
		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
		if (r)
			goto set_irqchip_out;
		r = 0;
	set_irqchip_out:
		kfree(chip);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_PIT: {
		r = -EFAULT;
		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_PIT: {
		r = -EFAULT;
		if (copy_from_user(&u.ps, argp, sizeof u.ps))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		;
	}
out:
	return r;
}

static void kvm_init_msr_list(void)
{
	u32 dummy[2];
	unsigned i, j;

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}

/*
 * Only the APIC needs an MMIO device hook, so shortcut now.
1854 */ 1855 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1856 gpa_t addr, int len, 1857 int is_write) 1858 { 1859 struct kvm_io_device *dev; 1860 1861 if (vcpu->arch.apic) { 1862 dev = &vcpu->arch.apic->dev; 1863 if (dev->in_range(dev, addr, len, is_write)) 1864 return dev; 1865 } 1866 return NULL; 1867 } 1868 1869 1870 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1871 gpa_t addr, int len, 1872 int is_write) 1873 { 1874 struct kvm_io_device *dev; 1875 1876 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 1877 if (dev == NULL) 1878 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, 1879 is_write); 1880 return dev; 1881 } 1882 1883 int emulator_read_std(unsigned long addr, 1884 void *val, 1885 unsigned int bytes, 1886 struct kvm_vcpu *vcpu) 1887 { 1888 void *data = val; 1889 int r = X86EMUL_CONTINUE; 1890 1891 while (bytes) { 1892 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1893 unsigned offset = addr & (PAGE_SIZE-1); 1894 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 1895 int ret; 1896 1897 if (gpa == UNMAPPED_GVA) { 1898 r = X86EMUL_PROPAGATE_FAULT; 1899 goto out; 1900 } 1901 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); 1902 if (ret < 0) { 1903 r = X86EMUL_UNHANDLEABLE; 1904 goto out; 1905 } 1906 1907 bytes -= tocopy; 1908 data += tocopy; 1909 addr += tocopy; 1910 } 1911 out: 1912 return r; 1913 } 1914 EXPORT_SYMBOL_GPL(emulator_read_std); 1915 1916 static int emulator_read_emulated(unsigned long addr, 1917 void *val, 1918 unsigned int bytes, 1919 struct kvm_vcpu *vcpu) 1920 { 1921 struct kvm_io_device *mmio_dev; 1922 gpa_t gpa; 1923 1924 if (vcpu->mmio_read_completed) { 1925 memcpy(val, vcpu->mmio_data, bytes); 1926 vcpu->mmio_read_completed = 0; 1927 return X86EMUL_CONTINUE; 1928 } 1929 1930 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1931 1932 /* For APIC access vmexit */ 1933 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 1934 goto mmio; 1935 1936 if (emulator_read_std(addr, val, bytes, vcpu) 1937 == X86EMUL_CONTINUE) 1938 return X86EMUL_CONTINUE; 1939 if (gpa == UNMAPPED_GVA) 1940 return X86EMUL_PROPAGATE_FAULT; 1941 1942 mmio: 1943 /* 1944 * Is this MMIO handled locally? 
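	 *
	 * The per-vcpu APIC device is checked first, then kvm->mmio_bus;
	 * if no in-kernel device claims the range, the access is recorded
	 * in the vcpu's mmio_* fields and completed in userspace via a
	 * KVM_EXIT_MMIO exit.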
1945 */ 1946 mutex_lock(&vcpu->kvm->lock); 1947 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 1948 if (mmio_dev) { 1949 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1950 mutex_unlock(&vcpu->kvm->lock); 1951 return X86EMUL_CONTINUE; 1952 } 1953 mutex_unlock(&vcpu->kvm->lock); 1954 1955 vcpu->mmio_needed = 1; 1956 vcpu->mmio_phys_addr = gpa; 1957 vcpu->mmio_size = bytes; 1958 vcpu->mmio_is_write = 0; 1959 1960 return X86EMUL_UNHANDLEABLE; 1961 } 1962 1963 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1964 const void *val, int bytes) 1965 { 1966 int ret; 1967 1968 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 1969 if (ret < 0) 1970 return 0; 1971 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1972 return 1; 1973 } 1974 1975 static int emulator_write_emulated_onepage(unsigned long addr, 1976 const void *val, 1977 unsigned int bytes, 1978 struct kvm_vcpu *vcpu) 1979 { 1980 struct kvm_io_device *mmio_dev; 1981 gpa_t gpa; 1982 1983 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1984 1985 if (gpa == UNMAPPED_GVA) { 1986 kvm_inject_page_fault(vcpu, addr, 2); 1987 return X86EMUL_PROPAGATE_FAULT; 1988 } 1989 1990 /* For APIC access vmexit */ 1991 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 1992 goto mmio; 1993 1994 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1995 return X86EMUL_CONTINUE; 1996 1997 mmio: 1998 /* 1999 * Is this MMIO handled locally? 2000 */ 2001 mutex_lock(&vcpu->kvm->lock); 2002 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); 2003 if (mmio_dev) { 2004 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2005 mutex_unlock(&vcpu->kvm->lock); 2006 return X86EMUL_CONTINUE; 2007 } 2008 mutex_unlock(&vcpu->kvm->lock); 2009 2010 vcpu->mmio_needed = 1; 2011 vcpu->mmio_phys_addr = gpa; 2012 vcpu->mmio_size = bytes; 2013 vcpu->mmio_is_write = 1; 2014 memcpy(vcpu->mmio_data, val, bytes); 2015 2016 return X86EMUL_CONTINUE; 2017 } 2018 2019 int emulator_write_emulated(unsigned long addr, 2020 const void *val, 2021 unsigned int bytes, 2022 struct kvm_vcpu *vcpu) 2023 { 2024 /* Crossing a page boundary? 
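	 * If so, split the write: "-addr & ~PAGE_MASK" is the number of
	 * bytes up to the next page boundary; that chunk is emulated first,
	 * then we fall through to the remainder on the next page.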
 */
	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		int rc, now;

		now = -addr & ~PAGE_MASK;
		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
		if (rc != X86EMUL_CONTINUE)
			return rc;
		addr += now;
		val += now;
		bytes -= now;
	}
	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);

static int emulator_cmpxchg_emulated(unsigned long addr,
				     const void *old,
				     const void *new,
				     unsigned int bytes,
				     struct kvm_vcpu *vcpu)
{
	static int reported;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange as write\n");
	}
#ifndef CONFIG_X86_64
	/* guest cmpxchg8b has to be emulated atomically */
	if (bytes == 8) {
		gpa_t gpa;
		struct page *page;
		char *kaddr;
		u64 val;

		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

		if (gpa == UNMAPPED_GVA ||
		    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
			goto emul_write;

		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
			goto emul_write;

		val = *(u64 *)new;

		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);

		kaddr = kmap_atomic(page, KM_USER0);
		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
		kunmap_atomic(kaddr, KM_USER0);
		kvm_release_page_dirty(page);
	}
emul_write:
#endif

	return emulator_write_emulated(addr, new, bytes, vcpu);
}

static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	return kvm_x86_ops->get_segment_base(vcpu, seg);
}

int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
	kvm_mmu_invlpg(vcpu, address);
	return X86EMUL_CONTINUE;
}

int emulate_clts(struct kvm_vcpu *vcpu)
{
	KVMTRACE_0D(CLTS, vcpu, handler);
	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
	return X86EMUL_CONTINUE;
}

int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	switch (dr) {
	case 0 ... 3:
		*dest = kvm_x86_ops->get_dr(vcpu, dr);
		return X86EMUL_CONTINUE;
	default:
		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
		return X86EMUL_UNHANDLEABLE;
	}
}

int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
	int exception;

	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
	u8 opcodes[4];
	unsigned long rip = kvm_rip_read(vcpu);
	unsigned long rip_linear;

	if (!printk_ratelimit())
		return;

	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

	emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);

	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

static struct x86_emulate_ops emulate_ops = {
	.read_std            = emulator_read_std,
	.read_emulated       = emulator_read_emulated,
	.write_emulated      = emulator_write_emulated,
	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
};

static void cache_all_regs(struct kvm_vcpu *vcpu)
{
	kvm_register_read(vcpu, VCPU_REGS_RAX);
	kvm_register_read(vcpu, VCPU_REGS_RSP);
	kvm_register_read(vcpu, VCPU_REGS_RIP);
	vcpu->arch.regs_dirty = ~0;
}

int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code,
			int emulation_type)
{
	int r;
	struct decode_cache *c;

	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
	 * TODO: fix x86_emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses, can save hundred cycles
	 * on Intel for instructions that don't read/change RSP, for
	 * example.
	 */
	cache_all_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->arch.pio.string = 0;

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->arch.emulate_ctxt.vcpu = vcpu;
		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
		vcpu->arch.emulate_ctxt.mode =
			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 : cs_db
			?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2195 2196 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2197 2198 /* Reject the instructions other than VMCALL/VMMCALL when 2199 * try to emulate invalid opcode */ 2200 c = &vcpu->arch.emulate_ctxt.decode; 2201 if ((emulation_type & EMULTYPE_TRAP_UD) && 2202 (!(c->twobyte && c->b == 0x01 && 2203 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2204 c->modrm_mod == 3 && c->modrm_rm == 1))) 2205 return EMULATE_FAIL; 2206 2207 ++vcpu->stat.insn_emulation; 2208 if (r) { 2209 ++vcpu->stat.insn_emulation_fail; 2210 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2211 return EMULATE_DONE; 2212 return EMULATE_FAIL; 2213 } 2214 } 2215 2216 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2217 2218 if (vcpu->arch.pio.string) 2219 return EMULATE_DO_MMIO; 2220 2221 if ((r || vcpu->mmio_is_write) && run) { 2222 run->exit_reason = KVM_EXIT_MMIO; 2223 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2224 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2225 run->mmio.len = vcpu->mmio_size; 2226 run->mmio.is_write = vcpu->mmio_is_write; 2227 } 2228 2229 if (r) { 2230 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2231 return EMULATE_DONE; 2232 if (!vcpu->mmio_needed) { 2233 kvm_report_emulation_failure(vcpu, "mmio"); 2234 return EMULATE_FAIL; 2235 } 2236 return EMULATE_DO_MMIO; 2237 } 2238 2239 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2240 2241 if (vcpu->mmio_is_write) { 2242 vcpu->mmio_needed = 0; 2243 return EMULATE_DO_MMIO; 2244 } 2245 2246 return EMULATE_DONE; 2247 } 2248 EXPORT_SYMBOL_GPL(emulate_instruction); 2249 2250 static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 2251 { 2252 int i; 2253 2254 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) 2255 if (vcpu->arch.pio.guest_pages[i]) { 2256 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); 2257 vcpu->arch.pio.guest_pages[i] = NULL; 2258 } 2259 } 2260 2261 static int pio_copy_data(struct kvm_vcpu *vcpu) 2262 { 2263 void *p = vcpu->arch.pio_data; 2264 void *q; 2265 unsigned bytes; 2266 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; 2267 2268 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 2269 PAGE_KERNEL); 2270 if (!q) { 2271 free_pio_guest_pages(vcpu); 2272 return -ENOMEM; 2273 } 2274 q += vcpu->arch.pio.guest_page_offset; 2275 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2276 if (vcpu->arch.pio.in) 2277 memcpy(q, p, bytes); 2278 else 2279 memcpy(p, q, bytes); 2280 q -= vcpu->arch.pio.guest_page_offset; 2281 vunmap(q); 2282 free_pio_guest_pages(vcpu); 2283 return 0; 2284 } 2285 2286 int complete_pio(struct kvm_vcpu *vcpu) 2287 { 2288 struct kvm_pio_request *io = &vcpu->arch.pio; 2289 long delta; 2290 int r; 2291 unsigned long val; 2292 2293 if (!io->string) { 2294 if (io->in) { 2295 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2296 memcpy(&val, vcpu->arch.pio_data, io->size); 2297 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2298 } 2299 } else { 2300 if (io->in) { 2301 r = pio_copy_data(vcpu); 2302 if (r) 2303 return r; 2304 } 2305 2306 delta = 1; 2307 if (io->rep) { 2308 delta *= io->cur_count; 2309 /* 2310 * The size of the register should really depend on 2311 * current address size. 
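			 * For example, a REP INS/OUTS executed with a 16-bit
			 * address size should only update CX, not all of RCX
			 * as is done here.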
2312 */ 2313 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2314 val -= delta; 2315 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2316 } 2317 if (io->down) 2318 delta = -delta; 2319 delta *= io->size; 2320 if (io->in) { 2321 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2322 val += delta; 2323 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2324 } else { 2325 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2326 val += delta; 2327 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2328 } 2329 } 2330 2331 io->count -= io->cur_count; 2332 io->cur_count = 0; 2333 2334 return 0; 2335 } 2336 2337 static void kernel_pio(struct kvm_io_device *pio_dev, 2338 struct kvm_vcpu *vcpu, 2339 void *pd) 2340 { 2341 /* TODO: String I/O for in kernel device */ 2342 2343 mutex_lock(&vcpu->kvm->lock); 2344 if (vcpu->arch.pio.in) 2345 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2346 vcpu->arch.pio.size, 2347 pd); 2348 else 2349 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2350 vcpu->arch.pio.size, 2351 pd); 2352 mutex_unlock(&vcpu->kvm->lock); 2353 } 2354 2355 static void pio_string_write(struct kvm_io_device *pio_dev, 2356 struct kvm_vcpu *vcpu) 2357 { 2358 struct kvm_pio_request *io = &vcpu->arch.pio; 2359 void *pd = vcpu->arch.pio_data; 2360 int i; 2361 2362 mutex_lock(&vcpu->kvm->lock); 2363 for (i = 0; i < io->cur_count; i++) { 2364 kvm_iodevice_write(pio_dev, io->port, 2365 io->size, 2366 pd); 2367 pd += io->size; 2368 } 2369 mutex_unlock(&vcpu->kvm->lock); 2370 } 2371 2372 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2373 gpa_t addr, int len, 2374 int is_write) 2375 { 2376 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); 2377 } 2378 2379 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2380 int size, unsigned port) 2381 { 2382 struct kvm_io_device *pio_dev; 2383 unsigned long val; 2384 2385 vcpu->run->exit_reason = KVM_EXIT_IO; 2386 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2387 vcpu->run->io.size = vcpu->arch.pio.size = size; 2388 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2389 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2390 vcpu->run->io.port = vcpu->arch.pio.port = port; 2391 vcpu->arch.pio.in = in; 2392 vcpu->arch.pio.string = 0; 2393 vcpu->arch.pio.down = 0; 2394 vcpu->arch.pio.guest_page_offset = 0; 2395 vcpu->arch.pio.rep = 0; 2396 2397 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2398 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2399 handler); 2400 else 2401 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2402 handler); 2403 2404 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2405 memcpy(vcpu->arch.pio_data, &val, 4); 2406 2407 kvm_x86_ops->skip_emulated_instruction(vcpu); 2408 2409 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2410 if (pio_dev) { 2411 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2412 complete_pio(vcpu); 2413 return 1; 2414 } 2415 return 0; 2416 } 2417 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2418 2419 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2420 int size, unsigned long count, int down, 2421 gva_t address, int rep, unsigned port) 2422 { 2423 unsigned now, in_page; 2424 int i, ret = 0; 2425 int nr_pages = 1; 2426 struct page *page; 2427 struct kvm_io_device *pio_dev; 2428 2429 vcpu->run->exit_reason = KVM_EXIT_IO; 2430 vcpu->run->io.direction = in ? 
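	/*
	 * String variant: record the same KVM_EXIT_IO information as
	 * kvm_emulate_pio() above, plus the string-specific state (down,
	 * rep, guest_page_offset) that pio_copy_data() and complete_pio()
	 * consume later.
	 */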
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2431 vcpu->run->io.size = vcpu->arch.pio.size = size; 2432 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2433 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2434 vcpu->run->io.port = vcpu->arch.pio.port = port; 2435 vcpu->arch.pio.in = in; 2436 vcpu->arch.pio.string = 1; 2437 vcpu->arch.pio.down = down; 2438 vcpu->arch.pio.guest_page_offset = offset_in_page(address); 2439 vcpu->arch.pio.rep = rep; 2440 2441 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2442 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2443 handler); 2444 else 2445 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2446 handler); 2447 2448 if (!count) { 2449 kvm_x86_ops->skip_emulated_instruction(vcpu); 2450 return 1; 2451 } 2452 2453 if (!down) 2454 in_page = PAGE_SIZE - offset_in_page(address); 2455 else 2456 in_page = offset_in_page(address) + size; 2457 now = min(count, (unsigned long)in_page / size); 2458 if (!now) { 2459 /* 2460 * String I/O straddles page boundary. Pin two guest pages 2461 * so that we satisfy atomicity constraints. Do just one 2462 * transaction to avoid complexity. 2463 */ 2464 nr_pages = 2; 2465 now = 1; 2466 } 2467 if (down) { 2468 /* 2469 * String I/O in reverse. Yuck. Kill the guest, fix later. 2470 */ 2471 pr_unimpl(vcpu, "guest string pio down\n"); 2472 kvm_inject_gp(vcpu, 0); 2473 return 1; 2474 } 2475 vcpu->run->io.count = now; 2476 vcpu->arch.pio.cur_count = now; 2477 2478 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2479 kvm_x86_ops->skip_emulated_instruction(vcpu); 2480 2481 for (i = 0; i < nr_pages; ++i) { 2482 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2483 vcpu->arch.pio.guest_pages[i] = page; 2484 if (!page) { 2485 kvm_inject_gp(vcpu, 0); 2486 free_pio_guest_pages(vcpu); 2487 return 1; 2488 } 2489 } 2490 2491 pio_dev = vcpu_find_pio_dev(vcpu, port, 2492 vcpu->arch.pio.cur_count, 2493 !vcpu->arch.pio.in); 2494 if (!vcpu->arch.pio.in) { 2495 /* string PIO write */ 2496 ret = pio_copy_data(vcpu); 2497 if (ret >= 0 && pio_dev) { 2498 pio_string_write(pio_dev, vcpu); 2499 complete_pio(vcpu); 2500 if (vcpu->arch.pio.count == 0) 2501 ret = 1; 2502 } 2503 } else if (pio_dev) 2504 pr_unimpl(vcpu, "no string pio read support yet, " 2505 "port %x size %d count %ld\n", 2506 port, size, count); 2507 2508 return ret; 2509 } 2510 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2511 2512 int kvm_arch_init(void *opaque) 2513 { 2514 int r; 2515 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2516 2517 if (kvm_x86_ops) { 2518 printk(KERN_ERR "kvm: already loaded the other module\n"); 2519 r = -EEXIST; 2520 goto out; 2521 } 2522 2523 if (!ops->cpu_has_kvm_support()) { 2524 printk(KERN_ERR "kvm: no hardware support\n"); 2525 r = -EOPNOTSUPP; 2526 goto out; 2527 } 2528 if (ops->disabled_by_bios()) { 2529 printk(KERN_ERR "kvm: disabled by bios\n"); 2530 r = -EOPNOTSUPP; 2531 goto out; 2532 } 2533 2534 r = kvm_mmu_module_init(); 2535 if (r) 2536 goto out; 2537 2538 kvm_init_msr_list(); 2539 2540 kvm_x86_ops = ops; 2541 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2542 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2543 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2544 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2545 return 0; 2546 2547 out: 2548 return r; 2549 } 2550 2551 void kvm_arch_exit(void) 2552 { 2553 kvm_x86_ops = NULL; 2554 kvm_mmu_module_exit(); 2555 } 2556 2557 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2558 { 2559 ++vcpu->stat.halt_exits; 2560 KVMTRACE_0D(HLT, vcpu, handler); 2561 if 
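	/*
	 * With an in-kernel irqchip the halted vcpu can be woken up again
	 * from inside the kernel, so recording the HALTED mp_state is
	 * enough; otherwise userspace owns the interrupt logic and has to
	 * see the HLT exit.
	 */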
(irqchip_in_kernel(vcpu->kvm)) { 2562 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2563 return 1; 2564 } else { 2565 vcpu->run->exit_reason = KVM_EXIT_HLT; 2566 return 0; 2567 } 2568 } 2569 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2570 2571 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2572 unsigned long a1) 2573 { 2574 if (is_long_mode(vcpu)) 2575 return a0; 2576 else 2577 return a0 | ((gpa_t)a1 << 32); 2578 } 2579 2580 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2581 { 2582 unsigned long nr, a0, a1, a2, a3, ret; 2583 int r = 1; 2584 2585 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 2586 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 2587 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 2588 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 2589 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 2590 2591 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2592 2593 if (!is_long_mode(vcpu)) { 2594 nr &= 0xFFFFFFFF; 2595 a0 &= 0xFFFFFFFF; 2596 a1 &= 0xFFFFFFFF; 2597 a2 &= 0xFFFFFFFF; 2598 a3 &= 0xFFFFFFFF; 2599 } 2600 2601 switch (nr) { 2602 case KVM_HC_VAPIC_POLL_IRQ: 2603 ret = 0; 2604 break; 2605 case KVM_HC_MMU_OP: 2606 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2607 break; 2608 default: 2609 ret = -KVM_ENOSYS; 2610 break; 2611 } 2612 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 2613 ++vcpu->stat.hypercalls; 2614 return r; 2615 } 2616 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2617 2618 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2619 { 2620 char instruction[3]; 2621 int ret = 0; 2622 unsigned long rip = kvm_rip_read(vcpu); 2623 2624 2625 /* 2626 * Blow out the MMU to ensure that no other VCPU has an active mapping 2627 * to ensure that the updated hypercall appears atomically across all 2628 * VCPUs. 2629 */ 2630 kvm_mmu_zap_all(vcpu->kvm); 2631 2632 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2633 if (emulator_write_emulated(rip, instruction, 3, vcpu) 2634 != X86EMUL_CONTINUE) 2635 ret = -EFAULT; 2636 2637 return ret; 2638 } 2639 2640 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2641 { 2642 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2643 } 2644 2645 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2646 { 2647 struct descriptor_table dt = { limit, base }; 2648 2649 kvm_x86_ops->set_gdt(vcpu, &dt); 2650 } 2651 2652 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2653 { 2654 struct descriptor_table dt = { limit, base }; 2655 2656 kvm_x86_ops->set_idt(vcpu, &dt); 2657 } 2658 2659 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2660 unsigned long *rflags) 2661 { 2662 kvm_lmsw(vcpu, msw); 2663 *rflags = kvm_x86_ops->get_rflags(vcpu); 2664 } 2665 2666 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2667 { 2668 unsigned long value; 2669 2670 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2671 switch (cr) { 2672 case 0: 2673 value = vcpu->arch.cr0; 2674 break; 2675 case 2: 2676 value = vcpu->arch.cr2; 2677 break; 2678 case 3: 2679 value = vcpu->arch.cr3; 2680 break; 2681 case 4: 2682 value = vcpu->arch.cr4; 2683 break; 2684 case 8: 2685 value = kvm_get_cr8(vcpu); 2686 break; 2687 default: 2688 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2689 return 0; 2690 } 2691 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, 2692 (u32)((u64)value >> 32), handler); 2693 2694 return value; 2695 } 2696 2697 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2698 unsigned long *rflags) 2699 { 2700 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, 2701 (u32)((u64)val >> 32), handler); 
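	/*
	 * mk_cr_64() above keeps bits 63:32 of the current register value and
	 * replaces bits 31:0 with the value written by the real-mode code,
	 * e.g. mk_cr_64(0xffff00000000000f, 0x80000011) == 0xffff000080000011.
	 */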
	switch (cr) {
	case 0:
		kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
		*rflags = kvm_x86_ops->get_rflags(vcpu);
		break;
	case 2:
		vcpu->arch.cr2 = val;
		break;
	case 3:
		kvm_set_cr3(vcpu, val);
		break;
	case 4:
		kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
		break;
	case 8:
		kvm_set_cr8(vcpu, val & 0xfUL);
		break;
	default:
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
	}
}

static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
	int j, nent = vcpu->arch.cpuid_nent;

	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
	/* when no next entry is found, the current entry[i] is reselected */
	for (j = (i + 1) % nent; ; j = (j + 1) % nent) {
		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
		if (ej->function == e->function) {
			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
			return j;
		}
	}
	return 0; /* silence gcc, even though control never reaches here */
}

/* find an entry with matching function, matching index (if needed), and that
 * should be read next (if it's stateful) */
static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
				   u32 function, u32 index)
{
	if (e->function != function)
		return 0;
	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
		return 0;
	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
		return 0;
	return 1;
}

void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	int i;
	u32 function, index;
	struct kvm_cpuid_entry2 *e, *best;

	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
	best = NULL;
	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];
		if (is_matching_cpuid_entry(e, function, index)) {
			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
				move_to_next_stateful_cpuid_entry(vcpu, i);
			best = e;
			break;
		}
		/*
		 * Both basic or both extended?
		 */
		if (((e->function ^ function) & 0x80000000) == 0)
			if (!best || e->function > best->function)
				best = e;
	}
	if (best) {
		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
	}
	kvm_x86_ops->skip_emulated_instruction(vcpu);
	KVMTRACE_5D(CPUID, vcpu, function,
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);

/*
 * Check if userspace requested an interrupt window, and that the
 * interrupt window is open.
 *
 * No need to exit to userspace if we already have an interrupt queued.
2805 */ 2806 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 2807 struct kvm_run *kvm_run) 2808 { 2809 return (!vcpu->arch.irq_summary && 2810 kvm_run->request_interrupt_window && 2811 vcpu->arch.interrupt_window_open && 2812 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 2813 } 2814 2815 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 2816 struct kvm_run *kvm_run) 2817 { 2818 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 2819 kvm_run->cr8 = kvm_get_cr8(vcpu); 2820 kvm_run->apic_base = kvm_get_apic_base(vcpu); 2821 if (irqchip_in_kernel(vcpu->kvm)) 2822 kvm_run->ready_for_interrupt_injection = 1; 2823 else 2824 kvm_run->ready_for_interrupt_injection = 2825 (vcpu->arch.interrupt_window_open && 2826 vcpu->arch.irq_summary == 0); 2827 } 2828 2829 static void vapic_enter(struct kvm_vcpu *vcpu) 2830 { 2831 struct kvm_lapic *apic = vcpu->arch.apic; 2832 struct page *page; 2833 2834 if (!apic || !apic->vapic_addr) 2835 return; 2836 2837 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2838 2839 vcpu->arch.apic->vapic_page = page; 2840 } 2841 2842 static void vapic_exit(struct kvm_vcpu *vcpu) 2843 { 2844 struct kvm_lapic *apic = vcpu->arch.apic; 2845 2846 if (!apic || !apic->vapic_addr) 2847 return; 2848 2849 down_read(&vcpu->kvm->slots_lock); 2850 kvm_release_page_dirty(apic->vapic_page); 2851 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2852 up_read(&vcpu->kvm->slots_lock); 2853 } 2854 2855 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2856 { 2857 int r; 2858 2859 if (vcpu->requests) 2860 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2861 kvm_mmu_unload(vcpu); 2862 2863 r = kvm_mmu_reload(vcpu); 2864 if (unlikely(r)) 2865 goto out; 2866 2867 if (vcpu->requests) { 2868 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2869 __kvm_migrate_timers(vcpu); 2870 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 2871 kvm_mmu_sync_roots(vcpu); 2872 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2873 kvm_x86_ops->tlb_flush(vcpu); 2874 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2875 &vcpu->requests)) { 2876 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2877 r = 0; 2878 goto out; 2879 } 2880 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 2881 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2882 r = 0; 2883 goto out; 2884 } 2885 } 2886 2887 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 2888 kvm_inject_pending_timer_irqs(vcpu); 2889 2890 preempt_disable(); 2891 2892 kvm_x86_ops->prepare_guest_switch(vcpu); 2893 kvm_load_guest_fpu(vcpu); 2894 2895 local_irq_disable(); 2896 2897 if (vcpu->requests || need_resched() || signal_pending(current)) { 2898 local_irq_enable(); 2899 preempt_enable(); 2900 r = 1; 2901 goto out; 2902 } 2903 2904 if (vcpu->guest_debug.enabled) 2905 kvm_x86_ops->guest_debug_pre(vcpu); 2906 2907 vcpu->guest_mode = 1; 2908 /* 2909 * Make sure that guest_mode assignment won't happen after 2910 * testing the pending IRQ vector bitmap. 
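	 * (kvm_vcpu_kick() checks vcpu->guest_mode before deciding to send a
	 * cross-CPU IPI, so the flag has to be set before any pending work is
	 * sampled below.)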
2911 */ 2912 smp_wmb(); 2913 2914 if (vcpu->arch.exception.pending) 2915 __queue_exception(vcpu); 2916 else if (irqchip_in_kernel(vcpu->kvm)) 2917 kvm_x86_ops->inject_pending_irq(vcpu); 2918 else 2919 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2920 2921 kvm_lapic_sync_to_vapic(vcpu); 2922 2923 up_read(&vcpu->kvm->slots_lock); 2924 2925 kvm_guest_enter(); 2926 2927 2928 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2929 kvm_x86_ops->run(vcpu, kvm_run); 2930 2931 vcpu->guest_mode = 0; 2932 local_irq_enable(); 2933 2934 ++vcpu->stat.exits; 2935 2936 /* 2937 * We must have an instruction between local_irq_enable() and 2938 * kvm_guest_exit(), so the timer interrupt isn't delayed by 2939 * the interrupt shadow. The stat.exits increment will do nicely. 2940 * But we need to prevent reordering, hence this barrier(): 2941 */ 2942 barrier(); 2943 2944 kvm_guest_exit(); 2945 2946 preempt_enable(); 2947 2948 down_read(&vcpu->kvm->slots_lock); 2949 2950 /* 2951 * Profile KVM exit RIPs: 2952 */ 2953 if (unlikely(prof_on == KVM_PROFILING)) { 2954 unsigned long rip = kvm_rip_read(vcpu); 2955 profile_hit(KVM_PROFILING, (void *)rip); 2956 } 2957 2958 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2959 vcpu->arch.exception.pending = false; 2960 2961 kvm_lapic_sync_from_vapic(vcpu); 2962 2963 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2964 out: 2965 return r; 2966 } 2967 2968 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2969 { 2970 int r; 2971 2972 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 2973 pr_debug("vcpu %d received sipi with vector # %x\n", 2974 vcpu->vcpu_id, vcpu->arch.sipi_vector); 2975 kvm_lapic_reset(vcpu); 2976 r = kvm_x86_ops->vcpu_reset(vcpu); 2977 if (r) 2978 return r; 2979 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 2980 } 2981 2982 down_read(&vcpu->kvm->slots_lock); 2983 vapic_enter(vcpu); 2984 2985 r = 1; 2986 while (r > 0) { 2987 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 2988 r = vcpu_enter_guest(vcpu, kvm_run); 2989 else { 2990 up_read(&vcpu->kvm->slots_lock); 2991 kvm_vcpu_block(vcpu); 2992 down_read(&vcpu->kvm->slots_lock); 2993 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 2994 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 2995 vcpu->arch.mp_state = 2996 KVM_MP_STATE_RUNNABLE; 2997 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 2998 r = -EINTR; 2999 } 3000 3001 if (r > 0) { 3002 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3003 r = -EINTR; 3004 kvm_run->exit_reason = KVM_EXIT_INTR; 3005 ++vcpu->stat.request_irq_exits; 3006 } 3007 if (signal_pending(current)) { 3008 r = -EINTR; 3009 kvm_run->exit_reason = KVM_EXIT_INTR; 3010 ++vcpu->stat.signal_exits; 3011 } 3012 if (need_resched()) { 3013 up_read(&vcpu->kvm->slots_lock); 3014 kvm_resched(vcpu); 3015 down_read(&vcpu->kvm->slots_lock); 3016 } 3017 } 3018 } 3019 3020 up_read(&vcpu->kvm->slots_lock); 3021 post_kvm_run_save(vcpu, kvm_run); 3022 3023 vapic_exit(vcpu); 3024 3025 return r; 3026 } 3027 3028 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3029 { 3030 int r; 3031 sigset_t sigsaved; 3032 3033 vcpu_load(vcpu); 3034 3035 if (vcpu->sigset_active) 3036 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3037 3038 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3039 kvm_vcpu_block(vcpu); 3040 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3041 r = -EAGAIN; 3042 goto out; 3043 } 3044 3045 /* re-sync apic's tpr */ 3046 if (!irqchip_in_kernel(vcpu->kvm)) 3047 kvm_set_cr8(vcpu, 
kvm_run->cr8); 3048 3049 if (vcpu->arch.pio.cur_count) { 3050 r = complete_pio(vcpu); 3051 if (r) 3052 goto out; 3053 } 3054 #if CONFIG_HAS_IOMEM 3055 if (vcpu->mmio_needed) { 3056 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3057 vcpu->mmio_read_completed = 1; 3058 vcpu->mmio_needed = 0; 3059 3060 down_read(&vcpu->kvm->slots_lock); 3061 r = emulate_instruction(vcpu, kvm_run, 3062 vcpu->arch.mmio_fault_cr2, 0, 3063 EMULTYPE_NO_DECODE); 3064 up_read(&vcpu->kvm->slots_lock); 3065 if (r == EMULATE_DO_MMIO) { 3066 /* 3067 * Read-modify-write. Back to userspace. 3068 */ 3069 r = 0; 3070 goto out; 3071 } 3072 } 3073 #endif 3074 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3075 kvm_register_write(vcpu, VCPU_REGS_RAX, 3076 kvm_run->hypercall.ret); 3077 3078 r = __vcpu_run(vcpu, kvm_run); 3079 3080 out: 3081 if (vcpu->sigset_active) 3082 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3083 3084 vcpu_put(vcpu); 3085 return r; 3086 } 3087 3088 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3089 { 3090 vcpu_load(vcpu); 3091 3092 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3093 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3094 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3095 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3096 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3097 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3098 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3099 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3100 #ifdef CONFIG_X86_64 3101 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3102 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3103 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3104 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3105 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3106 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3107 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3108 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3109 #endif 3110 3111 regs->rip = kvm_rip_read(vcpu); 3112 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3113 3114 /* 3115 * Don't leak debug flags in case they were set for guest debugging 3116 */ 3117 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) 3118 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3119 3120 vcpu_put(vcpu); 3121 3122 return 0; 3123 } 3124 3125 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3126 { 3127 vcpu_load(vcpu); 3128 3129 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3130 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3131 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3132 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3133 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3134 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3135 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3136 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3137 #ifdef CONFIG_X86_64 3138 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3139 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3140 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3141 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3142 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3143 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3144 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3145 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3146 3147 #endif 3148 3149 kvm_rip_write(vcpu, regs->rip); 3150 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3151 3152 3153 
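	/*
	 * Userspace has just replaced the whole register state, so any
	 * exception queued for re-injection refers to stale context and is
	 * dropped rather than injected into the new state.
	 */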
vcpu->arch.exception.pending = false; 3154 3155 vcpu_put(vcpu); 3156 3157 return 0; 3158 } 3159 3160 void kvm_get_segment(struct kvm_vcpu *vcpu, 3161 struct kvm_segment *var, int seg) 3162 { 3163 kvm_x86_ops->get_segment(vcpu, var, seg); 3164 } 3165 3166 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3167 { 3168 struct kvm_segment cs; 3169 3170 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3171 *db = cs.db; 3172 *l = cs.l; 3173 } 3174 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3175 3176 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3177 struct kvm_sregs *sregs) 3178 { 3179 struct descriptor_table dt; 3180 int pending_vec; 3181 3182 vcpu_load(vcpu); 3183 3184 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3185 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3186 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3187 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3188 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3189 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3190 3191 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3192 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3193 3194 kvm_x86_ops->get_idt(vcpu, &dt); 3195 sregs->idt.limit = dt.limit; 3196 sregs->idt.base = dt.base; 3197 kvm_x86_ops->get_gdt(vcpu, &dt); 3198 sregs->gdt.limit = dt.limit; 3199 sregs->gdt.base = dt.base; 3200 3201 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3202 sregs->cr0 = vcpu->arch.cr0; 3203 sregs->cr2 = vcpu->arch.cr2; 3204 sregs->cr3 = vcpu->arch.cr3; 3205 sregs->cr4 = vcpu->arch.cr4; 3206 sregs->cr8 = kvm_get_cr8(vcpu); 3207 sregs->efer = vcpu->arch.shadow_efer; 3208 sregs->apic_base = kvm_get_apic_base(vcpu); 3209 3210 if (irqchip_in_kernel(vcpu->kvm)) { 3211 memset(sregs->interrupt_bitmap, 0, 3212 sizeof sregs->interrupt_bitmap); 3213 pending_vec = kvm_x86_ops->get_irq(vcpu); 3214 if (pending_vec >= 0) 3215 set_bit(pending_vec, 3216 (unsigned long *)sregs->interrupt_bitmap); 3217 } else 3218 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, 3219 sizeof sregs->interrupt_bitmap); 3220 3221 vcpu_put(vcpu); 3222 3223 return 0; 3224 } 3225 3226 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3227 struct kvm_mp_state *mp_state) 3228 { 3229 vcpu_load(vcpu); 3230 mp_state->mp_state = vcpu->arch.mp_state; 3231 vcpu_put(vcpu); 3232 return 0; 3233 } 3234 3235 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3236 struct kvm_mp_state *mp_state) 3237 { 3238 vcpu_load(vcpu); 3239 vcpu->arch.mp_state = mp_state->mp_state; 3240 vcpu_put(vcpu); 3241 return 0; 3242 } 3243 3244 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3245 struct kvm_segment *var, int seg) 3246 { 3247 kvm_x86_ops->set_segment(vcpu, var, seg); 3248 } 3249 3250 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3251 struct kvm_segment *kvm_desct) 3252 { 3253 kvm_desct->base = seg_desc->base0; 3254 kvm_desct->base |= seg_desc->base1 << 16; 3255 kvm_desct->base |= seg_desc->base2 << 24; 3256 kvm_desct->limit = seg_desc->limit0; 3257 kvm_desct->limit |= seg_desc->limit << 16; 3258 if (seg_desc->g) { 3259 kvm_desct->limit <<= 12; 3260 kvm_desct->limit |= 0xfff; 3261 } 3262 kvm_desct->selector = selector; 3263 kvm_desct->type = seg_desc->type; 3264 kvm_desct->present = seg_desc->p; 3265 kvm_desct->dpl = seg_desc->dpl; 3266 kvm_desct->db = seg_desc->d; 3267 kvm_desct->s = seg_desc->s; 3268 kvm_desct->l = seg_desc->l; 3269 kvm_desct->g = seg_desc->g; 3270 kvm_desct->avl = seg_desc->avl; 3271 if (!selector) 3272 kvm_desct->unusable = 1; 3273 else 3274 
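		/* non-null selectors are presented as usable segments */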
kvm_desct->unusable = 0; 3275 kvm_desct->padding = 0; 3276 } 3277 3278 static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, 3279 u16 selector, 3280 struct descriptor_table *dtable) 3281 { 3282 if (selector & 1 << 2) { 3283 struct kvm_segment kvm_seg; 3284 3285 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3286 3287 if (kvm_seg.unusable) 3288 dtable->limit = 0; 3289 else 3290 dtable->limit = kvm_seg.limit; 3291 dtable->base = kvm_seg.base; 3292 } 3293 else 3294 kvm_x86_ops->get_gdt(vcpu, dtable); 3295 } 3296 3297 /* allowed just for 8 bytes segments */ 3298 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3299 struct desc_struct *seg_desc) 3300 { 3301 gpa_t gpa; 3302 struct descriptor_table dtable; 3303 u16 index = selector >> 3; 3304 3305 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3306 3307 if (dtable.limit < index * 8 + 7) { 3308 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3309 return 1; 3310 } 3311 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3312 gpa += index * 8; 3313 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); 3314 } 3315 3316 /* allowed just for 8 bytes segments */ 3317 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3318 struct desc_struct *seg_desc) 3319 { 3320 gpa_t gpa; 3321 struct descriptor_table dtable; 3322 u16 index = selector >> 3; 3323 3324 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3325 3326 if (dtable.limit < index * 8 + 7) 3327 return 1; 3328 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3329 gpa += index * 8; 3330 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); 3331 } 3332 3333 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3334 struct desc_struct *seg_desc) 3335 { 3336 u32 base_addr; 3337 3338 base_addr = seg_desc->base0; 3339 base_addr |= (seg_desc->base1 << 16); 3340 base_addr |= (seg_desc->base2 << 24); 3341 3342 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 3343 } 3344 3345 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3346 { 3347 struct kvm_segment kvm_seg; 3348 3349 kvm_get_segment(vcpu, &kvm_seg, seg); 3350 return kvm_seg.selector; 3351 } 3352 3353 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3354 u16 selector, 3355 struct kvm_segment *kvm_seg) 3356 { 3357 struct desc_struct seg_desc; 3358 3359 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3360 return 1; 3361 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3362 return 0; 3363 } 3364 3365 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 3366 { 3367 struct kvm_segment segvar = { 3368 .base = selector << 4, 3369 .limit = 0xffff, 3370 .selector = selector, 3371 .type = 3, 3372 .present = 1, 3373 .dpl = 3, 3374 .db = 0, 3375 .s = 1, 3376 .l = 0, 3377 .g = 0, 3378 .avl = 0, 3379 .unusable = 0, 3380 }; 3381 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 3382 return 0; 3383 } 3384 3385 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3386 int type_bits, int seg) 3387 { 3388 struct kvm_segment kvm_seg; 3389 3390 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 3391 return kvm_load_realmode_segment(vcpu, selector, seg); 3392 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3393 return 1; 3394 kvm_seg.type |= type_bits; 3395 3396 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3397 seg != VCPU_SREG_LDTR) 3398 if (!kvm_seg.s) 3399 kvm_seg.unusable = 1; 3400 3401 kvm_set_segment(vcpu, &kvm_seg, seg); 3402 return 0; 3403 } 3404 3405 static void 
save_state_to_tss32(struct kvm_vcpu *vcpu, 3406 struct tss_segment_32 *tss) 3407 { 3408 tss->cr3 = vcpu->arch.cr3; 3409 tss->eip = kvm_rip_read(vcpu); 3410 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3411 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3412 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3413 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3414 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3415 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3416 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3417 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3418 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3419 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3420 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3421 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3422 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3423 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3424 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3425 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3426 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3427 } 3428 3429 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3430 struct tss_segment_32 *tss) 3431 { 3432 kvm_set_cr3(vcpu, tss->cr3); 3433 3434 kvm_rip_write(vcpu, tss->eip); 3435 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3436 3437 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 3438 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 3439 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 3440 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 3441 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 3442 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 3443 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 3444 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 3445 3446 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3447 return 1; 3448 3449 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3450 return 1; 3451 3452 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3453 return 1; 3454 3455 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3456 return 1; 3457 3458 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3459 return 1; 3460 3461 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3462 return 1; 3463 3464 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3465 return 1; 3466 return 0; 3467 } 3468 3469 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3470 struct tss_segment_16 *tss) 3471 { 3472 tss->ip = kvm_rip_read(vcpu); 3473 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3474 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3475 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3476 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3477 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3478 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3479 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3480 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 3481 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 3482 3483 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3484 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3485 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3486 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3487 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3488 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3489 } 3490 3491 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3492 
struct tss_segment_16 *tss) 3493 { 3494 kvm_rip_write(vcpu, tss->ip); 3495 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3496 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 3497 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 3498 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 3499 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 3500 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 3501 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 3502 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 3503 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 3504 3505 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3506 return 1; 3507 3508 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3509 return 1; 3510 3511 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3512 return 1; 3513 3514 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3515 return 1; 3516 3517 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3518 return 1; 3519 return 0; 3520 } 3521 3522 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3523 u32 old_tss_base, 3524 struct desc_struct *nseg_desc) 3525 { 3526 struct tss_segment_16 tss_segment_16; 3527 int ret = 0; 3528 3529 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3530 sizeof tss_segment_16)) 3531 goto out; 3532 3533 save_state_to_tss16(vcpu, &tss_segment_16); 3534 3535 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3536 sizeof tss_segment_16)) 3537 goto out; 3538 3539 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3540 &tss_segment_16, sizeof tss_segment_16)) 3541 goto out; 3542 3543 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3544 goto out; 3545 3546 ret = 1; 3547 out: 3548 return ret; 3549 } 3550 3551 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3552 u32 old_tss_base, 3553 struct desc_struct *nseg_desc) 3554 { 3555 struct tss_segment_32 tss_segment_32; 3556 int ret = 0; 3557 3558 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3559 sizeof tss_segment_32)) 3560 goto out; 3561 3562 save_state_to_tss32(vcpu, &tss_segment_32); 3563 3564 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3565 sizeof tss_segment_32)) 3566 goto out; 3567 3568 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3569 &tss_segment_32, sizeof tss_segment_32)) 3570 goto out; 3571 3572 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3573 goto out; 3574 3575 ret = 1; 3576 out: 3577 return ret; 3578 } 3579 3580 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3581 { 3582 struct kvm_segment tr_seg; 3583 struct desc_struct cseg_desc; 3584 struct desc_struct nseg_desc; 3585 int ret = 0; 3586 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 3587 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 3588 3589 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 3590 3591 /* FIXME: Handle errors. Failure to read either TSS or their 3592 * descriptors should generate a pagefault. 
3593 */ 3594 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3595 goto out; 3596 3597 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 3598 goto out; 3599 3600 if (reason != TASK_SWITCH_IRET) { 3601 int cpl; 3602 3603 cpl = kvm_x86_ops->get_cpl(vcpu); 3604 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3605 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3606 return 1; 3607 } 3608 } 3609 3610 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3611 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3612 return 1; 3613 } 3614 3615 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3616 cseg_desc.type &= ~(1 << 1); //clear the B flag 3617 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 3618 } 3619 3620 if (reason == TASK_SWITCH_IRET) { 3621 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3622 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 3623 } 3624 3625 kvm_x86_ops->skip_emulated_instruction(vcpu); 3626 3627 if (nseg_desc.type & 8) 3628 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3629 &nseg_desc); 3630 else 3631 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 3632 &nseg_desc); 3633 3634 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3635 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3636 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 3637 } 3638 3639 if (reason != TASK_SWITCH_IRET) { 3640 nseg_desc.type |= (1 << 1); 3641 save_guest_segment_descriptor(vcpu, tss_selector, 3642 &nseg_desc); 3643 } 3644 3645 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3646 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3647 tr_seg.type = 11; 3648 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3649 out: 3650 return ret; 3651 } 3652 EXPORT_SYMBOL_GPL(kvm_task_switch); 3653 3654 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 3655 struct kvm_sregs *sregs) 3656 { 3657 int mmu_reset_needed = 0; 3658 int i, pending_vec, max_bits; 3659 struct descriptor_table dt; 3660 3661 vcpu_load(vcpu); 3662 3663 dt.limit = sregs->idt.limit; 3664 dt.base = sregs->idt.base; 3665 kvm_x86_ops->set_idt(vcpu, &dt); 3666 dt.limit = sregs->gdt.limit; 3667 dt.base = sregs->gdt.base; 3668 kvm_x86_ops->set_gdt(vcpu, &dt); 3669 3670 vcpu->arch.cr2 = sregs->cr2; 3671 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 3672 vcpu->arch.cr3 = sregs->cr3; 3673 3674 kvm_set_cr8(vcpu, sregs->cr8); 3675 3676 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 3677 kvm_x86_ops->set_efer(vcpu, sregs->efer); 3678 kvm_set_apic_base(vcpu, sregs->apic_base); 3679 3680 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3681 3682 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 3683 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 3684 vcpu->arch.cr0 = sregs->cr0; 3685 3686 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 3687 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 3688 if (!is_long_mode(vcpu) && is_pae(vcpu)) 3689 load_pdptrs(vcpu, vcpu->arch.cr3); 3690 3691 if (mmu_reset_needed) 3692 kvm_mmu_reset_context(vcpu); 3693 3694 if (!irqchip_in_kernel(vcpu->kvm)) { 3695 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 3696 sizeof vcpu->arch.irq_pending); 3697 vcpu->arch.irq_summary = 0; 3698 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 3699 if (vcpu->arch.irq_pending[i]) 3700 __set_bit(i, &vcpu->arch.irq_summary); 3701 } else { 3702 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 3703 pending_vec = find_first_bit( 3704 (const unsigned long 
*)sregs->interrupt_bitmap, 3705 max_bits); 3706 /* Only pending external irq is handled here */ 3707 if (pending_vec < max_bits) { 3708 kvm_x86_ops->set_irq(vcpu, pending_vec); 3709 pr_debug("Set back pending irq %d\n", 3710 pending_vec); 3711 } 3712 kvm_pic_clear_isr_ack(vcpu->kvm); 3713 } 3714 3715 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3716 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3717 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3718 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3719 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3720 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3721 3722 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3723 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3724 3725 /* Older userspace won't unhalt the vcpu on reset. */ 3726 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 3727 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 3728 !(vcpu->arch.cr0 & X86_CR0_PE)) 3729 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3730 3731 vcpu_put(vcpu); 3732 3733 return 0; 3734 } 3735 3736 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 3737 struct kvm_debug_guest *dbg) 3738 { 3739 int r; 3740 3741 vcpu_load(vcpu); 3742 3743 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 3744 3745 vcpu_put(vcpu); 3746 3747 return r; 3748 } 3749 3750 /* 3751 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 3752 * we have asm/x86/processor.h 3753 */ 3754 struct fxsave { 3755 u16 cwd; 3756 u16 swd; 3757 u16 twd; 3758 u16 fop; 3759 u64 rip; 3760 u64 rdp; 3761 u32 mxcsr; 3762 u32 mxcsr_mask; 3763 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 3764 #ifdef CONFIG_X86_64 3765 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 3766 #else 3767 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 3768 #endif 3769 }; 3770 3771 /* 3772 * Translate a guest virtual address to a guest physical address. 
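 * Backs the KVM_TRANSLATE vcpu ioctl; the lookup goes through the vcpu's
 * current paging mode via vcpu->arch.mmu.gva_to_gpa().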
3773 */ 3774 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 3775 struct kvm_translation *tr) 3776 { 3777 unsigned long vaddr = tr->linear_address; 3778 gpa_t gpa; 3779 3780 vcpu_load(vcpu); 3781 down_read(&vcpu->kvm->slots_lock); 3782 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 3783 up_read(&vcpu->kvm->slots_lock); 3784 tr->physical_address = gpa; 3785 tr->valid = gpa != UNMAPPED_GVA; 3786 tr->writeable = 1; 3787 tr->usermode = 0; 3788 vcpu_put(vcpu); 3789 3790 return 0; 3791 } 3792 3793 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 3794 { 3795 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 3796 3797 vcpu_load(vcpu); 3798 3799 memcpy(fpu->fpr, fxsave->st_space, 128); 3800 fpu->fcw = fxsave->cwd; 3801 fpu->fsw = fxsave->swd; 3802 fpu->ftwx = fxsave->twd; 3803 fpu->last_opcode = fxsave->fop; 3804 fpu->last_ip = fxsave->rip; 3805 fpu->last_dp = fxsave->rdp; 3806 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 3807 3808 vcpu_put(vcpu); 3809 3810 return 0; 3811 } 3812 3813 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 3814 { 3815 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 3816 3817 vcpu_load(vcpu); 3818 3819 memcpy(fxsave->st_space, fpu->fpr, 128); 3820 fxsave->cwd = fpu->fcw; 3821 fxsave->swd = fpu->fsw; 3822 fxsave->twd = fpu->ftwx; 3823 fxsave->fop = fpu->last_opcode; 3824 fxsave->rip = fpu->last_ip; 3825 fxsave->rdp = fpu->last_dp; 3826 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 3827 3828 vcpu_put(vcpu); 3829 3830 return 0; 3831 } 3832 3833 void fx_init(struct kvm_vcpu *vcpu) 3834 { 3835 unsigned after_mxcsr_mask; 3836 3837 /* 3838 * Touch the fpu the first time in non atomic context as if 3839 * this is the first fpu instruction the exception handler 3840 * will fire before the instruction returns and it'll have to 3841 * allocate ram with GFP_KERNEL. 
3842 */ 3843 if (!used_math()) 3844 kvm_fx_save(&vcpu->arch.host_fx_image); 3845 3846 /* Initialize guest FPU by resetting ours and saving into guest's */ 3847 preempt_disable(); 3848 kvm_fx_save(&vcpu->arch.host_fx_image); 3849 kvm_fx_finit(); 3850 kvm_fx_save(&vcpu->arch.guest_fx_image); 3851 kvm_fx_restore(&vcpu->arch.host_fx_image); 3852 preempt_enable(); 3853 3854 vcpu->arch.cr0 |= X86_CR0_ET; 3855 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 3856 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 3857 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 3858 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 3859 } 3860 EXPORT_SYMBOL_GPL(fx_init); 3861 3862 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 3863 { 3864 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 3865 return; 3866 3867 vcpu->guest_fpu_loaded = 1; 3868 kvm_fx_save(&vcpu->arch.host_fx_image); 3869 kvm_fx_restore(&vcpu->arch.guest_fx_image); 3870 } 3871 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3872 3873 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 3874 { 3875 if (!vcpu->guest_fpu_loaded) 3876 return; 3877 3878 vcpu->guest_fpu_loaded = 0; 3879 kvm_fx_save(&vcpu->arch.guest_fx_image); 3880 kvm_fx_restore(&vcpu->arch.host_fx_image); 3881 ++vcpu->stat.fpu_reload; 3882 } 3883 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3884 3885 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 3886 { 3887 kvm_x86_ops->vcpu_free(vcpu); 3888 } 3889 3890 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 3891 unsigned int id) 3892 { 3893 return kvm_x86_ops->vcpu_create(kvm, id); 3894 } 3895 3896 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 3897 { 3898 int r; 3899 3900 /* We do fxsave: this must be aligned. */ 3901 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 3902 3903 vcpu_load(vcpu); 3904 r = kvm_arch_vcpu_reset(vcpu); 3905 if (r == 0) 3906 r = kvm_mmu_setup(vcpu); 3907 vcpu_put(vcpu); 3908 if (r < 0) 3909 goto free_vcpu; 3910 3911 return 0; 3912 free_vcpu: 3913 kvm_x86_ops->vcpu_free(vcpu); 3914 return r; 3915 } 3916 3917 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 3918 { 3919 vcpu_load(vcpu); 3920 kvm_mmu_unload(vcpu); 3921 vcpu_put(vcpu); 3922 3923 kvm_x86_ops->vcpu_free(vcpu); 3924 } 3925 3926 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 3927 { 3928 return kvm_x86_ops->vcpu_reset(vcpu); 3929 } 3930 3931 void kvm_arch_hardware_enable(void *garbage) 3932 { 3933 kvm_x86_ops->hardware_enable(garbage); 3934 } 3935 3936 void kvm_arch_hardware_disable(void *garbage) 3937 { 3938 kvm_x86_ops->hardware_disable(garbage); 3939 } 3940 3941 int kvm_arch_hardware_setup(void) 3942 { 3943 return kvm_x86_ops->hardware_setup(); 3944 } 3945 3946 void kvm_arch_hardware_unsetup(void) 3947 { 3948 kvm_x86_ops->hardware_unsetup(); 3949 } 3950 3951 void kvm_arch_check_processor_compat(void *rtn) 3952 { 3953 kvm_x86_ops->check_processor_compatibility(rtn); 3954 } 3955 3956 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 3957 { 3958 struct page *page; 3959 struct kvm *kvm; 3960 int r; 3961 3962 BUG_ON(vcpu->kvm == NULL); 3963 kvm = vcpu->kvm; 3964 3965 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3966 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 3967 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3968 else 3969 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 3970 3971 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3972 if (!page) { 3973 r = -ENOMEM; 3974 goto fail; 3975 } 3976 vcpu->arch.pio_data = page_address(page); 3977 3978 r = kvm_mmu_create(vcpu); 3979 if (r < 0) 3980 goto fail_free_pio_data; 3981 3982 if 
(irqchip_in_kernel(kvm)) { 3983 r = kvm_create_lapic(vcpu); 3984 if (r < 0) 3985 goto fail_mmu_destroy; 3986 } 3987 3988 return 0; 3989 3990 fail_mmu_destroy: 3991 kvm_mmu_destroy(vcpu); 3992 fail_free_pio_data: 3993 free_page((unsigned long)vcpu->arch.pio_data); 3994 fail: 3995 return r; 3996 } 3997 3998 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 3999 { 4000 kvm_free_lapic(vcpu); 4001 down_read(&vcpu->kvm->slots_lock); 4002 kvm_mmu_destroy(vcpu); 4003 up_read(&vcpu->kvm->slots_lock); 4004 free_page((unsigned long)vcpu->arch.pio_data); 4005 } 4006 4007 struct kvm *kvm_arch_create_vm(void) 4008 { 4009 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 4010 4011 if (!kvm) 4012 return ERR_PTR(-ENOMEM); 4013 4014 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4015 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4016 4017 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4018 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 4019 4020 return kvm; 4021 } 4022 4023 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 4024 { 4025 vcpu_load(vcpu); 4026 kvm_mmu_unload(vcpu); 4027 vcpu_put(vcpu); 4028 } 4029 4030 static void kvm_free_vcpus(struct kvm *kvm) 4031 { 4032 unsigned int i; 4033 4034 /* 4035 * Unpin any mmu pages first. 4036 */ 4037 for (i = 0; i < KVM_MAX_VCPUS; ++i) 4038 if (kvm->vcpus[i]) 4039 kvm_unload_vcpu_mmu(kvm->vcpus[i]); 4040 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 4041 if (kvm->vcpus[i]) { 4042 kvm_arch_vcpu_free(kvm->vcpus[i]); 4043 kvm->vcpus[i] = NULL; 4044 } 4045 } 4046 4047 } 4048 4049 void kvm_arch_destroy_vm(struct kvm *kvm) 4050 { 4051 kvm_iommu_unmap_guest(kvm); 4052 kvm_free_all_assigned_devices(kvm); 4053 kvm_free_pit(kvm); 4054 kfree(kvm->arch.vpic); 4055 kfree(kvm->arch.vioapic); 4056 kvm_free_vcpus(kvm); 4057 kvm_free_physmem(kvm); 4058 if (kvm->arch.apic_access_page) 4059 put_page(kvm->arch.apic_access_page); 4060 if (kvm->arch.ept_identity_pagetable) 4061 put_page(kvm->arch.ept_identity_pagetable); 4062 kfree(kvm); 4063 } 4064 4065 int kvm_arch_set_memory_region(struct kvm *kvm, 4066 struct kvm_userspace_memory_region *mem, 4067 struct kvm_memory_slot old, 4068 int user_alloc) 4069 { 4070 int npages = mem->memory_size >> PAGE_SHIFT; 4071 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 4072 4073 /*To keep backward compatibility with older userspace, 4074 *x86 needs to hanlde !user_alloc case. 
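	 *Older userspace did not supply a userspace address for the slot, so
	 *the kernel mmap()s anonymous memory on its behalf below and munmaps
	 *it again when the slot is deleted.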
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	kvm_flush_remote_tlbs(kvm);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
}

static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int ipi_pcpu = vcpu->cpu;
	int cpu = get_cpu();

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}
	/*
	 * We may be called synchronously with irqs disabled in guest mode,
	 * so there is no need to call smp_call_function_single() in that case.
	 */
	if (vcpu->guest_mode && vcpu->cpu != cpu)
		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
	put_cpu();
}
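
/*
 * kvm_vcpu_kick() serves two purposes: it wakes a vcpu that is sleeping in
 * kvm_vcpu_block() (for example after HLT), and, if the vcpu is currently
 * executing guest code on another physical CPU, it sends an IPI whose only
 * effect is to force a VM exit; vcpu_kick_intr() itself is essentially a
 * no-op.  A typical caller pattern (illustrative sketch only, not a quote of
 * an existing call site) is:
 *
 *	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
 *	kvm_vcpu_kick(vcpu);
 *
 * after which the request is noticed either in vcpu_enter_guest() before the
 * next entry or immediately after the forced exit.
 */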