/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/* EFER defaults:
 * - enable syscall per default because its emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
					      u32 function, u32 index);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector;

		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = d->base0 | ((unsigned long)d->base1 << 16) |
		((unsigned long)d->base2 << 24);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;

	if (vcpu->arch.exception.pending) {
		if (vcpu->arch.exception.nr == PF_VECTOR) {
			printk(KERN_DEBUG "kvm: inject_page_fault:"
			       " double fault 0x%lx\n", addr);
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
		}
		return;
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

static void __queue_exception(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
				     vcpu->arch.exception.has_error_code,
				     vcpu->arch.exception.error_code);
}

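/*
 * Editorial note on the exception helpers above: kvm_queue_exception()
 * and kvm_queue_exception_e() record a single pending exception in
 * vcpu->arch.exception, which __queue_exception() later passes to
 * kvm_x86_ops->queue_exception() for delivery.  kvm_inject_page_fault()
 * layers the usual x86 escalation on top of that: a #PF queued while a
 * #PF is already pending is promoted to #DF, and a fault while #DF is
 * pending requests a triple fault (KVM_REQ_TRIPLE_FAULT) instead of
 * queueing anything.
 */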
/* 220 * Load the pae pdptrs. Return true is they are all valid. 221 */ 222 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 223 { 224 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 225 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 226 int i; 227 int ret; 228 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 229 230 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 231 offset * sizeof(u64), sizeof(pdpte)); 232 if (ret < 0) { 233 ret = 0; 234 goto out; 235 } 236 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 237 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 238 ret = 0; 239 goto out; 240 } 241 } 242 ret = 1; 243 244 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 245 out: 246 247 return ret; 248 } 249 EXPORT_SYMBOL_GPL(load_pdptrs); 250 251 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 252 { 253 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 254 bool changed = true; 255 int r; 256 257 if (is_long_mode(vcpu) || !is_pae(vcpu)) 258 return false; 259 260 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 261 if (r < 0) 262 goto out; 263 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 264 out: 265 266 return changed; 267 } 268 269 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 270 { 271 if (cr0 & CR0_RESERVED_BITS) { 272 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 273 cr0, vcpu->arch.cr0); 274 kvm_inject_gp(vcpu, 0); 275 return; 276 } 277 278 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 279 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 280 kvm_inject_gp(vcpu, 0); 281 return; 282 } 283 284 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 285 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 286 "and a clear PE flag\n"); 287 kvm_inject_gp(vcpu, 0); 288 return; 289 } 290 291 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 292 #ifdef CONFIG_X86_64 293 if ((vcpu->arch.shadow_efer & EFER_LME)) { 294 int cs_db, cs_l; 295 296 if (!is_pae(vcpu)) { 297 printk(KERN_DEBUG "set_cr0: #GP, start paging " 298 "in long mode while PAE is disabled\n"); 299 kvm_inject_gp(vcpu, 0); 300 return; 301 } 302 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 303 if (cs_l) { 304 printk(KERN_DEBUG "set_cr0: #GP, start paging " 305 "in long mode while CS.L == 1\n"); 306 kvm_inject_gp(vcpu, 0); 307 return; 308 309 } 310 } else 311 #endif 312 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 313 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 314 "reserved bits\n"); 315 kvm_inject_gp(vcpu, 0); 316 return; 317 } 318 319 } 320 321 kvm_x86_ops->set_cr0(vcpu, cr0); 322 vcpu->arch.cr0 = cr0; 323 324 kvm_mmu_sync_global(vcpu); 325 kvm_mmu_reset_context(vcpu); 326 return; 327 } 328 EXPORT_SYMBOL_GPL(kvm_set_cr0); 329 330 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 331 { 332 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 333 KVMTRACE_1D(LMSW, vcpu, 334 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 335 handler); 336 } 337 EXPORT_SYMBOL_GPL(kvm_lmsw); 338 339 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 340 { 341 if (cr4 & CR4_RESERVED_BITS) { 342 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 343 kvm_inject_gp(vcpu, 0); 344 return; 345 } 346 347 if (is_long_mode(vcpu)) { 348 if (!(cr4 & X86_CR4_PAE)) { 349 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 350 "in long mode\n"); 351 kvm_inject_gp(vcpu, 0); 352 return; 353 } 354 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 355 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 356 printk(KERN_DEBUG "set_cr4: #GP, 
pdptrs reserved bits\n"); 357 kvm_inject_gp(vcpu, 0); 358 return; 359 } 360 361 if (cr4 & X86_CR4_VMXE) { 362 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 363 kvm_inject_gp(vcpu, 0); 364 return; 365 } 366 kvm_x86_ops->set_cr4(vcpu, cr4); 367 vcpu->arch.cr4 = cr4; 368 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 369 kvm_mmu_sync_global(vcpu); 370 kvm_mmu_reset_context(vcpu); 371 } 372 EXPORT_SYMBOL_GPL(kvm_set_cr4); 373 374 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 375 { 376 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 377 kvm_mmu_sync_roots(vcpu); 378 kvm_mmu_flush_tlb(vcpu); 379 return; 380 } 381 382 if (is_long_mode(vcpu)) { 383 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 384 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 385 kvm_inject_gp(vcpu, 0); 386 return; 387 } 388 } else { 389 if (is_pae(vcpu)) { 390 if (cr3 & CR3_PAE_RESERVED_BITS) { 391 printk(KERN_DEBUG 392 "set_cr3: #GP, reserved bits\n"); 393 kvm_inject_gp(vcpu, 0); 394 return; 395 } 396 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 397 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 398 "reserved bits\n"); 399 kvm_inject_gp(vcpu, 0); 400 return; 401 } 402 } 403 /* 404 * We don't check reserved bits in nonpae mode, because 405 * this isn't enforced, and VMware depends on this. 406 */ 407 } 408 409 /* 410 * Does the new cr3 value map to physical memory? (Note, we 411 * catch an invalid cr3 even in real-mode, because it would 412 * cause trouble later on when we turn on paging anyway.) 413 * 414 * A real CPU would silently accept an invalid cr3 and would 415 * attempt to use it - with largely undefined (and often hard 416 * to debug) behavior on the guest side. 417 */ 418 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 419 kvm_inject_gp(vcpu, 0); 420 else { 421 vcpu->arch.cr3 = cr3; 422 vcpu->arch.mmu.new_cr3(vcpu); 423 } 424 } 425 EXPORT_SYMBOL_GPL(kvm_set_cr3); 426 427 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 428 { 429 if (cr8 & CR8_RESERVED_BITS) { 430 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 431 kvm_inject_gp(vcpu, 0); 432 return; 433 } 434 if (irqchip_in_kernel(vcpu->kvm)) 435 kvm_lapic_set_tpr(vcpu, cr8); 436 else 437 vcpu->arch.cr8 = cr8; 438 } 439 EXPORT_SYMBOL_GPL(kvm_set_cr8); 440 441 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 442 { 443 if (irqchip_in_kernel(vcpu->kvm)) 444 return kvm_lapic_get_cr8(vcpu); 445 else 446 return vcpu->arch.cr8; 447 } 448 EXPORT_SYMBOL_GPL(kvm_get_cr8); 449 450 static inline u32 bit(int bitno) 451 { 452 return 1 << (bitno & 31); 453 } 454 455 /* 456 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 457 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 458 * 459 * This list is modified at module load time to reflect the 460 * capabilities of the host cpu. 
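 *
 * (kvm_init_msr_list(), later in this file, performs that filtering:
 * each entry is probed with rdmsr_safe() and MSRs the host cannot
 * read are dropped before num_msrs_to_save is set.)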
461 */ 462 static u32 msrs_to_save[] = { 463 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 464 MSR_K6_STAR, 465 #ifdef CONFIG_X86_64 466 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 467 #endif 468 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 469 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 470 }; 471 472 static unsigned num_msrs_to_save; 473 474 static u32 emulated_msrs[] = { 475 MSR_IA32_MISC_ENABLE, 476 }; 477 478 static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 479 { 480 if (efer & efer_reserved_bits) { 481 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 482 efer); 483 kvm_inject_gp(vcpu, 0); 484 return; 485 } 486 487 if (is_paging(vcpu) 488 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 489 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 490 kvm_inject_gp(vcpu, 0); 491 return; 492 } 493 494 if (efer & EFER_FFXSR) { 495 struct kvm_cpuid_entry2 *feat; 496 497 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 498 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 499 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); 500 kvm_inject_gp(vcpu, 0); 501 return; 502 } 503 } 504 505 if (efer & EFER_SVME) { 506 struct kvm_cpuid_entry2 *feat; 507 508 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 509 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 510 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); 511 kvm_inject_gp(vcpu, 0); 512 return; 513 } 514 } 515 516 kvm_x86_ops->set_efer(vcpu, efer); 517 518 efer &= ~EFER_LMA; 519 efer |= vcpu->arch.shadow_efer & EFER_LMA; 520 521 vcpu->arch.shadow_efer = efer; 522 } 523 524 void kvm_enable_efer_bits(u64 mask) 525 { 526 efer_reserved_bits &= ~mask; 527 } 528 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 529 530 531 /* 532 * Writes msr value into into the appropriate "register". 533 * Returns 0 on success, non-0 otherwise. 534 * Assumes vcpu_load() was already called. 535 */ 536 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 537 { 538 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 539 } 540 541 /* 542 * Adapt set_msr() to msr_io()'s calling convention 543 */ 544 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 545 { 546 return kvm_set_msr(vcpu, index, *data); 547 } 548 549 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 550 { 551 static int version; 552 struct pvclock_wall_clock wc; 553 struct timespec now, sys, boot; 554 555 if (!wall_clock) 556 return; 557 558 version++; 559 560 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 561 562 /* 563 * The guest calculates current wall clock time by adding 564 * system time (updated by kvm_write_guest_time below) to the 565 * wall clock specified here. guest system time equals host 566 * system time for us, thus we must fill in host boot time here. 
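 *
 * Note the version handling around the write below: the version field
 * is bumped before the structure is written and again afterwards,
 * ending on an even value to signal that the update is complete (the
 * same convention kvm_write_guest_time() documents for the per-vcpu
 * clock), so a guest reader can presumably treat an odd or changing
 * version as an update in flight.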
567 */ 568 now = current_kernel_time(); 569 ktime_get_ts(&sys); 570 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 571 572 wc.sec = boot.tv_sec; 573 wc.nsec = boot.tv_nsec; 574 wc.version = version; 575 576 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 577 578 version++; 579 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 580 } 581 582 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 583 { 584 uint32_t quotient, remainder; 585 586 /* Don't try to replace with do_div(), this one calculates 587 * "(dividend << 32) / divisor" */ 588 __asm__ ( "divl %4" 589 : "=a" (quotient), "=d" (remainder) 590 : "0" (0), "1" (dividend), "r" (divisor) ); 591 return quotient; 592 } 593 594 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 595 { 596 uint64_t nsecs = 1000000000LL; 597 int32_t shift = 0; 598 uint64_t tps64; 599 uint32_t tps32; 600 601 tps64 = tsc_khz * 1000LL; 602 while (tps64 > nsecs*2) { 603 tps64 >>= 1; 604 shift--; 605 } 606 607 tps32 = (uint32_t)tps64; 608 while (tps32 <= (uint32_t)nsecs) { 609 tps32 <<= 1; 610 shift++; 611 } 612 613 hv_clock->tsc_shift = shift; 614 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 615 616 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 617 __func__, tsc_khz, hv_clock->tsc_shift, 618 hv_clock->tsc_to_system_mul); 619 } 620 621 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 622 623 static void kvm_write_guest_time(struct kvm_vcpu *v) 624 { 625 struct timespec ts; 626 unsigned long flags; 627 struct kvm_vcpu_arch *vcpu = &v->arch; 628 void *shared_kaddr; 629 630 if ((!vcpu->time_page)) 631 return; 632 633 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { 634 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); 635 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); 636 } 637 638 /* Keep irq disabled to prevent changes to the clock */ 639 local_irq_save(flags); 640 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 641 &vcpu->hv_clock.tsc_timestamp); 642 ktime_get_ts(&ts); 643 local_irq_restore(flags); 644 645 /* With all the info we got, fill in the values */ 646 647 vcpu->hv_clock.system_time = ts.tv_nsec + 648 (NSEC_PER_SEC * (u64)ts.tv_sec); 649 /* 650 * The interface expects us to write an even number signaling that the 651 * update is finished. Since the guest won't see the intermediate 652 * state, we just increase by 2 at the end. 653 */ 654 vcpu->hv_clock.version += 2; 655 656 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 657 658 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 659 sizeof(vcpu->hv_clock)); 660 661 kunmap_atomic(shared_kaddr, KM_USER0); 662 663 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 664 } 665 666 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 667 { 668 struct kvm_vcpu_arch *vcpu = &v->arch; 669 670 if (!vcpu->time_page) 671 return 0; 672 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 673 return 1; 674 } 675 676 static bool msr_mtrr_valid(unsigned msr) 677 { 678 switch (msr) { 679 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 680 case MSR_MTRRfix64K_00000: 681 case MSR_MTRRfix16K_80000: 682 case MSR_MTRRfix16K_A0000: 683 case MSR_MTRRfix4K_C0000: 684 case MSR_MTRRfix4K_C8000: 685 case MSR_MTRRfix4K_D0000: 686 case MSR_MTRRfix4K_D8000: 687 case MSR_MTRRfix4K_E0000: 688 case MSR_MTRRfix4K_E8000: 689 case MSR_MTRRfix4K_F0000: 690 case MSR_MTRRfix4K_F8000: 691 case MSR_MTRRdefType: 692 case MSR_IA32_CR_PAT: 693 return true; 694 case 0x2f8: 695 return true; 696 } 697 return false; 698 } 699 700 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 701 { 702 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 703 704 if (!msr_mtrr_valid(msr)) 705 return 1; 706 707 if (msr == MSR_MTRRdefType) { 708 vcpu->arch.mtrr_state.def_type = data; 709 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 710 } else if (msr == MSR_MTRRfix64K_00000) 711 p[0] = data; 712 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 713 p[1 + msr - MSR_MTRRfix16K_80000] = data; 714 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 715 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 716 else if (msr == MSR_IA32_CR_PAT) 717 vcpu->arch.pat = data; 718 else { /* Variable MTRRs */ 719 int idx, is_mtrr_mask; 720 u64 *pt; 721 722 idx = (msr - 0x200) / 2; 723 is_mtrr_mask = msr - 0x200 - 2 * idx; 724 if (!is_mtrr_mask) 725 pt = 726 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 727 else 728 pt = 729 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 730 *pt = data; 731 } 732 733 kvm_mmu_reset_context(vcpu); 734 return 0; 735 } 736 737 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 738 { 739 switch (msr) { 740 case MSR_EFER: 741 set_efer(vcpu, data); 742 break; 743 case MSR_IA32_MC0_STATUS: 744 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 745 __func__, data); 746 break; 747 case MSR_IA32_MCG_STATUS: 748 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 749 __func__, data); 750 break; 751 case MSR_IA32_MCG_CTL: 752 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 753 __func__, data); 754 break; 755 case MSR_IA32_DEBUGCTLMSR: 756 if (!data) { 757 /* We support the non-activated case already */ 758 break; 759 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 760 /* Values other than LBR and BTF are vendor-specific, 761 thus reserved and should throw a #GP */ 762 return 1; 763 } 764 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 765 __func__, data); 766 break; 767 case MSR_IA32_UCODE_REV: 768 case MSR_IA32_UCODE_WRITE: 769 case MSR_VM_HSAVE_PA: 770 break; 771 case 0x200 ... 0x2ff: 772 return set_msr_mtrr(vcpu, msr, data); 773 case MSR_IA32_APICBASE: 774 kvm_set_apic_base(vcpu, data); 775 break; 776 case MSR_IA32_MISC_ENABLE: 777 vcpu->arch.ia32_misc_enable_msr = data; 778 break; 779 case MSR_KVM_WALL_CLOCK: 780 vcpu->kvm->arch.wall_clock = data; 781 kvm_write_wall_clock(vcpu->kvm, data); 782 break; 783 case MSR_KVM_SYSTEM_TIME: { 784 if (vcpu->arch.time_page) { 785 kvm_release_page_dirty(vcpu->arch.time_page); 786 vcpu->arch.time_page = NULL; 787 } 788 789 vcpu->arch.time = data; 790 791 /* we verify if the enable bit is set... 
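 * (bit 0 of the value written to MSR_KVM_SYSTEM_TIME is the enable
 * bit; the remaining bits are the guest physical address of the
 * pvclock structure, which is why time_offset below masks off
 * PAGE_MASK | 1 and the backing page is looked up from
 * data >> PAGE_SHIFT)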
*/ 792 if (!(data & 1)) 793 break; 794 795 /* ...but clean it before doing the actual write */ 796 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 797 798 vcpu->arch.time_page = 799 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 800 801 if (is_error_page(vcpu->arch.time_page)) { 802 kvm_release_page_clean(vcpu->arch.time_page); 803 vcpu->arch.time_page = NULL; 804 } 805 806 kvm_request_guest_time_update(vcpu); 807 break; 808 } 809 default: 810 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 811 return 1; 812 } 813 return 0; 814 } 815 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 816 817 818 /* 819 * Reads an msr value (of 'msr_index') into 'pdata'. 820 * Returns 0 on success, non-0 otherwise. 821 * Assumes vcpu_load() was already called. 822 */ 823 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 824 { 825 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 826 } 827 828 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 829 { 830 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 831 832 if (!msr_mtrr_valid(msr)) 833 return 1; 834 835 if (msr == MSR_MTRRdefType) 836 *pdata = vcpu->arch.mtrr_state.def_type + 837 (vcpu->arch.mtrr_state.enabled << 10); 838 else if (msr == MSR_MTRRfix64K_00000) 839 *pdata = p[0]; 840 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 841 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 842 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 843 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 844 else if (msr == MSR_IA32_CR_PAT) 845 *pdata = vcpu->arch.pat; 846 else { /* Variable MTRRs */ 847 int idx, is_mtrr_mask; 848 u64 *pt; 849 850 idx = (msr - 0x200) / 2; 851 is_mtrr_mask = msr - 0x200 - 2 * idx; 852 if (!is_mtrr_mask) 853 pt = 854 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 855 else 856 pt = 857 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 858 *pdata = *pt; 859 } 860 861 return 0; 862 } 863 864 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 865 { 866 u64 data; 867 868 switch (msr) { 869 case 0xc0010010: /* SYSCFG */ 870 case 0xc0010015: /* HWCR */ 871 case MSR_IA32_PLATFORM_ID: 872 case MSR_IA32_P5_MC_ADDR: 873 case MSR_IA32_P5_MC_TYPE: 874 case MSR_IA32_MC0_CTL: 875 case MSR_IA32_MCG_STATUS: 876 case MSR_IA32_MCG_CAP: 877 case MSR_IA32_MCG_CTL: 878 case MSR_IA32_MC0_MISC: 879 case MSR_IA32_MC0_MISC+4: 880 case MSR_IA32_MC0_MISC+8: 881 case MSR_IA32_MC0_MISC+12: 882 case MSR_IA32_MC0_MISC+16: 883 case MSR_IA32_MC0_MISC+20: 884 case MSR_IA32_UCODE_REV: 885 case MSR_IA32_EBL_CR_POWERON: 886 case MSR_IA32_DEBUGCTLMSR: 887 case MSR_IA32_LASTBRANCHFROMIP: 888 case MSR_IA32_LASTBRANCHTOIP: 889 case MSR_IA32_LASTINTFROMIP: 890 case MSR_IA32_LASTINTTOIP: 891 case MSR_VM_HSAVE_PA: 892 data = 0; 893 break; 894 case MSR_MTRRcap: 895 data = 0x500 | KVM_NR_VAR_MTRR; 896 break; 897 case 0x200 ... 
0x2ff: 898 return get_msr_mtrr(vcpu, msr, pdata); 899 case 0xcd: /* fsb frequency */ 900 data = 3; 901 break; 902 case MSR_IA32_APICBASE: 903 data = kvm_get_apic_base(vcpu); 904 break; 905 case MSR_IA32_MISC_ENABLE: 906 data = vcpu->arch.ia32_misc_enable_msr; 907 break; 908 case MSR_IA32_PERF_STATUS: 909 /* TSC increment by tick */ 910 data = 1000ULL; 911 /* CPU multiplier */ 912 data |= (((uint64_t)4ULL) << 40); 913 break; 914 case MSR_EFER: 915 data = vcpu->arch.shadow_efer; 916 break; 917 case MSR_KVM_WALL_CLOCK: 918 data = vcpu->kvm->arch.wall_clock; 919 break; 920 case MSR_KVM_SYSTEM_TIME: 921 data = vcpu->arch.time; 922 break; 923 default: 924 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 925 return 1; 926 } 927 *pdata = data; 928 return 0; 929 } 930 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 931 932 /* 933 * Read or write a bunch of msrs. All parameters are kernel addresses. 934 * 935 * @return number of msrs set successfully. 936 */ 937 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 938 struct kvm_msr_entry *entries, 939 int (*do_msr)(struct kvm_vcpu *vcpu, 940 unsigned index, u64 *data)) 941 { 942 int i; 943 944 vcpu_load(vcpu); 945 946 down_read(&vcpu->kvm->slots_lock); 947 for (i = 0; i < msrs->nmsrs; ++i) 948 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 949 break; 950 up_read(&vcpu->kvm->slots_lock); 951 952 vcpu_put(vcpu); 953 954 return i; 955 } 956 957 /* 958 * Read or write a bunch of msrs. Parameters are user addresses. 959 * 960 * @return number of msrs set successfully. 961 */ 962 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 963 int (*do_msr)(struct kvm_vcpu *vcpu, 964 unsigned index, u64 *data), 965 int writeback) 966 { 967 struct kvm_msrs msrs; 968 struct kvm_msr_entry *entries; 969 int r, n; 970 unsigned size; 971 972 r = -EFAULT; 973 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 974 goto out; 975 976 r = -E2BIG; 977 if (msrs.nmsrs >= MAX_IO_MSRS) 978 goto out; 979 980 r = -ENOMEM; 981 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 982 entries = vmalloc(size); 983 if (!entries) 984 goto out; 985 986 r = -EFAULT; 987 if (copy_from_user(entries, user_msrs->entries, size)) 988 goto out_free; 989 990 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 991 if (r < 0) 992 goto out_free; 993 994 r = -EFAULT; 995 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 996 goto out_free; 997 998 r = n; 999 1000 out_free: 1001 vfree(entries); 1002 out: 1003 return r; 1004 } 1005 1006 int kvm_dev_ioctl_check_extension(long ext) 1007 { 1008 int r; 1009 1010 switch (ext) { 1011 case KVM_CAP_IRQCHIP: 1012 case KVM_CAP_HLT: 1013 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1014 case KVM_CAP_SET_TSS_ADDR: 1015 case KVM_CAP_EXT_CPUID: 1016 case KVM_CAP_CLOCKSOURCE: 1017 case KVM_CAP_PIT: 1018 case KVM_CAP_NOP_IO_DELAY: 1019 case KVM_CAP_MP_STATE: 1020 case KVM_CAP_SYNC_MMU: 1021 case KVM_CAP_REINJECT_CONTROL: 1022 case KVM_CAP_IRQ_INJECT_STATUS: 1023 r = 1; 1024 break; 1025 case KVM_CAP_COALESCED_MMIO: 1026 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1027 break; 1028 case KVM_CAP_VAPIC: 1029 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1030 break; 1031 case KVM_CAP_NR_VCPUS: 1032 r = KVM_MAX_VCPUS; 1033 break; 1034 case KVM_CAP_NR_MEMSLOTS: 1035 r = KVM_MEMORY_SLOTS; 1036 break; 1037 case KVM_CAP_PV_MMU: 1038 r = !tdp_enabled; 1039 break; 1040 case KVM_CAP_IOMMU: 1041 r = iommu_found(); 1042 break; 1043 default: 1044 r = 0; 1045 break; 1046 } 1047 return r; 1048 1049 } 1050 1051 long kvm_arch_dev_ioctl(struct file *filp, 1052 
unsigned int ioctl, unsigned long arg) 1053 { 1054 void __user *argp = (void __user *)arg; 1055 long r; 1056 1057 switch (ioctl) { 1058 case KVM_GET_MSR_INDEX_LIST: { 1059 struct kvm_msr_list __user *user_msr_list = argp; 1060 struct kvm_msr_list msr_list; 1061 unsigned n; 1062 1063 r = -EFAULT; 1064 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1065 goto out; 1066 n = msr_list.nmsrs; 1067 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1068 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1069 goto out; 1070 r = -E2BIG; 1071 if (n < num_msrs_to_save) 1072 goto out; 1073 r = -EFAULT; 1074 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1075 num_msrs_to_save * sizeof(u32))) 1076 goto out; 1077 if (copy_to_user(user_msr_list->indices 1078 + num_msrs_to_save * sizeof(u32), 1079 &emulated_msrs, 1080 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1081 goto out; 1082 r = 0; 1083 break; 1084 } 1085 case KVM_GET_SUPPORTED_CPUID: { 1086 struct kvm_cpuid2 __user *cpuid_arg = argp; 1087 struct kvm_cpuid2 cpuid; 1088 1089 r = -EFAULT; 1090 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1091 goto out; 1092 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1093 cpuid_arg->entries); 1094 if (r) 1095 goto out; 1096 1097 r = -EFAULT; 1098 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1099 goto out; 1100 r = 0; 1101 break; 1102 } 1103 default: 1104 r = -EINVAL; 1105 } 1106 out: 1107 return r; 1108 } 1109 1110 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1111 { 1112 kvm_x86_ops->vcpu_load(vcpu, cpu); 1113 kvm_request_guest_time_update(vcpu); 1114 } 1115 1116 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1117 { 1118 kvm_x86_ops->vcpu_put(vcpu); 1119 kvm_put_guest_fpu(vcpu); 1120 } 1121 1122 static int is_efer_nx(void) 1123 { 1124 u64 efer; 1125 1126 rdmsrl(MSR_EFER, efer); 1127 return efer & EFER_NX; 1128 } 1129 1130 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1131 { 1132 int i; 1133 struct kvm_cpuid_entry2 *e, *entry; 1134 1135 entry = NULL; 1136 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1137 e = &vcpu->arch.cpuid_entries[i]; 1138 if (e->function == 0x80000001) { 1139 entry = e; 1140 break; 1141 } 1142 } 1143 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1144 entry->edx &= ~(1 << 20); 1145 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1146 } 1147 } 1148 1149 /* when an old userspace process fills a new kernel module */ 1150 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1151 struct kvm_cpuid *cpuid, 1152 struct kvm_cpuid_entry __user *entries) 1153 { 1154 int r, i; 1155 struct kvm_cpuid_entry *cpuid_entries; 1156 1157 r = -E2BIG; 1158 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1159 goto out; 1160 r = -ENOMEM; 1161 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1162 if (!cpuid_entries) 1163 goto out; 1164 r = -EFAULT; 1165 if (copy_from_user(cpuid_entries, entries, 1166 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1167 goto out_free; 1168 for (i = 0; i < cpuid->nent; i++) { 1169 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1170 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1171 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1172 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1173 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1174 vcpu->arch.cpuid_entries[i].index = 0; 1175 vcpu->arch.cpuid_entries[i].flags = 0; 1176 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1177 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1178 
vcpu->arch.cpuid_entries[i].padding[2] = 0; 1179 } 1180 vcpu->arch.cpuid_nent = cpuid->nent; 1181 cpuid_fix_nx_cap(vcpu); 1182 r = 0; 1183 1184 out_free: 1185 vfree(cpuid_entries); 1186 out: 1187 return r; 1188 } 1189 1190 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1191 struct kvm_cpuid2 *cpuid, 1192 struct kvm_cpuid_entry2 __user *entries) 1193 { 1194 int r; 1195 1196 r = -E2BIG; 1197 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1198 goto out; 1199 r = -EFAULT; 1200 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1201 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1202 goto out; 1203 vcpu->arch.cpuid_nent = cpuid->nent; 1204 return 0; 1205 1206 out: 1207 return r; 1208 } 1209 1210 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1211 struct kvm_cpuid2 *cpuid, 1212 struct kvm_cpuid_entry2 __user *entries) 1213 { 1214 int r; 1215 1216 r = -E2BIG; 1217 if (cpuid->nent < vcpu->arch.cpuid_nent) 1218 goto out; 1219 r = -EFAULT; 1220 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1221 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1222 goto out; 1223 return 0; 1224 1225 out: 1226 cpuid->nent = vcpu->arch.cpuid_nent; 1227 return r; 1228 } 1229 1230 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1231 u32 index) 1232 { 1233 entry->function = function; 1234 entry->index = index; 1235 cpuid_count(entry->function, entry->index, 1236 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1237 entry->flags = 0; 1238 } 1239 1240 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1241 u32 index, int *nent, int maxnent) 1242 { 1243 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1244 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1245 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1246 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1247 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1248 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | 1249 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1250 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | 1251 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | 1252 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); 1253 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | 1254 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1255 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1256 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1257 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1258 bit(X86_FEATURE_PGE) | 1259 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1260 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | 1261 bit(X86_FEATURE_SYSCALL) | 1262 (bit(X86_FEATURE_NX) && is_efer_nx()) | 1263 #ifdef CONFIG_X86_64 1264 bit(X86_FEATURE_LM) | 1265 #endif 1266 bit(X86_FEATURE_FXSR_OPT) | 1267 bit(X86_FEATURE_MMXEXT) | 1268 bit(X86_FEATURE_3DNOWEXT) | 1269 bit(X86_FEATURE_3DNOW); 1270 const u32 kvm_supported_word3_x86_features = 1271 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1272 const u32 kvm_supported_word6_x86_features = 1273 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | 1274 bit(X86_FEATURE_SVM); 1275 1276 /* all calls to cpuid_count() should be made on the same cpu */ 1277 get_cpu(); 1278 do_cpuid_1_ent(entry, function, index); 1279 ++*nent; 1280 1281 switch (function) { 1282 case 0: 1283 entry->eax = min(entry->eax, (u32)0xb); 1284 break; 1285 case 1: 1286 entry->edx &= kvm_supported_word0_x86_features; 1287 entry->ecx &= kvm_supported_word3_x86_features; 1288 break; 1289 /* function 2 entries are STATEFUL. 
That is, repeated cpuid commands 1290 * may return different values. This forces us to get_cpu() before 1291 * issuing the first command, and also to emulate this annoying behavior 1292 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1293 case 2: { 1294 int t, times = entry->eax & 0xff; 1295 1296 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1297 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1298 for (t = 1; t < times && *nent < maxnent; ++t) { 1299 do_cpuid_1_ent(&entry[t], function, 0); 1300 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1301 ++*nent; 1302 } 1303 break; 1304 } 1305 /* function 4 and 0xb have additional index. */ 1306 case 4: { 1307 int i, cache_type; 1308 1309 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1310 /* read more entries until cache_type is zero */ 1311 for (i = 1; *nent < maxnent; ++i) { 1312 cache_type = entry[i - 1].eax & 0x1f; 1313 if (!cache_type) 1314 break; 1315 do_cpuid_1_ent(&entry[i], function, i); 1316 entry[i].flags |= 1317 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1318 ++*nent; 1319 } 1320 break; 1321 } 1322 case 0xb: { 1323 int i, level_type; 1324 1325 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1326 /* read more entries until level_type is zero */ 1327 for (i = 1; *nent < maxnent; ++i) { 1328 level_type = entry[i - 1].ecx & 0xff00; 1329 if (!level_type) 1330 break; 1331 do_cpuid_1_ent(&entry[i], function, i); 1332 entry[i].flags |= 1333 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1334 ++*nent; 1335 } 1336 break; 1337 } 1338 case 0x80000000: 1339 entry->eax = min(entry->eax, 0x8000001a); 1340 break; 1341 case 0x80000001: 1342 entry->edx &= kvm_supported_word1_x86_features; 1343 entry->ecx &= kvm_supported_word6_x86_features; 1344 break; 1345 } 1346 put_cpu(); 1347 } 1348 1349 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1350 struct kvm_cpuid_entry2 __user *entries) 1351 { 1352 struct kvm_cpuid_entry2 *cpuid_entries; 1353 int limit, nent = 0, r = -E2BIG; 1354 u32 func; 1355 1356 if (cpuid->nent < 1) 1357 goto out; 1358 r = -ENOMEM; 1359 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1360 if (!cpuid_entries) 1361 goto out; 1362 1363 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1364 limit = cpuid_entries[0].eax; 1365 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1366 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1367 &nent, cpuid->nent); 1368 r = -E2BIG; 1369 if (nent >= cpuid->nent) 1370 goto out_free; 1371 1372 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1373 limit = cpuid_entries[nent - 1].eax; 1374 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1375 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1376 &nent, cpuid->nent); 1377 r = -EFAULT; 1378 if (copy_to_user(entries, cpuid_entries, 1379 nent * sizeof(struct kvm_cpuid_entry2))) 1380 goto out_free; 1381 cpuid->nent = nent; 1382 r = 0; 1383 1384 out_free: 1385 vfree(cpuid_entries); 1386 out: 1387 return r; 1388 } 1389 1390 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1391 struct kvm_lapic_state *s) 1392 { 1393 vcpu_load(vcpu); 1394 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1395 vcpu_put(vcpu); 1396 1397 return 0; 1398 } 1399 1400 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1401 struct kvm_lapic_state *s) 1402 { 1403 vcpu_load(vcpu); 1404 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1405 kvm_apic_post_state_restore(vcpu); 1406 vcpu_put(vcpu); 1407 1408 return 0; 1409 } 1410 1411 static int kvm_vcpu_ioctl_interrupt(struct 
kvm_vcpu *vcpu, 1412 struct kvm_interrupt *irq) 1413 { 1414 if (irq->irq < 0 || irq->irq >= 256) 1415 return -EINVAL; 1416 if (irqchip_in_kernel(vcpu->kvm)) 1417 return -ENXIO; 1418 vcpu_load(vcpu); 1419 1420 set_bit(irq->irq, vcpu->arch.irq_pending); 1421 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 1422 1423 vcpu_put(vcpu); 1424 1425 return 0; 1426 } 1427 1428 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1429 { 1430 vcpu_load(vcpu); 1431 kvm_inject_nmi(vcpu); 1432 vcpu_put(vcpu); 1433 1434 return 0; 1435 } 1436 1437 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1438 struct kvm_tpr_access_ctl *tac) 1439 { 1440 if (tac->flags) 1441 return -EINVAL; 1442 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1443 return 0; 1444 } 1445 1446 long kvm_arch_vcpu_ioctl(struct file *filp, 1447 unsigned int ioctl, unsigned long arg) 1448 { 1449 struct kvm_vcpu *vcpu = filp->private_data; 1450 void __user *argp = (void __user *)arg; 1451 int r; 1452 struct kvm_lapic_state *lapic = NULL; 1453 1454 switch (ioctl) { 1455 case KVM_GET_LAPIC: { 1456 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1457 1458 r = -ENOMEM; 1459 if (!lapic) 1460 goto out; 1461 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1462 if (r) 1463 goto out; 1464 r = -EFAULT; 1465 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1466 goto out; 1467 r = 0; 1468 break; 1469 } 1470 case KVM_SET_LAPIC: { 1471 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1472 r = -ENOMEM; 1473 if (!lapic) 1474 goto out; 1475 r = -EFAULT; 1476 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1477 goto out; 1478 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1479 if (r) 1480 goto out; 1481 r = 0; 1482 break; 1483 } 1484 case KVM_INTERRUPT: { 1485 struct kvm_interrupt irq; 1486 1487 r = -EFAULT; 1488 if (copy_from_user(&irq, argp, sizeof irq)) 1489 goto out; 1490 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1491 if (r) 1492 goto out; 1493 r = 0; 1494 break; 1495 } 1496 case KVM_NMI: { 1497 r = kvm_vcpu_ioctl_nmi(vcpu); 1498 if (r) 1499 goto out; 1500 r = 0; 1501 break; 1502 } 1503 case KVM_SET_CPUID: { 1504 struct kvm_cpuid __user *cpuid_arg = argp; 1505 struct kvm_cpuid cpuid; 1506 1507 r = -EFAULT; 1508 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1509 goto out; 1510 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1511 if (r) 1512 goto out; 1513 break; 1514 } 1515 case KVM_SET_CPUID2: { 1516 struct kvm_cpuid2 __user *cpuid_arg = argp; 1517 struct kvm_cpuid2 cpuid; 1518 1519 r = -EFAULT; 1520 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1521 goto out; 1522 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1523 cpuid_arg->entries); 1524 if (r) 1525 goto out; 1526 break; 1527 } 1528 case KVM_GET_CPUID2: { 1529 struct kvm_cpuid2 __user *cpuid_arg = argp; 1530 struct kvm_cpuid2 cpuid; 1531 1532 r = -EFAULT; 1533 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1534 goto out; 1535 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1536 cpuid_arg->entries); 1537 if (r) 1538 goto out; 1539 r = -EFAULT; 1540 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1541 goto out; 1542 r = 0; 1543 break; 1544 } 1545 case KVM_GET_MSRS: 1546 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1547 break; 1548 case KVM_SET_MSRS: 1549 r = msr_io(vcpu, argp, do_set_msr, 0); 1550 break; 1551 case KVM_TPR_ACCESS_REPORTING: { 1552 struct kvm_tpr_access_ctl tac; 1553 1554 r = -EFAULT; 1555 if (copy_from_user(&tac, argp, sizeof tac)) 1556 goto out; 1557 r = 
vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1558 if (r) 1559 goto out; 1560 r = -EFAULT; 1561 if (copy_to_user(argp, &tac, sizeof tac)) 1562 goto out; 1563 r = 0; 1564 break; 1565 }; 1566 case KVM_SET_VAPIC_ADDR: { 1567 struct kvm_vapic_addr va; 1568 1569 r = -EINVAL; 1570 if (!irqchip_in_kernel(vcpu->kvm)) 1571 goto out; 1572 r = -EFAULT; 1573 if (copy_from_user(&va, argp, sizeof va)) 1574 goto out; 1575 r = 0; 1576 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1577 break; 1578 } 1579 default: 1580 r = -EINVAL; 1581 } 1582 out: 1583 if (lapic) 1584 kfree(lapic); 1585 return r; 1586 } 1587 1588 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1589 { 1590 int ret; 1591 1592 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1593 return -1; 1594 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1595 return ret; 1596 } 1597 1598 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1599 u32 kvm_nr_mmu_pages) 1600 { 1601 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1602 return -EINVAL; 1603 1604 down_write(&kvm->slots_lock); 1605 1606 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1607 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1608 1609 up_write(&kvm->slots_lock); 1610 return 0; 1611 } 1612 1613 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1614 { 1615 return kvm->arch.n_alloc_mmu_pages; 1616 } 1617 1618 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1619 { 1620 int i; 1621 struct kvm_mem_alias *alias; 1622 1623 for (i = 0; i < kvm->arch.naliases; ++i) { 1624 alias = &kvm->arch.aliases[i]; 1625 if (gfn >= alias->base_gfn 1626 && gfn < alias->base_gfn + alias->npages) 1627 return alias->target_gfn + gfn - alias->base_gfn; 1628 } 1629 return gfn; 1630 } 1631 1632 /* 1633 * Set a new alias region. Aliases map a portion of physical memory into 1634 * another portion. This is useful for memory windows, for example the PC 1635 * VGA region. 
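 *
 * The translation itself is done by unalias_gfn() above, which scans
 * kvm->arch.aliases[0..naliases).  Since an alias change remaps gfns
 * behind the MMU's back, the update below publishes the new entry
 * under mmu_lock and then zaps all shadow pages (kvm_mmu_zap_all()),
 * presumably so stale translations get rebuilt.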
1636 */ 1637 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1638 struct kvm_memory_alias *alias) 1639 { 1640 int r, n; 1641 struct kvm_mem_alias *p; 1642 1643 r = -EINVAL; 1644 /* General sanity checks */ 1645 if (alias->memory_size & (PAGE_SIZE - 1)) 1646 goto out; 1647 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1648 goto out; 1649 if (alias->slot >= KVM_ALIAS_SLOTS) 1650 goto out; 1651 if (alias->guest_phys_addr + alias->memory_size 1652 < alias->guest_phys_addr) 1653 goto out; 1654 if (alias->target_phys_addr + alias->memory_size 1655 < alias->target_phys_addr) 1656 goto out; 1657 1658 down_write(&kvm->slots_lock); 1659 spin_lock(&kvm->mmu_lock); 1660 1661 p = &kvm->arch.aliases[alias->slot]; 1662 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1663 p->npages = alias->memory_size >> PAGE_SHIFT; 1664 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1665 1666 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1667 if (kvm->arch.aliases[n - 1].npages) 1668 break; 1669 kvm->arch.naliases = n; 1670 1671 spin_unlock(&kvm->mmu_lock); 1672 kvm_mmu_zap_all(kvm); 1673 1674 up_write(&kvm->slots_lock); 1675 1676 return 0; 1677 1678 out: 1679 return r; 1680 } 1681 1682 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1683 { 1684 int r; 1685 1686 r = 0; 1687 switch (chip->chip_id) { 1688 case KVM_IRQCHIP_PIC_MASTER: 1689 memcpy(&chip->chip.pic, 1690 &pic_irqchip(kvm)->pics[0], 1691 sizeof(struct kvm_pic_state)); 1692 break; 1693 case KVM_IRQCHIP_PIC_SLAVE: 1694 memcpy(&chip->chip.pic, 1695 &pic_irqchip(kvm)->pics[1], 1696 sizeof(struct kvm_pic_state)); 1697 break; 1698 case KVM_IRQCHIP_IOAPIC: 1699 memcpy(&chip->chip.ioapic, 1700 ioapic_irqchip(kvm), 1701 sizeof(struct kvm_ioapic_state)); 1702 break; 1703 default: 1704 r = -EINVAL; 1705 break; 1706 } 1707 return r; 1708 } 1709 1710 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1711 { 1712 int r; 1713 1714 r = 0; 1715 switch (chip->chip_id) { 1716 case KVM_IRQCHIP_PIC_MASTER: 1717 memcpy(&pic_irqchip(kvm)->pics[0], 1718 &chip->chip.pic, 1719 sizeof(struct kvm_pic_state)); 1720 break; 1721 case KVM_IRQCHIP_PIC_SLAVE: 1722 memcpy(&pic_irqchip(kvm)->pics[1], 1723 &chip->chip.pic, 1724 sizeof(struct kvm_pic_state)); 1725 break; 1726 case KVM_IRQCHIP_IOAPIC: 1727 memcpy(ioapic_irqchip(kvm), 1728 &chip->chip.ioapic, 1729 sizeof(struct kvm_ioapic_state)); 1730 break; 1731 default: 1732 r = -EINVAL; 1733 break; 1734 } 1735 kvm_pic_update_irq(pic_irqchip(kvm)); 1736 return r; 1737 } 1738 1739 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1740 { 1741 int r = 0; 1742 1743 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1744 return r; 1745 } 1746 1747 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1748 { 1749 int r = 0; 1750 1751 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1752 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1753 return r; 1754 } 1755 1756 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 1757 struct kvm_reinject_control *control) 1758 { 1759 if (!kvm->arch.vpit) 1760 return -ENXIO; 1761 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 1762 return 0; 1763 } 1764 1765 /* 1766 * Get (and clear) the dirty memory log for a memory slot. 
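 *
 * If any pages were dirty, write access is removed from the slot's
 * shadow page table entries and remote TLBs are flushed before the
 * dirty bitmap is cleared, so later guest writes fault again and
 * re-mark their pages.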
1767 */ 1768 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1769 struct kvm_dirty_log *log) 1770 { 1771 int r; 1772 int n; 1773 struct kvm_memory_slot *memslot; 1774 int is_dirty = 0; 1775 1776 down_write(&kvm->slots_lock); 1777 1778 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1779 if (r) 1780 goto out; 1781 1782 /* If nothing is dirty, don't bother messing with page tables. */ 1783 if (is_dirty) { 1784 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1785 kvm_flush_remote_tlbs(kvm); 1786 memslot = &kvm->memslots[log->slot]; 1787 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1788 memset(memslot->dirty_bitmap, 0, n); 1789 } 1790 r = 0; 1791 out: 1792 up_write(&kvm->slots_lock); 1793 return r; 1794 } 1795 1796 long kvm_arch_vm_ioctl(struct file *filp, 1797 unsigned int ioctl, unsigned long arg) 1798 { 1799 struct kvm *kvm = filp->private_data; 1800 void __user *argp = (void __user *)arg; 1801 int r = -EINVAL; 1802 /* 1803 * This union makes it completely explicit to gcc-3.x 1804 * that these two variables' stack usage should be 1805 * combined, not added together. 1806 */ 1807 union { 1808 struct kvm_pit_state ps; 1809 struct kvm_memory_alias alias; 1810 } u; 1811 1812 switch (ioctl) { 1813 case KVM_SET_TSS_ADDR: 1814 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1815 if (r < 0) 1816 goto out; 1817 break; 1818 case KVM_SET_MEMORY_REGION: { 1819 struct kvm_memory_region kvm_mem; 1820 struct kvm_userspace_memory_region kvm_userspace_mem; 1821 1822 r = -EFAULT; 1823 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1824 goto out; 1825 kvm_userspace_mem.slot = kvm_mem.slot; 1826 kvm_userspace_mem.flags = kvm_mem.flags; 1827 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1828 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1829 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1830 if (r) 1831 goto out; 1832 break; 1833 } 1834 case KVM_SET_NR_MMU_PAGES: 1835 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1836 if (r) 1837 goto out; 1838 break; 1839 case KVM_GET_NR_MMU_PAGES: 1840 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1841 break; 1842 case KVM_SET_MEMORY_ALIAS: 1843 r = -EFAULT; 1844 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 1845 goto out; 1846 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 1847 if (r) 1848 goto out; 1849 break; 1850 case KVM_CREATE_IRQCHIP: 1851 r = -ENOMEM; 1852 kvm->arch.vpic = kvm_create_pic(kvm); 1853 if (kvm->arch.vpic) { 1854 r = kvm_ioapic_init(kvm); 1855 if (r) { 1856 kfree(kvm->arch.vpic); 1857 kvm->arch.vpic = NULL; 1858 goto out; 1859 } 1860 } else 1861 goto out; 1862 r = kvm_setup_default_irq_routing(kvm); 1863 if (r) { 1864 kfree(kvm->arch.vpic); 1865 kfree(kvm->arch.vioapic); 1866 goto out; 1867 } 1868 break; 1869 case KVM_CREATE_PIT: 1870 mutex_lock(&kvm->lock); 1871 r = -EEXIST; 1872 if (kvm->arch.vpit) 1873 goto create_pit_unlock; 1874 r = -ENOMEM; 1875 kvm->arch.vpit = kvm_create_pit(kvm); 1876 if (kvm->arch.vpit) 1877 r = 0; 1878 create_pit_unlock: 1879 mutex_unlock(&kvm->lock); 1880 break; 1881 case KVM_IRQ_LINE_STATUS: 1882 case KVM_IRQ_LINE: { 1883 struct kvm_irq_level irq_event; 1884 1885 r = -EFAULT; 1886 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1887 goto out; 1888 if (irqchip_in_kernel(kvm)) { 1889 __s32 status; 1890 mutex_lock(&kvm->lock); 1891 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1892 irq_event.irq, irq_event.level); 1893 mutex_unlock(&kvm->lock); 1894 if (ioctl == KVM_IRQ_LINE_STATUS) { 1895 irq_event.status = status; 1896 if (copy_to_user(argp, &irq_event, 
1897 sizeof irq_event)) 1898 goto out; 1899 } 1900 r = 0; 1901 } 1902 break; 1903 } 1904 case KVM_GET_IRQCHIP: { 1905 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1906 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1907 1908 r = -ENOMEM; 1909 if (!chip) 1910 goto out; 1911 r = -EFAULT; 1912 if (copy_from_user(chip, argp, sizeof *chip)) 1913 goto get_irqchip_out; 1914 r = -ENXIO; 1915 if (!irqchip_in_kernel(kvm)) 1916 goto get_irqchip_out; 1917 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 1918 if (r) 1919 goto get_irqchip_out; 1920 r = -EFAULT; 1921 if (copy_to_user(argp, chip, sizeof *chip)) 1922 goto get_irqchip_out; 1923 r = 0; 1924 get_irqchip_out: 1925 kfree(chip); 1926 if (r) 1927 goto out; 1928 break; 1929 } 1930 case KVM_SET_IRQCHIP: { 1931 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1932 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1933 1934 r = -ENOMEM; 1935 if (!chip) 1936 goto out; 1937 r = -EFAULT; 1938 if (copy_from_user(chip, argp, sizeof *chip)) 1939 goto set_irqchip_out; 1940 r = -ENXIO; 1941 if (!irqchip_in_kernel(kvm)) 1942 goto set_irqchip_out; 1943 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 1944 if (r) 1945 goto set_irqchip_out; 1946 r = 0; 1947 set_irqchip_out: 1948 kfree(chip); 1949 if (r) 1950 goto out; 1951 break; 1952 } 1953 case KVM_GET_PIT: { 1954 r = -EFAULT; 1955 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 1956 goto out; 1957 r = -ENXIO; 1958 if (!kvm->arch.vpit) 1959 goto out; 1960 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 1961 if (r) 1962 goto out; 1963 r = -EFAULT; 1964 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 1965 goto out; 1966 r = 0; 1967 break; 1968 } 1969 case KVM_SET_PIT: { 1970 r = -EFAULT; 1971 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 1972 goto out; 1973 r = -ENXIO; 1974 if (!kvm->arch.vpit) 1975 goto out; 1976 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 1977 if (r) 1978 goto out; 1979 r = 0; 1980 break; 1981 } 1982 case KVM_REINJECT_CONTROL: { 1983 struct kvm_reinject_control control; 1984 r = -EFAULT; 1985 if (copy_from_user(&control, argp, sizeof(control))) 1986 goto out; 1987 r = kvm_vm_ioctl_reinject(kvm, &control); 1988 if (r) 1989 goto out; 1990 r = 0; 1991 break; 1992 } 1993 default: 1994 ; 1995 } 1996 out: 1997 return r; 1998 } 1999 2000 static void kvm_init_msr_list(void) 2001 { 2002 u32 dummy[2]; 2003 unsigned i, j; 2004 2005 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2006 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2007 continue; 2008 if (j < i) 2009 msrs_to_save[j] = msrs_to_save[i]; 2010 j++; 2011 } 2012 num_msrs_to_save = j; 2013 } 2014 2015 /* 2016 * Only apic need an MMIO device hook, so shortcut now.. 
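 *
 * (vcpu_find_mmio_dev() below checks this per-vcpu APIC device first
 * and only then falls back to the VM-wide mmio_bus.)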
2017 */ 2018 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 2019 gpa_t addr, int len, 2020 int is_write) 2021 { 2022 struct kvm_io_device *dev; 2023 2024 if (vcpu->arch.apic) { 2025 dev = &vcpu->arch.apic->dev; 2026 if (dev->in_range(dev, addr, len, is_write)) 2027 return dev; 2028 } 2029 return NULL; 2030 } 2031 2032 2033 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 2034 gpa_t addr, int len, 2035 int is_write) 2036 { 2037 struct kvm_io_device *dev; 2038 2039 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 2040 if (dev == NULL) 2041 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, 2042 is_write); 2043 return dev; 2044 } 2045 2046 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2047 struct kvm_vcpu *vcpu) 2048 { 2049 void *data = val; 2050 int r = X86EMUL_CONTINUE; 2051 2052 while (bytes) { 2053 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2054 unsigned offset = addr & (PAGE_SIZE-1); 2055 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2056 int ret; 2057 2058 if (gpa == UNMAPPED_GVA) { 2059 r = X86EMUL_PROPAGATE_FAULT; 2060 goto out; 2061 } 2062 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2063 if (ret < 0) { 2064 r = X86EMUL_UNHANDLEABLE; 2065 goto out; 2066 } 2067 2068 bytes -= toread; 2069 data += toread; 2070 addr += toread; 2071 } 2072 out: 2073 return r; 2074 } 2075 2076 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2077 struct kvm_vcpu *vcpu) 2078 { 2079 void *data = val; 2080 int r = X86EMUL_CONTINUE; 2081 2082 while (bytes) { 2083 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2084 unsigned offset = addr & (PAGE_SIZE-1); 2085 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2086 int ret; 2087 2088 if (gpa == UNMAPPED_GVA) { 2089 r = X86EMUL_PROPAGATE_FAULT; 2090 goto out; 2091 } 2092 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2093 if (ret < 0) { 2094 r = X86EMUL_UNHANDLEABLE; 2095 goto out; 2096 } 2097 2098 bytes -= towrite; 2099 data += towrite; 2100 addr += towrite; 2101 } 2102 out: 2103 return r; 2104 } 2105 2106 2107 static int emulator_read_emulated(unsigned long addr, 2108 void *val, 2109 unsigned int bytes, 2110 struct kvm_vcpu *vcpu) 2111 { 2112 struct kvm_io_device *mmio_dev; 2113 gpa_t gpa; 2114 2115 if (vcpu->mmio_read_completed) { 2116 memcpy(val, vcpu->mmio_data, bytes); 2117 vcpu->mmio_read_completed = 0; 2118 return X86EMUL_CONTINUE; 2119 } 2120 2121 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2122 2123 /* For APIC access vmexit */ 2124 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2125 goto mmio; 2126 2127 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2128 == X86EMUL_CONTINUE) 2129 return X86EMUL_CONTINUE; 2130 if (gpa == UNMAPPED_GVA) 2131 return X86EMUL_PROPAGATE_FAULT; 2132 2133 mmio: 2134 /* 2135 * Is this MMIO handled locally? 
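 *
 * If no in-kernel device claims the address, the access is recorded
 * in vcpu->mmio_* and X86EMUL_UNHANDLEABLE is returned so the MMIO
 * exit can be completed by userspace.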
2136 */ 2137 mutex_lock(&vcpu->kvm->lock); 2138 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2139 if (mmio_dev) { 2140 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 2141 mutex_unlock(&vcpu->kvm->lock); 2142 return X86EMUL_CONTINUE; 2143 } 2144 mutex_unlock(&vcpu->kvm->lock); 2145 2146 vcpu->mmio_needed = 1; 2147 vcpu->mmio_phys_addr = gpa; 2148 vcpu->mmio_size = bytes; 2149 vcpu->mmio_is_write = 0; 2150 2151 return X86EMUL_UNHANDLEABLE; 2152 } 2153 2154 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2155 const void *val, int bytes) 2156 { 2157 int ret; 2158 2159 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2160 if (ret < 0) 2161 return 0; 2162 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2163 return 1; 2164 } 2165 2166 static int emulator_write_emulated_onepage(unsigned long addr, 2167 const void *val, 2168 unsigned int bytes, 2169 struct kvm_vcpu *vcpu) 2170 { 2171 struct kvm_io_device *mmio_dev; 2172 gpa_t gpa; 2173 2174 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2175 2176 if (gpa == UNMAPPED_GVA) { 2177 kvm_inject_page_fault(vcpu, addr, 2); 2178 return X86EMUL_PROPAGATE_FAULT; 2179 } 2180 2181 /* For APIC access vmexit */ 2182 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2183 goto mmio; 2184 2185 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2186 return X86EMUL_CONTINUE; 2187 2188 mmio: 2189 /* 2190 * Is this MMIO handled locally? 2191 */ 2192 mutex_lock(&vcpu->kvm->lock); 2193 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); 2194 if (mmio_dev) { 2195 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2196 mutex_unlock(&vcpu->kvm->lock); 2197 return X86EMUL_CONTINUE; 2198 } 2199 mutex_unlock(&vcpu->kvm->lock); 2200 2201 vcpu->mmio_needed = 1; 2202 vcpu->mmio_phys_addr = gpa; 2203 vcpu->mmio_size = bytes; 2204 vcpu->mmio_is_write = 1; 2205 memcpy(vcpu->mmio_data, val, bytes); 2206 2207 return X86EMUL_CONTINUE; 2208 } 2209 2210 int emulator_write_emulated(unsigned long addr, 2211 const void *val, 2212 unsigned int bytes, 2213 struct kvm_vcpu *vcpu) 2214 { 2215 /* Crossing a page boundary? 
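 * If so, split the write: "now" covers the bytes up to the end of the
 * first page and the remainder is handled by the second onepage call
 * below, since gva->gpa translations are only valid within one page.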
*/ 2216 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2217 int rc, now; 2218 2219 now = -addr & ~PAGE_MASK; 2220 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2221 if (rc != X86EMUL_CONTINUE) 2222 return rc; 2223 addr += now; 2224 val += now; 2225 bytes -= now; 2226 } 2227 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2228 } 2229 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2230 2231 static int emulator_cmpxchg_emulated(unsigned long addr, 2232 const void *old, 2233 const void *new, 2234 unsigned int bytes, 2235 struct kvm_vcpu *vcpu) 2236 { 2237 static int reported; 2238 2239 if (!reported) { 2240 reported = 1; 2241 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 2242 } 2243 #ifndef CONFIG_X86_64 2244 /* guests cmpxchg8b have to be emulated atomically */ 2245 if (bytes == 8) { 2246 gpa_t gpa; 2247 struct page *page; 2248 char *kaddr; 2249 u64 val; 2250 2251 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2252 2253 if (gpa == UNMAPPED_GVA || 2254 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2255 goto emul_write; 2256 2257 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2258 goto emul_write; 2259 2260 val = *(u64 *)new; 2261 2262 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2263 2264 kaddr = kmap_atomic(page, KM_USER0); 2265 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2266 kunmap_atomic(kaddr, KM_USER0); 2267 kvm_release_page_dirty(page); 2268 } 2269 emul_write: 2270 #endif 2271 2272 return emulator_write_emulated(addr, new, bytes, vcpu); 2273 } 2274 2275 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2276 { 2277 return kvm_x86_ops->get_segment_base(vcpu, seg); 2278 } 2279 2280 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2281 { 2282 kvm_mmu_invlpg(vcpu, address); 2283 return X86EMUL_CONTINUE; 2284 } 2285 2286 int emulate_clts(struct kvm_vcpu *vcpu) 2287 { 2288 KVMTRACE_0D(CLTS, vcpu, handler); 2289 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2290 return X86EMUL_CONTINUE; 2291 } 2292 2293 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2294 { 2295 struct kvm_vcpu *vcpu = ctxt->vcpu; 2296 2297 switch (dr) { 2298 case 0 ... 3: 2299 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2300 return X86EMUL_CONTINUE; 2301 default: 2302 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2303 return X86EMUL_UNHANDLEABLE; 2304 } 2305 } 2306 2307 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2308 { 2309 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; 2310 int exception; 2311 2312 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 2313 if (exception) { 2314 /* FIXME: better handling */ 2315 return X86EMUL_UNHANDLEABLE; 2316 } 2317 return X86EMUL_CONTINUE; 2318 } 2319 2320 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2321 { 2322 u8 opcodes[4]; 2323 unsigned long rip = kvm_rip_read(vcpu); 2324 unsigned long rip_linear; 2325 2326 if (!printk_ratelimit()) 2327 return; 2328 2329 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2330 2331 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 2332 2333 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2334 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2335 } 2336 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2337 2338 static struct x86_emulate_ops emulate_ops = { 2339 .read_std = kvm_read_guest_virt, 2340 .read_emulated = emulator_read_emulated, 2341 .write_emulated = emulator_write_emulated, 2342 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2343 }; 2344 2345 static void cache_all_regs(struct kvm_vcpu *vcpu) 2346 { 2347 kvm_register_read(vcpu, VCPU_REGS_RAX); 2348 kvm_register_read(vcpu, VCPU_REGS_RSP); 2349 kvm_register_read(vcpu, VCPU_REGS_RIP); 2350 vcpu->arch.regs_dirty = ~0; 2351 } 2352 2353 int emulate_instruction(struct kvm_vcpu *vcpu, 2354 struct kvm_run *run, 2355 unsigned long cr2, 2356 u16 error_code, 2357 int emulation_type) 2358 { 2359 int r; 2360 struct decode_cache *c; 2361 2362 kvm_clear_exception_queue(vcpu); 2363 vcpu->arch.mmio_fault_cr2 = cr2; 2364 /* 2365 * TODO: fix x86_emulate.c to use guest_read/write_register 2366 * instead of direct ->regs accesses; this can save a few hundred 2367 * cycles on Intel for instructions that don't read/change RSP, 2368 * for example. 2369 */ 2370 cache_all_regs(vcpu); 2371 2372 vcpu->mmio_is_write = 0; 2373 vcpu->arch.pio.string = 0; 2374 2375 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 2376 int cs_db, cs_l; 2377 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 2378 2379 vcpu->arch.emulate_ctxt.vcpu = vcpu; 2380 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 2381 vcpu->arch.emulate_ctxt.mode = 2382 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 2383 ? X86EMUL_MODE_REAL : cs_l 2384 ? X86EMUL_MODE_PROT64 : cs_db 2385 ?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2386 2387 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2388 2389 /* When trying to emulate an invalid opcode, reject all instructions 2390 * other than VMCALL/VMMCALL. */ 2391 c = &vcpu->arch.emulate_ctxt.decode; 2392 if ((emulation_type & EMULTYPE_TRAP_UD) && 2393 (!(c->twobyte && c->b == 0x01 && 2394 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2395 c->modrm_mod == 3 && c->modrm_rm == 1))) 2396 return EMULATE_FAIL; 2397 2398 ++vcpu->stat.insn_emulation; 2399 if (r) { 2400 ++vcpu->stat.insn_emulation_fail; 2401 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2402 return EMULATE_DONE; 2403 return EMULATE_FAIL; 2404 } 2405 } 2406 2407 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2408 2409 if (vcpu->arch.pio.string) 2410 return EMULATE_DO_MMIO; 2411 2412 if ((r || vcpu->mmio_is_write) && run) { 2413 run->exit_reason = KVM_EXIT_MMIO; 2414 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2415 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2416 run->mmio.len = vcpu->mmio_size; 2417 run->mmio.is_write = vcpu->mmio_is_write; 2418 } 2419 2420 if (r) { 2421 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2422 return EMULATE_DONE; 2423 if (!vcpu->mmio_needed) { 2424 kvm_report_emulation_failure(vcpu, "mmio"); 2425 return EMULATE_FAIL; 2426 } 2427 return EMULATE_DO_MMIO; 2428 } 2429 2430 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2431 2432 if (vcpu->mmio_is_write) { 2433 vcpu->mmio_needed = 0; 2434 return EMULATE_DO_MMIO; 2435 } 2436 2437 return EMULATE_DONE; 2438 } 2439 EXPORT_SYMBOL_GPL(emulate_instruction); 2440 2441 static int pio_copy_data(struct kvm_vcpu *vcpu) 2442 { 2443 void *p = vcpu->arch.pio_data; 2444 gva_t q = vcpu->arch.pio.guest_gva; 2445 unsigned bytes; 2446 int ret; 2447 2448 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2449 if (vcpu->arch.pio.in) 2450 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2451 else 2452 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2453 return ret; 2454 } 2455 2456 int complete_pio(struct kvm_vcpu *vcpu) 2457 { 2458 struct kvm_pio_request *io = &vcpu->arch.pio; 2459 long delta; 2460 int r; 2461 unsigned long val; 2462 2463 if (!io->string) { 2464 if (io->in) { 2465 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2466 memcpy(&val, vcpu->arch.pio_data, io->size); 2467 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2468 } 2469 } else { 2470 if (io->in) { 2471 r = pio_copy_data(vcpu); 2472 if (r) 2473 return r; 2474 } 2475 2476 delta = 1; 2477 if (io->rep) { 2478 delta *= io->cur_count; 2479 /* 2480 * The size of the register should really depend on 2481 * current address size.
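 * As written, the full native-width RCX is read below and decremented
 * by the number of iterations performed (cur_count); 16/32-bit
 * address-size overrides are not taken into account.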
2482 */ 2483 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2484 val -= delta; 2485 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2486 } 2487 if (io->down) 2488 delta = -delta; 2489 delta *= io->size; 2490 if (io->in) { 2491 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2492 val += delta; 2493 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2494 } else { 2495 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2496 val += delta; 2497 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2498 } 2499 } 2500 2501 io->count -= io->cur_count; 2502 io->cur_count = 0; 2503 2504 return 0; 2505 } 2506 2507 static void kernel_pio(struct kvm_io_device *pio_dev, 2508 struct kvm_vcpu *vcpu, 2509 void *pd) 2510 { 2511 /* TODO: String I/O for in kernel device */ 2512 2513 mutex_lock(&vcpu->kvm->lock); 2514 if (vcpu->arch.pio.in) 2515 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2516 vcpu->arch.pio.size, 2517 pd); 2518 else 2519 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2520 vcpu->arch.pio.size, 2521 pd); 2522 mutex_unlock(&vcpu->kvm->lock); 2523 } 2524 2525 static void pio_string_write(struct kvm_io_device *pio_dev, 2526 struct kvm_vcpu *vcpu) 2527 { 2528 struct kvm_pio_request *io = &vcpu->arch.pio; 2529 void *pd = vcpu->arch.pio_data; 2530 int i; 2531 2532 mutex_lock(&vcpu->kvm->lock); 2533 for (i = 0; i < io->cur_count; i++) { 2534 kvm_iodevice_write(pio_dev, io->port, 2535 io->size, 2536 pd); 2537 pd += io->size; 2538 } 2539 mutex_unlock(&vcpu->kvm->lock); 2540 } 2541 2542 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2543 gpa_t addr, int len, 2544 int is_write) 2545 { 2546 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); 2547 } 2548 2549 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2550 int size, unsigned port) 2551 { 2552 struct kvm_io_device *pio_dev; 2553 unsigned long val; 2554 2555 vcpu->run->exit_reason = KVM_EXIT_IO; 2556 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2557 vcpu->run->io.size = vcpu->arch.pio.size = size; 2558 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2559 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2560 vcpu->run->io.port = vcpu->arch.pio.port = port; 2561 vcpu->arch.pio.in = in; 2562 vcpu->arch.pio.string = 0; 2563 vcpu->arch.pio.down = 0; 2564 vcpu->arch.pio.rep = 0; 2565 2566 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2567 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2568 handler); 2569 else 2570 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2571 handler); 2572 2573 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2574 memcpy(vcpu->arch.pio_data, &val, 4); 2575 2576 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2577 if (pio_dev) { 2578 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2579 complete_pio(vcpu); 2580 return 1; 2581 } 2582 return 0; 2583 } 2584 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2585 2586 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2587 int size, unsigned long count, int down, 2588 gva_t address, int rep, unsigned port) 2589 { 2590 unsigned now, in_page; 2591 int ret = 0; 2592 struct kvm_io_device *pio_dev; 2593 2594 vcpu->run->exit_reason = KVM_EXIT_IO; 2595 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2596 vcpu->run->io.size = vcpu->arch.pio.size = size; 2597 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2598 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2599 vcpu->run->io.port = vcpu->arch.pio.port = port; 2600 vcpu->arch.pio.in = in; 2601 vcpu->arch.pio.string = 1; 2602 vcpu->arch.pio.down = down; 2603 vcpu->arch.pio.rep = rep; 2604 2605 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2606 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2607 handler); 2608 else 2609 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2610 handler); 2611 2612 if (!count) { 2613 kvm_x86_ops->skip_emulated_instruction(vcpu); 2614 return 1; 2615 } 2616 2617 if (!down) 2618 in_page = PAGE_SIZE - offset_in_page(address); 2619 else 2620 in_page = offset_in_page(address) + size; 2621 now = min(count, (unsigned long)in_page / size); 2622 if (!now) 2623 now = 1; 2624 if (down) { 2625 /* 2626 * String I/O in reverse. Yuck. Kill the guest, fix later. 2627 */ 2628 pr_unimpl(vcpu, "guest string pio down\n"); 2629 kvm_inject_gp(vcpu, 0); 2630 return 1; 2631 } 2632 vcpu->run->io.count = now; 2633 vcpu->arch.pio.cur_count = now; 2634 2635 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2636 kvm_x86_ops->skip_emulated_instruction(vcpu); 2637 2638 vcpu->arch.pio.guest_gva = address; 2639 2640 pio_dev = vcpu_find_pio_dev(vcpu, port, 2641 vcpu->arch.pio.cur_count, 2642 !vcpu->arch.pio.in); 2643 if (!vcpu->arch.pio.in) { 2644 /* string PIO write */ 2645 ret = pio_copy_data(vcpu); 2646 if (ret == X86EMUL_PROPAGATE_FAULT) { 2647 kvm_inject_gp(vcpu, 0); 2648 return 1; 2649 } 2650 if (ret == 0 && pio_dev) { 2651 pio_string_write(pio_dev, vcpu); 2652 complete_pio(vcpu); 2653 if (vcpu->arch.pio.count == 0) 2654 ret = 1; 2655 } 2656 } else if (pio_dev) 2657 pr_unimpl(vcpu, "no string pio read support yet, " 2658 "port %x size %d count %ld\n", 2659 port, size, count); 2660 2661 return ret; 2662 } 2663 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2664 2665 static void bounce_off(void *info) 2666 { 2667 /* nothing */ 2668 } 2669 2670 static unsigned int ref_freq; 2671 static unsigned long tsc_khz_ref; 2672 2673 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 2674 void *data) 2675 { 2676 struct cpufreq_freqs *freq = data; 2677 struct kvm *kvm; 2678 struct kvm_vcpu *vcpu; 2679 int i, send_ipi = 0; 2680 2681 if (!ref_freq) 2682 ref_freq = freq->old; 2683 2684 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 2685 return 0; 2686 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 2687 return 0; 2688 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 2689 2690 spin_lock(&kvm_lock); 2691 list_for_each_entry(kvm, &vm_list, vm_list) { 2692 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2693 vcpu = kvm->vcpus[i]; 2694 if (!vcpu) 2695 continue; 2696 if (vcpu->cpu != freq->cpu) 2697 continue; 2698 if (!kvm_request_guest_time_update(vcpu)) 2699 continue; 2700 if (vcpu->cpu != smp_processor_id()) 2701 send_ipi++; 2702 } 2703 } 2704 spin_unlock(&kvm_lock); 2705 2706 if (freq->old < freq->new && send_ipi) { 2707 /* 2708 * We upscale the frequency. We must make sure the guest 2709 * doesn't see old kvmclock values while running with 2710 * the new frequency; otherwise we risk the guest seeing 2711 * time go backwards. 2712 * 2713 * If we update the frequency for another cpu 2714 * (which might be in guest context), send an interrupt 2715 * to kick the cpu out of guest context.
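 * (bounce_off() itself does nothing; the only purpose of the IPI is
 * to force a VM exit on that cpu.)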
Next time 2716 * guest context is entered kvmclock will be updated, 2717 * so the guest will not see stale values. 2718 */ 2719 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 2720 } 2721 return 0; 2722 } 2723 2724 static struct notifier_block kvmclock_cpufreq_notifier_block = { 2725 .notifier_call = kvmclock_cpufreq_notifier 2726 }; 2727 2728 int kvm_arch_init(void *opaque) 2729 { 2730 int r, cpu; 2731 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2732 2733 if (kvm_x86_ops) { 2734 printk(KERN_ERR "kvm: already loaded the other module\n"); 2735 r = -EEXIST; 2736 goto out; 2737 } 2738 2739 if (!ops->cpu_has_kvm_support()) { 2740 printk(KERN_ERR "kvm: no hardware support\n"); 2741 r = -EOPNOTSUPP; 2742 goto out; 2743 } 2744 if (ops->disabled_by_bios()) { 2745 printk(KERN_ERR "kvm: disabled by bios\n"); 2746 r = -EOPNOTSUPP; 2747 goto out; 2748 } 2749 2750 r = kvm_mmu_module_init(); 2751 if (r) 2752 goto out; 2753 2754 kvm_init_msr_list(); 2755 2756 kvm_x86_ops = ops; 2757 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2758 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2759 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2760 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2761 2762 for_each_possible_cpu(cpu) 2763 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2764 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 2765 tsc_khz_ref = tsc_khz; 2766 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 2767 CPUFREQ_TRANSITION_NOTIFIER); 2768 } 2769 2770 return 0; 2771 2772 out: 2773 return r; 2774 } 2775 2776 void kvm_arch_exit(void) 2777 { 2778 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 2779 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 2780 CPUFREQ_TRANSITION_NOTIFIER); 2781 kvm_x86_ops = NULL; 2782 kvm_mmu_module_exit(); 2783 } 2784 2785 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2786 { 2787 ++vcpu->stat.halt_exits; 2788 KVMTRACE_0D(HLT, vcpu, handler); 2789 if (irqchip_in_kernel(vcpu->kvm)) { 2790 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2791 return 1; 2792 } else { 2793 vcpu->run->exit_reason = KVM_EXIT_HLT; 2794 return 0; 2795 } 2796 } 2797 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2798 2799 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2800 unsigned long a1) 2801 { 2802 if (is_long_mode(vcpu)) 2803 return a0; 2804 else 2805 return a0 | ((gpa_t)a1 << 32); 2806 } 2807 2808 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2809 { 2810 unsigned long nr, a0, a1, a2, a3, ret; 2811 int r = 1; 2812 2813 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 2814 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 2815 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 2816 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 2817 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 2818 2819 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2820 2821 if (!is_long_mode(vcpu)) { 2822 nr &= 0xFFFFFFFF; 2823 a0 &= 0xFFFFFFFF; 2824 a1 &= 0xFFFFFFFF; 2825 a2 &= 0xFFFFFFFF; 2826 a3 &= 0xFFFFFFFF; 2827 } 2828 2829 switch (nr) { 2830 case KVM_HC_VAPIC_POLL_IRQ: 2831 ret = 0; 2832 break; 2833 case KVM_HC_MMU_OP: 2834 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2835 break; 2836 default: 2837 ret = -KVM_ENOSYS; 2838 break; 2839 } 2840 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 2841 ++vcpu->stat.hypercalls; 2842 return r; 2843 } 2844 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2845 2846 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2847 { 2848 char instruction[3]; 2849 int ret = 0; 2850 unsigned long rip = kvm_rip_read(vcpu); 2851 2852 2853 /* 2854 * Blow out the MMU to ensure that no other VCPU has an 
active mapping 2855 * to ensure that the updated hypercall appears atomically across all 2856 * VCPUs. 2857 */ 2858 kvm_mmu_zap_all(vcpu->kvm); 2859 2860 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2861 if (emulator_write_emulated(rip, instruction, 3, vcpu) 2862 != X86EMUL_CONTINUE) 2863 ret = -EFAULT; 2864 2865 return ret; 2866 } 2867 2868 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2869 { 2870 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2871 } 2872 2873 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2874 { 2875 struct descriptor_table dt = { limit, base }; 2876 2877 kvm_x86_ops->set_gdt(vcpu, &dt); 2878 } 2879 2880 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2881 { 2882 struct descriptor_table dt = { limit, base }; 2883 2884 kvm_x86_ops->set_idt(vcpu, &dt); 2885 } 2886 2887 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2888 unsigned long *rflags) 2889 { 2890 kvm_lmsw(vcpu, msw); 2891 *rflags = kvm_x86_ops->get_rflags(vcpu); 2892 } 2893 2894 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2895 { 2896 unsigned long value; 2897 2898 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2899 switch (cr) { 2900 case 0: 2901 value = vcpu->arch.cr0; 2902 break; 2903 case 2: 2904 value = vcpu->arch.cr2; 2905 break; 2906 case 3: 2907 value = vcpu->arch.cr3; 2908 break; 2909 case 4: 2910 value = vcpu->arch.cr4; 2911 break; 2912 case 8: 2913 value = kvm_get_cr8(vcpu); 2914 break; 2915 default: 2916 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2917 return 0; 2918 } 2919 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, 2920 (u32)((u64)value >> 32), handler); 2921 2922 return value; 2923 } 2924 2925 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2926 unsigned long *rflags) 2927 { 2928 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, 2929 (u32)((u64)val >> 32), handler); 2930 2931 switch (cr) { 2932 case 0: 2933 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2934 *rflags = kvm_x86_ops->get_rflags(vcpu); 2935 break; 2936 case 2: 2937 vcpu->arch.cr2 = val; 2938 break; 2939 case 3: 2940 kvm_set_cr3(vcpu, val); 2941 break; 2942 case 4: 2943 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2944 break; 2945 case 8: 2946 kvm_set_cr8(vcpu, val & 0xfUL); 2947 break; 2948 default: 2949 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2950 } 2951 } 2952 2953 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 2954 { 2955 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 2956 int j, nent = vcpu->arch.cpuid_nent; 2957 2958 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2959 /* when no next entry is found, the current entry[i] is reselected */ 2960 for (j = i + 1; ; j = (j + 1) % nent) { 2961 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2962 if (ej->function == e->function) { 2963 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2964 return j; 2965 } 2966 } 2967 return 0; /* silence gcc, even though control never reaches here */ 2968 } 2969 2970 /* find an entry with matching function, matching index (if needed), and that 2971 * should be read next (if it's stateful) */ 2972 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 2973 u32 function, u32 index) 2974 { 2975 if (e->function != function) 2976 return 0; 2977 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 2978 return 0; 2979 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 2980 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 2981 return 0; 2982 return 1; 
2983 } 2984 2985 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 2986 u32 function, u32 index) 2987 { 2988 int i; 2989 struct kvm_cpuid_entry2 *best = NULL; 2990 2991 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2992 struct kvm_cpuid_entry2 *e; 2993 2994 e = &vcpu->arch.cpuid_entries[i]; 2995 if (is_matching_cpuid_entry(e, function, index)) { 2996 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 2997 move_to_next_stateful_cpuid_entry(vcpu, i); 2998 best = e; 2999 break; 3000 } 3001 /* 3002 * Both basic or both extended? 3003 */ 3004 if (((e->function ^ function) & 0x80000000) == 0) 3005 if (!best || e->function > best->function) 3006 best = e; 3007 } 3008 return best; 3009 } 3010 3011 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3012 { 3013 u32 function, index; 3014 struct kvm_cpuid_entry2 *best; 3015 3016 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3017 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3018 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3019 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3020 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3021 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3022 best = kvm_find_cpuid_entry(vcpu, function, index); 3023 if (best) { 3024 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3025 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3026 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3027 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3028 } 3029 kvm_x86_ops->skip_emulated_instruction(vcpu); 3030 KVMTRACE_5D(CPUID, vcpu, function, 3031 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), 3032 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), 3033 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), 3034 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); 3035 } 3036 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3037 3038 /* 3039 * Check if userspace requested an interrupt window, and that the 3040 * interrupt window is open. 3041 * 3042 * No need to exit to userspace if we already have an interrupt queued. 
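 * dm_request_for_irq_injection() therefore returns true only when
 * irq_summary is clear, userspace set request_interrupt_window, the
 * interrupt window is open and EFLAGS.IF is set; __vcpu_run() then
 * exits to userspace with KVM_EXIT_INTR.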
3043 */ 3044 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3045 struct kvm_run *kvm_run) 3046 { 3047 return (!vcpu->arch.irq_summary && 3048 kvm_run->request_interrupt_window && 3049 vcpu->arch.interrupt_window_open && 3050 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 3051 } 3052 3053 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3054 struct kvm_run *kvm_run) 3055 { 3056 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3057 kvm_run->cr8 = kvm_get_cr8(vcpu); 3058 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3059 if (irqchip_in_kernel(vcpu->kvm)) 3060 kvm_run->ready_for_interrupt_injection = 1; 3061 else 3062 kvm_run->ready_for_interrupt_injection = 3063 (vcpu->arch.interrupt_window_open && 3064 vcpu->arch.irq_summary == 0); 3065 } 3066 3067 static void vapic_enter(struct kvm_vcpu *vcpu) 3068 { 3069 struct kvm_lapic *apic = vcpu->arch.apic; 3070 struct page *page; 3071 3072 if (!apic || !apic->vapic_addr) 3073 return; 3074 3075 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3076 3077 vcpu->arch.apic->vapic_page = page; 3078 } 3079 3080 static void vapic_exit(struct kvm_vcpu *vcpu) 3081 { 3082 struct kvm_lapic *apic = vcpu->arch.apic; 3083 3084 if (!apic || !apic->vapic_addr) 3085 return; 3086 3087 down_read(&vcpu->kvm->slots_lock); 3088 kvm_release_page_dirty(apic->vapic_page); 3089 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3090 up_read(&vcpu->kvm->slots_lock); 3091 } 3092 3093 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3094 { 3095 int r; 3096 3097 if (vcpu->requests) 3098 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3099 kvm_mmu_unload(vcpu); 3100 3101 r = kvm_mmu_reload(vcpu); 3102 if (unlikely(r)) 3103 goto out; 3104 3105 if (vcpu->requests) { 3106 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3107 __kvm_migrate_timers(vcpu); 3108 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3109 kvm_write_guest_time(vcpu); 3110 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3111 kvm_mmu_sync_roots(vcpu); 3112 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3113 kvm_x86_ops->tlb_flush(vcpu); 3114 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3115 &vcpu->requests)) { 3116 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3117 r = 0; 3118 goto out; 3119 } 3120 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3121 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3122 r = 0; 3123 goto out; 3124 } 3125 } 3126 3127 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3128 kvm_inject_pending_timer_irqs(vcpu); 3129 3130 preempt_disable(); 3131 3132 kvm_x86_ops->prepare_guest_switch(vcpu); 3133 kvm_load_guest_fpu(vcpu); 3134 3135 local_irq_disable(); 3136 3137 if (vcpu->requests || need_resched() || signal_pending(current)) { 3138 local_irq_enable(); 3139 preempt_enable(); 3140 r = 1; 3141 goto out; 3142 } 3143 3144 vcpu->guest_mode = 1; 3145 /* 3146 * Make sure that guest_mode assignment won't happen after 3147 * testing the pending IRQ vector bitmap. 
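 * The smp_wmb() below provides that ordering; guest_mode is what
 * other CPUs check (e.g. when kicking a vcpu) to decide whether an
 * IPI is needed to force this vcpu out of guest mode.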
3148 */ 3149 smp_wmb(); 3150 3151 if (vcpu->arch.exception.pending) 3152 __queue_exception(vcpu); 3153 else if (irqchip_in_kernel(vcpu->kvm)) 3154 kvm_x86_ops->inject_pending_irq(vcpu); 3155 else 3156 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 3157 3158 kvm_lapic_sync_to_vapic(vcpu); 3159 3160 up_read(&vcpu->kvm->slots_lock); 3161 3162 kvm_guest_enter(); 3163 3164 get_debugreg(vcpu->arch.host_dr6, 6); 3165 get_debugreg(vcpu->arch.host_dr7, 7); 3166 if (unlikely(vcpu->arch.switch_db_regs)) { 3167 get_debugreg(vcpu->arch.host_db[0], 0); 3168 get_debugreg(vcpu->arch.host_db[1], 1); 3169 get_debugreg(vcpu->arch.host_db[2], 2); 3170 get_debugreg(vcpu->arch.host_db[3], 3); 3171 3172 set_debugreg(0, 7); 3173 set_debugreg(vcpu->arch.eff_db[0], 0); 3174 set_debugreg(vcpu->arch.eff_db[1], 1); 3175 set_debugreg(vcpu->arch.eff_db[2], 2); 3176 set_debugreg(vcpu->arch.eff_db[3], 3); 3177 } 3178 3179 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3180 kvm_x86_ops->run(vcpu, kvm_run); 3181 3182 if (unlikely(vcpu->arch.switch_db_regs)) { 3183 set_debugreg(0, 7); 3184 set_debugreg(vcpu->arch.host_db[0], 0); 3185 set_debugreg(vcpu->arch.host_db[1], 1); 3186 set_debugreg(vcpu->arch.host_db[2], 2); 3187 set_debugreg(vcpu->arch.host_db[3], 3); 3188 } 3189 set_debugreg(vcpu->arch.host_dr6, 6); 3190 set_debugreg(vcpu->arch.host_dr7, 7); 3191 3192 vcpu->guest_mode = 0; 3193 local_irq_enable(); 3194 3195 ++vcpu->stat.exits; 3196 3197 /* 3198 * We must have an instruction between local_irq_enable() and 3199 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3200 * the interrupt shadow. The stat.exits increment will do nicely. 3201 * But we need to prevent reordering, hence this barrier(): 3202 */ 3203 barrier(); 3204 3205 kvm_guest_exit(); 3206 3207 preempt_enable(); 3208 3209 down_read(&vcpu->kvm->slots_lock); 3210 3211 /* 3212 * Profile KVM exit RIPs: 3213 */ 3214 if (unlikely(prof_on == KVM_PROFILING)) { 3215 unsigned long rip = kvm_rip_read(vcpu); 3216 profile_hit(KVM_PROFILING, (void *)rip); 3217 } 3218 3219 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 3220 vcpu->arch.exception.pending = false; 3221 3222 kvm_lapic_sync_from_vapic(vcpu); 3223 3224 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3225 out: 3226 return r; 3227 } 3228 3229 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3230 { 3231 int r; 3232 3233 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3234 pr_debug("vcpu %d received sipi with vector # %x\n", 3235 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3236 kvm_lapic_reset(vcpu); 3237 r = kvm_arch_vcpu_reset(vcpu); 3238 if (r) 3239 return r; 3240 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3241 } 3242 3243 down_read(&vcpu->kvm->slots_lock); 3244 vapic_enter(vcpu); 3245 3246 r = 1; 3247 while (r > 0) { 3248 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3249 r = vcpu_enter_guest(vcpu, kvm_run); 3250 else { 3251 up_read(&vcpu->kvm->slots_lock); 3252 kvm_vcpu_block(vcpu); 3253 down_read(&vcpu->kvm->slots_lock); 3254 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3255 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3256 vcpu->arch.mp_state = 3257 KVM_MP_STATE_RUNNABLE; 3258 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3259 r = -EINTR; 3260 } 3261 3262 if (r > 0) { 3263 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3264 r = -EINTR; 3265 kvm_run->exit_reason = KVM_EXIT_INTR; 3266 ++vcpu->stat.request_irq_exits; 3267 } 3268 if (signal_pending(current)) { 3269 r = -EINTR; 3270 kvm_run->exit_reason = 
KVM_EXIT_INTR; 3271 ++vcpu->stat.signal_exits; 3272 } 3273 if (need_resched()) { 3274 up_read(&vcpu->kvm->slots_lock); 3275 kvm_resched(vcpu); 3276 down_read(&vcpu->kvm->slots_lock); 3277 } 3278 } 3279 } 3280 3281 up_read(&vcpu->kvm->slots_lock); 3282 post_kvm_run_save(vcpu, kvm_run); 3283 3284 vapic_exit(vcpu); 3285 3286 return r; 3287 } 3288 3289 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3290 { 3291 int r; 3292 sigset_t sigsaved; 3293 3294 vcpu_load(vcpu); 3295 3296 if (vcpu->sigset_active) 3297 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3298 3299 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3300 kvm_vcpu_block(vcpu); 3301 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3302 r = -EAGAIN; 3303 goto out; 3304 } 3305 3306 /* re-sync apic's tpr */ 3307 if (!irqchip_in_kernel(vcpu->kvm)) 3308 kvm_set_cr8(vcpu, kvm_run->cr8); 3309 3310 if (vcpu->arch.pio.cur_count) { 3311 r = complete_pio(vcpu); 3312 if (r) 3313 goto out; 3314 } 3315 #if CONFIG_HAS_IOMEM 3316 if (vcpu->mmio_needed) { 3317 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3318 vcpu->mmio_read_completed = 1; 3319 vcpu->mmio_needed = 0; 3320 3321 down_read(&vcpu->kvm->slots_lock); 3322 r = emulate_instruction(vcpu, kvm_run, 3323 vcpu->arch.mmio_fault_cr2, 0, 3324 EMULTYPE_NO_DECODE); 3325 up_read(&vcpu->kvm->slots_lock); 3326 if (r == EMULATE_DO_MMIO) { 3327 /* 3328 * Read-modify-write. Back to userspace. 3329 */ 3330 r = 0; 3331 goto out; 3332 } 3333 } 3334 #endif 3335 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3336 kvm_register_write(vcpu, VCPU_REGS_RAX, 3337 kvm_run->hypercall.ret); 3338 3339 r = __vcpu_run(vcpu, kvm_run); 3340 3341 out: 3342 if (vcpu->sigset_active) 3343 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3344 3345 vcpu_put(vcpu); 3346 return r; 3347 } 3348 3349 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3350 { 3351 vcpu_load(vcpu); 3352 3353 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3354 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3355 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3356 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3357 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3358 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3359 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3360 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3361 #ifdef CONFIG_X86_64 3362 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3363 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3364 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3365 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3366 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3367 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3368 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3369 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3370 #endif 3371 3372 regs->rip = kvm_rip_read(vcpu); 3373 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3374 3375 /* 3376 * Don't leak debug flags in case they were set for guest debugging 3377 */ 3378 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3379 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3380 3381 vcpu_put(vcpu); 3382 3383 return 0; 3384 } 3385 3386 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3387 { 3388 vcpu_load(vcpu); 3389 3390 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3391 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3392 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3393 
kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3394 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3395 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3396 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3397 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3398 #ifdef CONFIG_X86_64 3399 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3400 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3401 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3402 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3403 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3404 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3405 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3406 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3407 3408 #endif 3409 3410 kvm_rip_write(vcpu, regs->rip); 3411 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3412 3413 3414 vcpu->arch.exception.pending = false; 3415 3416 vcpu_put(vcpu); 3417 3418 return 0; 3419 } 3420 3421 void kvm_get_segment(struct kvm_vcpu *vcpu, 3422 struct kvm_segment *var, int seg) 3423 { 3424 kvm_x86_ops->get_segment(vcpu, var, seg); 3425 } 3426 3427 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3428 { 3429 struct kvm_segment cs; 3430 3431 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3432 *db = cs.db; 3433 *l = cs.l; 3434 } 3435 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3436 3437 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3438 struct kvm_sregs *sregs) 3439 { 3440 struct descriptor_table dt; 3441 int pending_vec; 3442 3443 vcpu_load(vcpu); 3444 3445 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3446 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3447 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3448 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3449 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3450 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3451 3452 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3453 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3454 3455 kvm_x86_ops->get_idt(vcpu, &dt); 3456 sregs->idt.limit = dt.limit; 3457 sregs->idt.base = dt.base; 3458 kvm_x86_ops->get_gdt(vcpu, &dt); 3459 sregs->gdt.limit = dt.limit; 3460 sregs->gdt.base = dt.base; 3461 3462 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3463 sregs->cr0 = vcpu->arch.cr0; 3464 sregs->cr2 = vcpu->arch.cr2; 3465 sregs->cr3 = vcpu->arch.cr3; 3466 sregs->cr4 = vcpu->arch.cr4; 3467 sregs->cr8 = kvm_get_cr8(vcpu); 3468 sregs->efer = vcpu->arch.shadow_efer; 3469 sregs->apic_base = kvm_get_apic_base(vcpu); 3470 3471 if (irqchip_in_kernel(vcpu->kvm)) { 3472 memset(sregs->interrupt_bitmap, 0, 3473 sizeof sregs->interrupt_bitmap); 3474 pending_vec = kvm_x86_ops->get_irq(vcpu); 3475 if (pending_vec >= 0) 3476 set_bit(pending_vec, 3477 (unsigned long *)sregs->interrupt_bitmap); 3478 } else 3479 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, 3480 sizeof sregs->interrupt_bitmap); 3481 3482 vcpu_put(vcpu); 3483 3484 return 0; 3485 } 3486 3487 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3488 struct kvm_mp_state *mp_state) 3489 { 3490 vcpu_load(vcpu); 3491 mp_state->mp_state = vcpu->arch.mp_state; 3492 vcpu_put(vcpu); 3493 return 0; 3494 } 3495 3496 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3497 struct kvm_mp_state *mp_state) 3498 { 3499 vcpu_load(vcpu); 3500 vcpu->arch.mp_state = mp_state->mp_state; 3501 vcpu_put(vcpu); 3502 return 0; 3503 } 3504 3505 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3506 struct kvm_segment *var, int seg) 3507 { 3508 
kvm_x86_ops->set_segment(vcpu, var, seg); 3509 } 3510 3511 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3512 struct kvm_segment *kvm_desct) 3513 { 3514 kvm_desct->base = seg_desc->base0; 3515 kvm_desct->base |= seg_desc->base1 << 16; 3516 kvm_desct->base |= seg_desc->base2 << 24; 3517 kvm_desct->limit = seg_desc->limit0; 3518 kvm_desct->limit |= seg_desc->limit << 16; 3519 if (seg_desc->g) { 3520 kvm_desct->limit <<= 12; 3521 kvm_desct->limit |= 0xfff; 3522 } 3523 kvm_desct->selector = selector; 3524 kvm_desct->type = seg_desc->type; 3525 kvm_desct->present = seg_desc->p; 3526 kvm_desct->dpl = seg_desc->dpl; 3527 kvm_desct->db = seg_desc->d; 3528 kvm_desct->s = seg_desc->s; 3529 kvm_desct->l = seg_desc->l; 3530 kvm_desct->g = seg_desc->g; 3531 kvm_desct->avl = seg_desc->avl; 3532 if (!selector) 3533 kvm_desct->unusable = 1; 3534 else 3535 kvm_desct->unusable = 0; 3536 kvm_desct->padding = 0; 3537 } 3538 3539 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 3540 u16 selector, 3541 struct descriptor_table *dtable) 3542 { 3543 if (selector & 1 << 2) { 3544 struct kvm_segment kvm_seg; 3545 3546 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3547 3548 if (kvm_seg.unusable) 3549 dtable->limit = 0; 3550 else 3551 dtable->limit = kvm_seg.limit; 3552 dtable->base = kvm_seg.base; 3553 } 3554 else 3555 kvm_x86_ops->get_gdt(vcpu, dtable); 3556 } 3557 3558 /* allowed just for 8 bytes segments */ 3559 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3560 struct desc_struct *seg_desc) 3561 { 3562 gpa_t gpa; 3563 struct descriptor_table dtable; 3564 u16 index = selector >> 3; 3565 3566 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3567 3568 if (dtable.limit < index * 8 + 7) { 3569 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3570 return 1; 3571 } 3572 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3573 gpa += index * 8; 3574 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); 3575 } 3576 3577 /* allowed just for 8 bytes segments */ 3578 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3579 struct desc_struct *seg_desc) 3580 { 3581 gpa_t gpa; 3582 struct descriptor_table dtable; 3583 u16 index = selector >> 3; 3584 3585 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3586 3587 if (dtable.limit < index * 8 + 7) 3588 return 1; 3589 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3590 gpa += index * 8; 3591 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); 3592 } 3593 3594 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3595 struct desc_struct *seg_desc) 3596 { 3597 u32 base_addr; 3598 3599 base_addr = seg_desc->base0; 3600 base_addr |= (seg_desc->base1 << 16); 3601 base_addr |= (seg_desc->base2 << 24); 3602 3603 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 3604 } 3605 3606 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3607 { 3608 struct kvm_segment kvm_seg; 3609 3610 kvm_get_segment(vcpu, &kvm_seg, seg); 3611 return kvm_seg.selector; 3612 } 3613 3614 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3615 u16 selector, 3616 struct kvm_segment *kvm_seg) 3617 { 3618 struct desc_struct seg_desc; 3619 3620 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3621 return 1; 3622 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3623 return 0; 3624 } 3625 3626 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 3627 { 3628 struct kvm_segment segvar = { 3629 .base = 
selector << 4, 3630 .limit = 0xffff, 3631 .selector = selector, 3632 .type = 3, 3633 .present = 1, 3634 .dpl = 3, 3635 .db = 0, 3636 .s = 1, 3637 .l = 0, 3638 .g = 0, 3639 .avl = 0, 3640 .unusable = 0, 3641 }; 3642 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 3643 return 0; 3644 } 3645 3646 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3647 int type_bits, int seg) 3648 { 3649 struct kvm_segment kvm_seg; 3650 3651 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 3652 return kvm_load_realmode_segment(vcpu, selector, seg); 3653 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3654 return 1; 3655 kvm_seg.type |= type_bits; 3656 3657 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3658 seg != VCPU_SREG_LDTR) 3659 if (!kvm_seg.s) 3660 kvm_seg.unusable = 1; 3661 3662 kvm_set_segment(vcpu, &kvm_seg, seg); 3663 return 0; 3664 } 3665 3666 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 3667 struct tss_segment_32 *tss) 3668 { 3669 tss->cr3 = vcpu->arch.cr3; 3670 tss->eip = kvm_rip_read(vcpu); 3671 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3672 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3673 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3674 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3675 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3676 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3677 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3678 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3679 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3680 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3681 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3682 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3683 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3684 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3685 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3686 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3687 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3688 } 3689 3690 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3691 struct tss_segment_32 *tss) 3692 { 3693 kvm_set_cr3(vcpu, tss->cr3); 3694 3695 kvm_rip_write(vcpu, tss->eip); 3696 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3697 3698 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 3699 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 3700 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 3701 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 3702 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 3703 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 3704 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 3705 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 3706 3707 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3708 return 1; 3709 3710 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3711 return 1; 3712 3713 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3714 return 1; 3715 3716 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3717 return 1; 3718 3719 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3720 return 1; 3721 3722 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3723 return 1; 3724 3725 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3726 return 1; 3727 return 0; 3728 } 3729 3730 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3731 struct tss_segment_16 *tss) 3732 { 3733 tss->ip = kvm_rip_read(vcpu); 3734 tss->flag = 
kvm_x86_ops->get_rflags(vcpu); 3735 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3736 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3737 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3738 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3739 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3740 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3741 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 3742 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 3743 3744 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3745 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3746 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3747 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3748 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3749 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3750 } 3751 3752 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3753 struct tss_segment_16 *tss) 3754 { 3755 kvm_rip_write(vcpu, tss->ip); 3756 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3757 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 3758 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 3759 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 3760 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 3761 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 3762 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 3763 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 3764 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 3765 3766 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3767 return 1; 3768 3769 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3770 return 1; 3771 3772 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3773 return 1; 3774 3775 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3776 return 1; 3777 3778 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3779 return 1; 3780 return 0; 3781 } 3782 3783 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3784 u32 old_tss_base, 3785 struct desc_struct *nseg_desc) 3786 { 3787 struct tss_segment_16 tss_segment_16; 3788 int ret = 0; 3789 3790 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3791 sizeof tss_segment_16)) 3792 goto out; 3793 3794 save_state_to_tss16(vcpu, &tss_segment_16); 3795 3796 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3797 sizeof tss_segment_16)) 3798 goto out; 3799 3800 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3801 &tss_segment_16, sizeof tss_segment_16)) 3802 goto out; 3803 3804 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3805 goto out; 3806 3807 ret = 1; 3808 out: 3809 return ret; 3810 } 3811 3812 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3813 u32 old_tss_base, 3814 struct desc_struct *nseg_desc) 3815 { 3816 struct tss_segment_32 tss_segment_32; 3817 int ret = 0; 3818 3819 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3820 sizeof tss_segment_32)) 3821 goto out; 3822 3823 save_state_to_tss32(vcpu, &tss_segment_32); 3824 3825 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3826 sizeof tss_segment_32)) 3827 goto out; 3828 3829 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3830 &tss_segment_32, sizeof tss_segment_32)) 3831 goto out; 3832 3833 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3834 goto out; 3835 3836 ret = 1; 3837 out: 3838 return ret; 3839 } 3840 3841 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3842 { 
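/*
 * Emulate a hardware task switch: fetch the old and new TSS descriptors,
 * save the current register state into the old TSS, load state from the
 * new TSS (16- or 32-bit depending on the descriptor type), fix up the
 * busy bits and EFLAGS.NT according to "reason", then set CR0.TS and
 * load TR with the new selector.
 */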
3843 struct kvm_segment tr_seg; 3844 struct desc_struct cseg_desc; 3845 struct desc_struct nseg_desc; 3846 int ret = 0; 3847 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 3848 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 3849 3850 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 3851 3852 /* FIXME: Handle errors. Failure to read either TSS or their 3853 * descriptors should generate a pagefault. 3854 */ 3855 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3856 goto out; 3857 3858 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 3859 goto out; 3860 3861 if (reason != TASK_SWITCH_IRET) { 3862 int cpl; 3863 3864 cpl = kvm_x86_ops->get_cpl(vcpu); 3865 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3866 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3867 return 1; 3868 } 3869 } 3870 3871 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3872 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3873 return 1; 3874 } 3875 3876 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3877 cseg_desc.type &= ~(1 << 1); //clear the B flag 3878 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 3879 } 3880 3881 if (reason == TASK_SWITCH_IRET) { 3882 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3883 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 3884 } 3885 3886 kvm_x86_ops->skip_emulated_instruction(vcpu); 3887 3888 if (nseg_desc.type & 8) 3889 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3890 &nseg_desc); 3891 else 3892 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 3893 &nseg_desc); 3894 3895 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3896 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3897 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 3898 } 3899 3900 if (reason != TASK_SWITCH_IRET) { 3901 nseg_desc.type |= (1 << 1); 3902 save_guest_segment_descriptor(vcpu, tss_selector, 3903 &nseg_desc); 3904 } 3905 3906 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3907 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3908 tr_seg.type = 11; 3909 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3910 out: 3911 return ret; 3912 } 3913 EXPORT_SYMBOL_GPL(kvm_task_switch); 3914 3915 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 3916 struct kvm_sregs *sregs) 3917 { 3918 int mmu_reset_needed = 0; 3919 int i, pending_vec, max_bits; 3920 struct descriptor_table dt; 3921 3922 vcpu_load(vcpu); 3923 3924 dt.limit = sregs->idt.limit; 3925 dt.base = sregs->idt.base; 3926 kvm_x86_ops->set_idt(vcpu, &dt); 3927 dt.limit = sregs->gdt.limit; 3928 dt.base = sregs->gdt.base; 3929 kvm_x86_ops->set_gdt(vcpu, &dt); 3930 3931 vcpu->arch.cr2 = sregs->cr2; 3932 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 3933 vcpu->arch.cr3 = sregs->cr3; 3934 3935 kvm_set_cr8(vcpu, sregs->cr8); 3936 3937 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 3938 kvm_x86_ops->set_efer(vcpu, sregs->efer); 3939 kvm_set_apic_base(vcpu, sregs->apic_base); 3940 3941 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3942 3943 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 3944 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 3945 vcpu->arch.cr0 = sregs->cr0; 3946 3947 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 3948 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 3949 if (!is_long_mode(vcpu) && is_pae(vcpu)) 3950 load_pdptrs(vcpu, vcpu->arch.cr3); 3951 3952 if (mmu_reset_needed) 3953 kvm_mmu_reset_context(vcpu); 3954 3955 if 
(!irqchip_in_kernel(vcpu->kvm)) { 3956 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 3957 sizeof vcpu->arch.irq_pending); 3958 vcpu->arch.irq_summary = 0; 3959 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 3960 if (vcpu->arch.irq_pending[i]) 3961 __set_bit(i, &vcpu->arch.irq_summary); 3962 } else { 3963 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 3964 pending_vec = find_first_bit( 3965 (const unsigned long *)sregs->interrupt_bitmap, 3966 max_bits); 3967 /* Only pending external irq is handled here */ 3968 if (pending_vec < max_bits) { 3969 kvm_x86_ops->set_irq(vcpu, pending_vec); 3970 pr_debug("Set back pending irq %d\n", 3971 pending_vec); 3972 } 3973 kvm_pic_clear_isr_ack(vcpu->kvm); 3974 } 3975 3976 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3977 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3978 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3979 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3980 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3981 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3982 3983 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3984 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3985 3986 /* Older userspace won't unhalt the vcpu on reset. */ 3987 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 3988 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 3989 !(vcpu->arch.cr0 & X86_CR0_PE)) 3990 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3991 3992 vcpu_put(vcpu); 3993 3994 return 0; 3995 } 3996 3997 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 3998 struct kvm_guest_debug *dbg) 3999 { 4000 int i, r; 4001 4002 vcpu_load(vcpu); 4003 4004 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4005 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4006 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4007 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4008 vcpu->arch.switch_db_regs = 4009 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4010 } else { 4011 for (i = 0; i < KVM_NR_DB_REGS; i++) 4012 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4013 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4014 } 4015 4016 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4017 4018 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4019 kvm_queue_exception(vcpu, DB_VECTOR); 4020 else if (dbg->control & KVM_GUESTDBG_INJECT_BP) 4021 kvm_queue_exception(vcpu, BP_VECTOR); 4022 4023 vcpu_put(vcpu); 4024 4025 return r; 4026 } 4027 4028 /* 4029 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 4030 * we have asm/x86/processor.h 4031 */ 4032 struct fxsave { 4033 u16 cwd; 4034 u16 swd; 4035 u16 twd; 4036 u16 fop; 4037 u64 rip; 4038 u64 rdp; 4039 u32 mxcsr; 4040 u32 mxcsr_mask; 4041 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4042 #ifdef CONFIG_X86_64 4043 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4044 #else 4045 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4046 #endif 4047 }; 4048 4049 /* 4050 * Translate a guest virtual address to a guest physical address. 
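 * Backs the KVM_TRANSLATE vcpu ioctl: the walk goes through the current
 * MMU's gva_to_gpa() under slots_lock, and an unmapped address is
 * reported back with valid == 0.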
4051 */ 4052 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 4053 struct kvm_translation *tr) 4054 { 4055 unsigned long vaddr = tr->linear_address; 4056 gpa_t gpa; 4057 4058 vcpu_load(vcpu); 4059 down_read(&vcpu->kvm->slots_lock); 4060 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 4061 up_read(&vcpu->kvm->slots_lock); 4062 tr->physical_address = gpa; 4063 tr->valid = gpa != UNMAPPED_GVA; 4064 tr->writeable = 1; 4065 tr->usermode = 0; 4066 vcpu_put(vcpu); 4067 4068 return 0; 4069 } 4070 4071 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4072 { 4073 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4074 4075 vcpu_load(vcpu); 4076 4077 memcpy(fpu->fpr, fxsave->st_space, 128); 4078 fpu->fcw = fxsave->cwd; 4079 fpu->fsw = fxsave->swd; 4080 fpu->ftwx = fxsave->twd; 4081 fpu->last_opcode = fxsave->fop; 4082 fpu->last_ip = fxsave->rip; 4083 fpu->last_dp = fxsave->rdp; 4084 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 4085 4086 vcpu_put(vcpu); 4087 4088 return 0; 4089 } 4090 4091 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4092 { 4093 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4094 4095 vcpu_load(vcpu); 4096 4097 memcpy(fxsave->st_space, fpu->fpr, 128); 4098 fxsave->cwd = fpu->fcw; 4099 fxsave->swd = fpu->fsw; 4100 fxsave->twd = fpu->ftwx; 4101 fxsave->fop = fpu->last_opcode; 4102 fxsave->rip = fpu->last_ip; 4103 fxsave->rdp = fpu->last_dp; 4104 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 4105 4106 vcpu_put(vcpu); 4107 4108 return 0; 4109 } 4110 4111 void fx_init(struct kvm_vcpu *vcpu) 4112 { 4113 unsigned after_mxcsr_mask; 4114 4115 /* 4116 * Touch the fpu the first time in non atomic context as if 4117 * this is the first fpu instruction the exception handler 4118 * will fire before the instruction returns and it'll have to 4119 * allocate ram with GFP_KERNEL. 
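 * That is why the !used_math() case below does one kvm_fx_save() with
 * preemption still enabled, before the preempt_disable()d
 * init/save/restore sequence.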
void fx_init(struct kvm_vcpu *vcpu)
{
	unsigned after_mxcsr_mask;

	/*
	 * Touch the fpu for the first time in non-atomic context: if this
	 * is the first fpu instruction, the exception handler will fire
	 * before the instruction returns and it will have to allocate RAM
	 * with GFP_KERNEL.
	 */
	if (!used_math())
		kvm_fx_save(&vcpu->arch.host_fx_image);

	/* Initialize guest FPU by resetting ours and saving into guest's */
	preempt_disable();
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_finit();
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	preempt_enable();

	vcpu->arch.cr0 |= X86_CR0_ET;
	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);

void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 1;
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_restore(&vcpu->arch.guest_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 0;
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	++vcpu->stat.fpu_reload;
}
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);

void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.time_page) {
		kvm_release_page_dirty(vcpu->arch.time_page);
		vcpu->arch.time_page = NULL;
	}

	kvm_x86_ops->vcpu_free(vcpu);
}

struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
				      unsigned int id)
{
	return kvm_x86_ops->vcpu_create(kvm, id);
}
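
/*
 * kvm_arch_vcpu_create() only asks the vendor module for a vcpu; the
 * remaining bring-up is done by kvm_arch_vcpu_setup() below, which the
 * generic vcpu-creation path is expected to call next, roughly
 * (illustrative sketch of the caller, not code from this file):
 *
 *	vcpu = kvm_arch_vcpu_create(kvm, id);
 *	if (IS_ERR(vcpu))
 *		return PTR_ERR(vcpu);
 *	r = kvm_arch_vcpu_setup(vcpu);	- reset state and set up the MMU
 *	if (r)
 *		goto destroy;
 */
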
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
	int r;

	/* We do fxsave: this must be aligned. */
	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);

	vcpu->arch.mtrr_state.have_fixed = 1;
	vcpu_load(vcpu);
	r = kvm_arch_vcpu_reset(vcpu);
	if (r == 0)
		r = kvm_mmu_setup(vcpu);
	vcpu_put(vcpu);
	if (r < 0)
		goto free_vcpu;

	return 0;
free_vcpu:
	kvm_x86_ops->vcpu_free(vcpu);
	return r;
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);

	kvm_x86_ops->vcpu_free(vcpu);
}

int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = false;
	vcpu->arch.nmi_injected = false;

	vcpu->arch.switch_db_regs = 0;
	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
	vcpu->arch.dr6 = DR6_FIXED_1;
	vcpu->arch.dr7 = DR7_FIXED_1;

	return kvm_x86_ops->vcpu_reset(vcpu);
}

void kvm_arch_hardware_enable(void *garbage)
{
	kvm_x86_ops->hardware_enable(garbage);
}

void kvm_arch_hardware_disable(void *garbage)
{
	kvm_x86_ops->hardware_disable(garbage);
}

int kvm_arch_hardware_setup(void)
{
	return kvm_x86_ops->hardware_setup();
}

void kvm_arch_hardware_unsetup(void)
{
	kvm_x86_ops->hardware_unsetup();
}

void kvm_arch_check_processor_compat(void *rtn)
{
	kvm_x86_ops->check_processor_compatibility(rtn);
}

int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
	struct page *page;
	struct kvm *kvm;
	int r;

	BUG_ON(vcpu->kvm == NULL);
	kvm = vcpu->kvm;

	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	else
		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->arch.pio_data = page_address(page);

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto fail_free_pio_data;

	if (irqchip_in_kernel(kvm)) {
		r = kvm_create_lapic(vcpu);
		if (r < 0)
			goto fail_mmu_destroy;
	}

	return 0;

fail_mmu_destroy:
	kvm_mmu_destroy(vcpu);
fail_free_pio_data:
	free_page((unsigned long)vcpu->arch.pio_data);
fail:
	return r;
}

void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_free_lapic(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	kvm_mmu_destroy(vcpu);
	up_read(&vcpu->kvm->slots_lock);
	free_page((unsigned long)vcpu->arch.pio_data);
}

struct kvm *kvm_arch_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

	rdtscll(kvm->arch.vm_init_tsc);

	return kvm;
}

static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
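
/*
 * kvm_free_vcpus() below deliberately runs two passes: it first drops
 * every vcpu's MMU state via kvm_unload_vcpu_mmu(), and only then frees
 * the vcpus themselves, so that no shadow paging state is still live
 * while vcpus are being torn down.
 */
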
static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;

	/*
	 * Unpin any mmu pages first.
	 */
	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		if (kvm->vcpus[i]) {
			kvm_arch_vcpu_free(kvm->vcpus[i]);
			kvm->vcpus[i] = NULL;
		}
	}
}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kvm_free_pit(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
	kfree(kvm);
}

int kvm_arch_set_memory_region(struct kvm *kvm,
			       struct kvm_userspace_memory_region *mem,
			       struct kvm_memory_slot old,
			       int user_alloc)
{
	int npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

	/*
	 * To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	kvm_flush_remote_tlbs(kvm);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending;
}

static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int ipi_pcpu = vcpu->cpu;
	int cpu = get_cpu();

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}
	/*
	 * We may be called synchronously with irqs disabled in guest mode,
	 * so there is no need to call smp_call_function_single() in that
	 * case.
	 */
	if (vcpu->guest_mode && vcpu->cpu != cpu)
		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
	put_cpu();
}
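
/*
 * kvm_vcpu_kick() is the generic "make this vcpu notice new work"
 * primitive: it wakes a vcpu halted on its waitqueue and sends an IPI to
 * a vcpu currently executing guest code on another CPU so that it exits.
 * A typical caller (hypothetical sketch, not taken from this file)
 * records the new event first and then kicks:
 *
 *	vcpu->arch.nmi_pending = 1;	- or queue an irq / set a request bit
 *	kvm_vcpu_kick(vcpu);		- force the vcpu to pick it up
 */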