1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * derived from drivers/kvm/kvm_main.c 5 * 6 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc. 8 * Copyright IBM Corporation, 2008 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Amit Shah <amit.shah@qumranet.com> 14 * Ben-Ami Yassour <benami@il.ibm.com> 15 * 16 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * the COPYING file in the top-level directory. 18 * 19 */ 20 21 #include <linux/kvm_host.h> 22 #include "irq.h" 23 #include "mmu.h" 24 #include "i8254.h" 25 #include "tss.h" 26 #include "kvm_cache_regs.h" 27 #include "x86.h" 28 29 #include <linux/clocksource.h> 30 #include <linux/interrupt.h> 31 #include <linux/kvm.h> 32 #include <linux/fs.h> 33 #include <linux/vmalloc.h> 34 #include <linux/module.h> 35 #include <linux/mman.h> 36 #include <linux/highmem.h> 37 #include <linux/iommu.h> 38 #include <linux/intel-iommu.h> 39 #include <linux/cpufreq.h> 40 41 #include <asm/uaccess.h> 42 #include <asm/msr.h> 43 #include <asm/desc.h> 44 #include <asm/mtrr.h> 45 46 #define MAX_IO_MSRS 256 47 #define CR0_RESERVED_BITS \ 48 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 49 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 50 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 51 #define CR4_RESERVED_BITS \ 52 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 53 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 54 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 55 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 56 57 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 58 /* EFER defaults: 59 * - enable syscall per default because its emulated by KVM 60 * - enable LME and LMA per default on 64 bit KVM 61 */ 62 #ifdef CONFIG_X86_64 63 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 64 #else 65 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 66 #endif 67 68 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 69 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 70 71 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 72 struct kvm_cpuid_entry2 __user *entries); 73 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 74 u32 function, u32 index); 75 76 struct kvm_x86_ops *kvm_x86_ops; 77 EXPORT_SYMBOL_GPL(kvm_x86_ops); 78 79 struct kvm_stats_debugfs_item debugfs_entries[] = { 80 { "pf_fixed", VCPU_STAT(pf_fixed) }, 81 { "pf_guest", VCPU_STAT(pf_guest) }, 82 { "tlb_flush", VCPU_STAT(tlb_flush) }, 83 { "invlpg", VCPU_STAT(invlpg) }, 84 { "exits", VCPU_STAT(exits) }, 85 { "io_exits", VCPU_STAT(io_exits) }, 86 { "mmio_exits", VCPU_STAT(mmio_exits) }, 87 { "signal_exits", VCPU_STAT(signal_exits) }, 88 { "irq_window", VCPU_STAT(irq_window_exits) }, 89 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 90 { "halt_exits", VCPU_STAT(halt_exits) }, 91 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 92 { "hypercalls", VCPU_STAT(hypercalls) }, 93 { "request_irq", VCPU_STAT(request_irq_exits) }, 94 { "request_nmi", VCPU_STAT(request_nmi_exits) }, 95 { "irq_exits", VCPU_STAT(irq_exits) }, 96 { "host_state_reload", VCPU_STAT(host_state_reload) }, 97 { "efer_reload", VCPU_STAT(efer_reload) }, 98 { "fpu_reload", VCPU_STAT(fpu_reload) }, 99 { "insn_emulation", VCPU_STAT(insn_emulation) }, 100 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 101 { "irq_injections", VCPU_STAT(irq_injections) }, 102 { "nmi_injections", 
VCPU_STAT(nmi_injections) }, 103 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 104 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 105 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 106 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 107 { "mmu_flooded", VM_STAT(mmu_flooded) }, 108 { "mmu_recycled", VM_STAT(mmu_recycled) }, 109 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 110 { "mmu_unsync", VM_STAT(mmu_unsync) }, 111 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, 112 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 113 { "largepages", VM_STAT(lpages) }, 114 { NULL } 115 }; 116 117 unsigned long segment_base(u16 selector) 118 { 119 struct descriptor_table gdt; 120 struct desc_struct *d; 121 unsigned long table_base; 122 unsigned long v; 123 124 if (selector == 0) 125 return 0; 126 127 asm("sgdt %0" : "=m"(gdt)); 128 table_base = gdt.base; 129 130 if (selector & 4) { /* from ldt */ 131 u16 ldt_selector; 132 133 asm("sldt %0" : "=g"(ldt_selector)); 134 table_base = segment_base(ldt_selector); 135 } 136 d = (struct desc_struct *)(table_base + (selector & ~7)); 137 v = d->base0 | ((unsigned long)d->base1 << 16) | 138 ((unsigned long)d->base2 << 24); 139 #ifdef CONFIG_X86_64 140 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 141 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 142 #endif 143 return v; 144 } 145 EXPORT_SYMBOL_GPL(segment_base); 146 147 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 148 { 149 if (irqchip_in_kernel(vcpu->kvm)) 150 return vcpu->arch.apic_base; 151 else 152 return vcpu->arch.apic_base; 153 } 154 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 155 156 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 157 { 158 /* TODO: reserve bits check */ 159 if (irqchip_in_kernel(vcpu->kvm)) 160 kvm_lapic_set_base(vcpu, data); 161 else 162 vcpu->arch.apic_base = data; 163 } 164 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 165 166 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 167 { 168 WARN_ON(vcpu->arch.exception.pending); 169 vcpu->arch.exception.pending = true; 170 vcpu->arch.exception.has_error_code = false; 171 vcpu->arch.exception.nr = nr; 172 } 173 EXPORT_SYMBOL_GPL(kvm_queue_exception); 174 175 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 176 u32 error_code) 177 { 178 ++vcpu->stat.pf_guest; 179 180 if (vcpu->arch.exception.pending) { 181 if (vcpu->arch.exception.nr == PF_VECTOR) { 182 printk(KERN_DEBUG "kvm: inject_page_fault:" 183 " double fault 0x%lx\n", addr); 184 vcpu->arch.exception.nr = DF_VECTOR; 185 vcpu->arch.exception.error_code = 0; 186 } else if (vcpu->arch.exception.nr == DF_VECTOR) { 187 /* triple fault -> shutdown */ 188 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 189 } 190 return; 191 } 192 vcpu->arch.cr2 = addr; 193 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 194 } 195 196 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 197 { 198 vcpu->arch.nmi_pending = 1; 199 } 200 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 201 202 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 203 { 204 WARN_ON(vcpu->arch.exception.pending); 205 vcpu->arch.exception.pending = true; 206 vcpu->arch.exception.has_error_code = true; 207 vcpu->arch.exception.nr = nr; 208 vcpu->arch.exception.error_code = error_code; 209 } 210 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 211 212 static void __queue_exception(struct kvm_vcpu *vcpu) 213 { 214 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 215 vcpu->arch.exception.has_error_code, 216 vcpu->arch.exception.error_code); 217 } 218 219 
/* 220 * Load the pae pdptrs. Return true is they are all valid. 221 */ 222 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 223 { 224 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 225 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 226 int i; 227 int ret; 228 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 229 230 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 231 offset * sizeof(u64), sizeof(pdpte)); 232 if (ret < 0) { 233 ret = 0; 234 goto out; 235 } 236 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 237 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 238 ret = 0; 239 goto out; 240 } 241 } 242 ret = 1; 243 244 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 245 out: 246 247 return ret; 248 } 249 EXPORT_SYMBOL_GPL(load_pdptrs); 250 251 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 252 { 253 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 254 bool changed = true; 255 int r; 256 257 if (is_long_mode(vcpu) || !is_pae(vcpu)) 258 return false; 259 260 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 261 if (r < 0) 262 goto out; 263 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 264 out: 265 266 return changed; 267 } 268 269 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 270 { 271 if (cr0 & CR0_RESERVED_BITS) { 272 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 273 cr0, vcpu->arch.cr0); 274 kvm_inject_gp(vcpu, 0); 275 return; 276 } 277 278 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 279 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 280 kvm_inject_gp(vcpu, 0); 281 return; 282 } 283 284 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 285 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 286 "and a clear PE flag\n"); 287 kvm_inject_gp(vcpu, 0); 288 return; 289 } 290 291 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 292 #ifdef CONFIG_X86_64 293 if ((vcpu->arch.shadow_efer & EFER_LME)) { 294 int cs_db, cs_l; 295 296 if (!is_pae(vcpu)) { 297 printk(KERN_DEBUG "set_cr0: #GP, start paging " 298 "in long mode while PAE is disabled\n"); 299 kvm_inject_gp(vcpu, 0); 300 return; 301 } 302 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 303 if (cs_l) { 304 printk(KERN_DEBUG "set_cr0: #GP, start paging " 305 "in long mode while CS.L == 1\n"); 306 kvm_inject_gp(vcpu, 0); 307 return; 308 309 } 310 } else 311 #endif 312 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 313 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 314 "reserved bits\n"); 315 kvm_inject_gp(vcpu, 0); 316 return; 317 } 318 319 } 320 321 kvm_x86_ops->set_cr0(vcpu, cr0); 322 vcpu->arch.cr0 = cr0; 323 324 kvm_mmu_sync_global(vcpu); 325 kvm_mmu_reset_context(vcpu); 326 return; 327 } 328 EXPORT_SYMBOL_GPL(kvm_set_cr0); 329 330 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 331 { 332 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 333 KVMTRACE_1D(LMSW, vcpu, 334 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 335 handler); 336 } 337 EXPORT_SYMBOL_GPL(kvm_lmsw); 338 339 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 340 { 341 unsigned long old_cr4 = vcpu->arch.cr4; 342 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 343 344 if (cr4 & CR4_RESERVED_BITS) { 345 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 346 kvm_inject_gp(vcpu, 0); 347 return; 348 } 349 350 if (is_long_mode(vcpu)) { 351 if (!(cr4 & X86_CR4_PAE)) { 352 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 353 "in long mode\n"); 354 kvm_inject_gp(vcpu, 0); 355 return; 356 } 357 } else if (is_paging(vcpu) && 
(cr4 & X86_CR4_PAE) 358 && ((cr4 ^ old_cr4) & pdptr_bits) 359 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 360 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 361 kvm_inject_gp(vcpu, 0); 362 return; 363 } 364 365 if (cr4 & X86_CR4_VMXE) { 366 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 367 kvm_inject_gp(vcpu, 0); 368 return; 369 } 370 kvm_x86_ops->set_cr4(vcpu, cr4); 371 vcpu->arch.cr4 = cr4; 372 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 373 kvm_mmu_sync_global(vcpu); 374 kvm_mmu_reset_context(vcpu); 375 } 376 EXPORT_SYMBOL_GPL(kvm_set_cr4); 377 378 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 379 { 380 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 381 kvm_mmu_sync_roots(vcpu); 382 kvm_mmu_flush_tlb(vcpu); 383 return; 384 } 385 386 if (is_long_mode(vcpu)) { 387 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 388 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 389 kvm_inject_gp(vcpu, 0); 390 return; 391 } 392 } else { 393 if (is_pae(vcpu)) { 394 if (cr3 & CR3_PAE_RESERVED_BITS) { 395 printk(KERN_DEBUG 396 "set_cr3: #GP, reserved bits\n"); 397 kvm_inject_gp(vcpu, 0); 398 return; 399 } 400 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 401 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 402 "reserved bits\n"); 403 kvm_inject_gp(vcpu, 0); 404 return; 405 } 406 } 407 /* 408 * We don't check reserved bits in nonpae mode, because 409 * this isn't enforced, and VMware depends on this. 410 */ 411 } 412 413 /* 414 * Does the new cr3 value map to physical memory? (Note, we 415 * catch an invalid cr3 even in real-mode, because it would 416 * cause trouble later on when we turn on paging anyway.) 417 * 418 * A real CPU would silently accept an invalid cr3 and would 419 * attempt to use it - with largely undefined (and often hard 420 * to debug) behavior on the guest side. 421 */ 422 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 423 kvm_inject_gp(vcpu, 0); 424 else { 425 vcpu->arch.cr3 = cr3; 426 vcpu->arch.mmu.new_cr3(vcpu); 427 } 428 } 429 EXPORT_SYMBOL_GPL(kvm_set_cr3); 430 431 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 432 { 433 if (cr8 & CR8_RESERVED_BITS) { 434 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 435 kvm_inject_gp(vcpu, 0); 436 return; 437 } 438 if (irqchip_in_kernel(vcpu->kvm)) 439 kvm_lapic_set_tpr(vcpu, cr8); 440 else 441 vcpu->arch.cr8 = cr8; 442 } 443 EXPORT_SYMBOL_GPL(kvm_set_cr8); 444 445 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 446 { 447 if (irqchip_in_kernel(vcpu->kvm)) 448 return kvm_lapic_get_cr8(vcpu); 449 else 450 return vcpu->arch.cr8; 451 } 452 EXPORT_SYMBOL_GPL(kvm_get_cr8); 453 454 static inline u32 bit(int bitno) 455 { 456 return 1 << (bitno & 31); 457 } 458 459 /* 460 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 461 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 462 * 463 * This list is modified at module load time to reflect the 464 * capabilities of the host cpu. 
465 */ 466 static u32 msrs_to_save[] = { 467 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 468 MSR_K6_STAR, 469 #ifdef CONFIG_X86_64 470 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 471 #endif 472 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 473 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 474 }; 475 476 static unsigned num_msrs_to_save; 477 478 static u32 emulated_msrs[] = { 479 MSR_IA32_MISC_ENABLE, 480 }; 481 482 static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 483 { 484 if (efer & efer_reserved_bits) { 485 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 486 efer); 487 kvm_inject_gp(vcpu, 0); 488 return; 489 } 490 491 if (is_paging(vcpu) 492 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 493 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 494 kvm_inject_gp(vcpu, 0); 495 return; 496 } 497 498 if (efer & EFER_FFXSR) { 499 struct kvm_cpuid_entry2 *feat; 500 501 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 502 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 503 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); 504 kvm_inject_gp(vcpu, 0); 505 return; 506 } 507 } 508 509 if (efer & EFER_SVME) { 510 struct kvm_cpuid_entry2 *feat; 511 512 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 513 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 514 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); 515 kvm_inject_gp(vcpu, 0); 516 return; 517 } 518 } 519 520 kvm_x86_ops->set_efer(vcpu, efer); 521 522 efer &= ~EFER_LMA; 523 efer |= vcpu->arch.shadow_efer & EFER_LMA; 524 525 vcpu->arch.shadow_efer = efer; 526 } 527 528 void kvm_enable_efer_bits(u64 mask) 529 { 530 efer_reserved_bits &= ~mask; 531 } 532 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 533 534 535 /* 536 * Writes msr value into into the appropriate "register". 537 * Returns 0 on success, non-0 otherwise. 538 * Assumes vcpu_load() was already called. 539 */ 540 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 541 { 542 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 543 } 544 545 /* 546 * Adapt set_msr() to msr_io()'s calling convention 547 */ 548 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 549 { 550 return kvm_set_msr(vcpu, index, *data); 551 } 552 553 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 554 { 555 static int version; 556 struct pvclock_wall_clock wc; 557 struct timespec now, sys, boot; 558 559 if (!wall_clock) 560 return; 561 562 version++; 563 564 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 565 566 /* 567 * The guest calculates current wall clock time by adding 568 * system time (updated by kvm_write_guest_time below) to the 569 * wall clock specified here. guest system time equals host 570 * system time for us, thus we must fill in host boot time here. 
571 */ 572 now = current_kernel_time(); 573 ktime_get_ts(&sys); 574 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 575 576 wc.sec = boot.tv_sec; 577 wc.nsec = boot.tv_nsec; 578 wc.version = version; 579 580 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 581 582 version++; 583 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 584 } 585 586 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 587 { 588 uint32_t quotient, remainder; 589 590 /* Don't try to replace with do_div(), this one calculates 591 * "(dividend << 32) / divisor" */ 592 __asm__ ( "divl %4" 593 : "=a" (quotient), "=d" (remainder) 594 : "0" (0), "1" (dividend), "r" (divisor) ); 595 return quotient; 596 } 597 598 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 599 { 600 uint64_t nsecs = 1000000000LL; 601 int32_t shift = 0; 602 uint64_t tps64; 603 uint32_t tps32; 604 605 tps64 = tsc_khz * 1000LL; 606 while (tps64 > nsecs*2) { 607 tps64 >>= 1; 608 shift--; 609 } 610 611 tps32 = (uint32_t)tps64; 612 while (tps32 <= (uint32_t)nsecs) { 613 tps32 <<= 1; 614 shift++; 615 } 616 617 hv_clock->tsc_shift = shift; 618 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 619 620 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 621 __func__, tsc_khz, hv_clock->tsc_shift, 622 hv_clock->tsc_to_system_mul); 623 } 624 625 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 626 627 static void kvm_write_guest_time(struct kvm_vcpu *v) 628 { 629 struct timespec ts; 630 unsigned long flags; 631 struct kvm_vcpu_arch *vcpu = &v->arch; 632 void *shared_kaddr; 633 634 if ((!vcpu->time_page)) 635 return; 636 637 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { 638 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); 639 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); 640 } 641 642 /* Keep irq disabled to prevent changes to the clock */ 643 local_irq_save(flags); 644 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 645 &vcpu->hv_clock.tsc_timestamp); 646 ktime_get_ts(&ts); 647 local_irq_restore(flags); 648 649 /* With all the info we got, fill in the values */ 650 651 vcpu->hv_clock.system_time = ts.tv_nsec + 652 (NSEC_PER_SEC * (u64)ts.tv_sec); 653 /* 654 * The interface expects us to write an even number signaling that the 655 * update is finished. Since the guest won't see the intermediate 656 * state, we just increase by 2 at the end. 657 */ 658 vcpu->hv_clock.version += 2; 659 660 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 661 662 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 663 sizeof(vcpu->hv_clock)); 664 665 kunmap_atomic(shared_kaddr, KM_USER0); 666 667 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 668 } 669 670 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 671 { 672 struct kvm_vcpu_arch *vcpu = &v->arch; 673 674 if (!vcpu->time_page) 675 return 0; 676 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 677 return 1; 678 } 679 680 static bool msr_mtrr_valid(unsigned msr) 681 { 682 switch (msr) { 683 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 684 case MSR_MTRRfix64K_00000: 685 case MSR_MTRRfix16K_80000: 686 case MSR_MTRRfix16K_A0000: 687 case MSR_MTRRfix4K_C0000: 688 case MSR_MTRRfix4K_C8000: 689 case MSR_MTRRfix4K_D0000: 690 case MSR_MTRRfix4K_D8000: 691 case MSR_MTRRfix4K_E0000: 692 case MSR_MTRRfix4K_E8000: 693 case MSR_MTRRfix4K_F0000: 694 case MSR_MTRRfix4K_F8000: 695 case MSR_MTRRdefType: 696 case MSR_IA32_CR_PAT: 697 return true; 698 case 0x2f8: 699 return true; 700 } 701 return false; 702 } 703 704 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 705 { 706 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 707 708 if (!msr_mtrr_valid(msr)) 709 return 1; 710 711 if (msr == MSR_MTRRdefType) { 712 vcpu->arch.mtrr_state.def_type = data; 713 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 714 } else if (msr == MSR_MTRRfix64K_00000) 715 p[0] = data; 716 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 717 p[1 + msr - MSR_MTRRfix16K_80000] = data; 718 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 719 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 720 else if (msr == MSR_IA32_CR_PAT) 721 vcpu->arch.pat = data; 722 else { /* Variable MTRRs */ 723 int idx, is_mtrr_mask; 724 u64 *pt; 725 726 idx = (msr - 0x200) / 2; 727 is_mtrr_mask = msr - 0x200 - 2 * idx; 728 if (!is_mtrr_mask) 729 pt = 730 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 731 else 732 pt = 733 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 734 *pt = data; 735 } 736 737 kvm_mmu_reset_context(vcpu); 738 return 0; 739 } 740 741 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 742 { 743 switch (msr) { 744 case MSR_EFER: 745 set_efer(vcpu, data); 746 break; 747 case MSR_IA32_MC0_STATUS: 748 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 749 __func__, data); 750 break; 751 case MSR_IA32_MCG_STATUS: 752 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 753 __func__, data); 754 break; 755 case MSR_IA32_MCG_CTL: 756 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 757 __func__, data); 758 break; 759 case MSR_IA32_DEBUGCTLMSR: 760 if (!data) { 761 /* We support the non-activated case already */ 762 break; 763 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 764 /* Values other than LBR and BTF are vendor-specific, 765 thus reserved and should throw a #GP */ 766 return 1; 767 } 768 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 769 __func__, data); 770 break; 771 case MSR_IA32_UCODE_REV: 772 case MSR_IA32_UCODE_WRITE: 773 case MSR_VM_HSAVE_PA: 774 break; 775 case 0x200 ... 0x2ff: 776 return set_msr_mtrr(vcpu, msr, data); 777 case MSR_IA32_APICBASE: 778 kvm_set_apic_base(vcpu, data); 779 break; 780 case MSR_IA32_MISC_ENABLE: 781 vcpu->arch.ia32_misc_enable_msr = data; 782 break; 783 case MSR_KVM_WALL_CLOCK: 784 vcpu->kvm->arch.wall_clock = data; 785 kvm_write_wall_clock(vcpu->kvm, data); 786 break; 787 case MSR_KVM_SYSTEM_TIME: { 788 if (vcpu->arch.time_page) { 789 kvm_release_page_dirty(vcpu->arch.time_page); 790 vcpu->arch.time_page = NULL; 791 } 792 793 vcpu->arch.time = data; 794 795 /* we verify if the enable bit is set... 
*/ 796 if (!(data & 1)) 797 break; 798 799 /* ...but clean it before doing the actual write */ 800 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 801 802 vcpu->arch.time_page = 803 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 804 805 if (is_error_page(vcpu->arch.time_page)) { 806 kvm_release_page_clean(vcpu->arch.time_page); 807 vcpu->arch.time_page = NULL; 808 } 809 810 kvm_request_guest_time_update(vcpu); 811 break; 812 } 813 default: 814 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 815 return 1; 816 } 817 return 0; 818 } 819 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 820 821 822 /* 823 * Reads an msr value (of 'msr_index') into 'pdata'. 824 * Returns 0 on success, non-0 otherwise. 825 * Assumes vcpu_load() was already called. 826 */ 827 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 828 { 829 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 830 } 831 832 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 833 { 834 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 835 836 if (!msr_mtrr_valid(msr)) 837 return 1; 838 839 if (msr == MSR_MTRRdefType) 840 *pdata = vcpu->arch.mtrr_state.def_type + 841 (vcpu->arch.mtrr_state.enabled << 10); 842 else if (msr == MSR_MTRRfix64K_00000) 843 *pdata = p[0]; 844 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 845 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 846 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 847 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 848 else if (msr == MSR_IA32_CR_PAT) 849 *pdata = vcpu->arch.pat; 850 else { /* Variable MTRRs */ 851 int idx, is_mtrr_mask; 852 u64 *pt; 853 854 idx = (msr - 0x200) / 2; 855 is_mtrr_mask = msr - 0x200 - 2 * idx; 856 if (!is_mtrr_mask) 857 pt = 858 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 859 else 860 pt = 861 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 862 *pdata = *pt; 863 } 864 865 return 0; 866 } 867 868 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 869 { 870 u64 data; 871 872 switch (msr) { 873 case 0xc0010010: /* SYSCFG */ 874 case 0xc0010015: /* HWCR */ 875 case MSR_IA32_PLATFORM_ID: 876 case MSR_IA32_P5_MC_ADDR: 877 case MSR_IA32_P5_MC_TYPE: 878 case MSR_IA32_MC0_CTL: 879 case MSR_IA32_MCG_STATUS: 880 case MSR_IA32_MCG_CAP: 881 case MSR_IA32_MCG_CTL: 882 case MSR_IA32_MC0_MISC: 883 case MSR_IA32_MC0_MISC+4: 884 case MSR_IA32_MC0_MISC+8: 885 case MSR_IA32_MC0_MISC+12: 886 case MSR_IA32_MC0_MISC+16: 887 case MSR_IA32_MC0_MISC+20: 888 case MSR_IA32_UCODE_REV: 889 case MSR_IA32_EBL_CR_POWERON: 890 case MSR_IA32_DEBUGCTLMSR: 891 case MSR_IA32_LASTBRANCHFROMIP: 892 case MSR_IA32_LASTBRANCHTOIP: 893 case MSR_IA32_LASTINTFROMIP: 894 case MSR_IA32_LASTINTTOIP: 895 case MSR_VM_HSAVE_PA: 896 data = 0; 897 break; 898 case MSR_MTRRcap: 899 data = 0x500 | KVM_NR_VAR_MTRR; 900 break; 901 case 0x200 ... 
0x2ff: 902 return get_msr_mtrr(vcpu, msr, pdata); 903 case 0xcd: /* fsb frequency */ 904 data = 3; 905 break; 906 case MSR_IA32_APICBASE: 907 data = kvm_get_apic_base(vcpu); 908 break; 909 case MSR_IA32_MISC_ENABLE: 910 data = vcpu->arch.ia32_misc_enable_msr; 911 break; 912 case MSR_IA32_PERF_STATUS: 913 /* TSC increment by tick */ 914 data = 1000ULL; 915 /* CPU multiplier */ 916 data |= (((uint64_t)4ULL) << 40); 917 break; 918 case MSR_EFER: 919 data = vcpu->arch.shadow_efer; 920 break; 921 case MSR_KVM_WALL_CLOCK: 922 data = vcpu->kvm->arch.wall_clock; 923 break; 924 case MSR_KVM_SYSTEM_TIME: 925 data = vcpu->arch.time; 926 break; 927 default: 928 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 929 return 1; 930 } 931 *pdata = data; 932 return 0; 933 } 934 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 935 936 /* 937 * Read or write a bunch of msrs. All parameters are kernel addresses. 938 * 939 * @return number of msrs set successfully. 940 */ 941 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 942 struct kvm_msr_entry *entries, 943 int (*do_msr)(struct kvm_vcpu *vcpu, 944 unsigned index, u64 *data)) 945 { 946 int i; 947 948 vcpu_load(vcpu); 949 950 down_read(&vcpu->kvm->slots_lock); 951 for (i = 0; i < msrs->nmsrs; ++i) 952 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 953 break; 954 up_read(&vcpu->kvm->slots_lock); 955 956 vcpu_put(vcpu); 957 958 return i; 959 } 960 961 /* 962 * Read or write a bunch of msrs. Parameters are user addresses. 963 * 964 * @return number of msrs set successfully. 965 */ 966 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 967 int (*do_msr)(struct kvm_vcpu *vcpu, 968 unsigned index, u64 *data), 969 int writeback) 970 { 971 struct kvm_msrs msrs; 972 struct kvm_msr_entry *entries; 973 int r, n; 974 unsigned size; 975 976 r = -EFAULT; 977 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 978 goto out; 979 980 r = -E2BIG; 981 if (msrs.nmsrs >= MAX_IO_MSRS) 982 goto out; 983 984 r = -ENOMEM; 985 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 986 entries = vmalloc(size); 987 if (!entries) 988 goto out; 989 990 r = -EFAULT; 991 if (copy_from_user(entries, user_msrs->entries, size)) 992 goto out_free; 993 994 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 995 if (r < 0) 996 goto out_free; 997 998 r = -EFAULT; 999 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1000 goto out_free; 1001 1002 r = n; 1003 1004 out_free: 1005 vfree(entries); 1006 out: 1007 return r; 1008 } 1009 1010 int kvm_dev_ioctl_check_extension(long ext) 1011 { 1012 int r; 1013 1014 switch (ext) { 1015 case KVM_CAP_IRQCHIP: 1016 case KVM_CAP_HLT: 1017 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1018 case KVM_CAP_SET_TSS_ADDR: 1019 case KVM_CAP_EXT_CPUID: 1020 case KVM_CAP_CLOCKSOURCE: 1021 case KVM_CAP_PIT: 1022 case KVM_CAP_NOP_IO_DELAY: 1023 case KVM_CAP_MP_STATE: 1024 case KVM_CAP_SYNC_MMU: 1025 case KVM_CAP_REINJECT_CONTROL: 1026 case KVM_CAP_IRQ_INJECT_STATUS: 1027 r = 1; 1028 break; 1029 case KVM_CAP_COALESCED_MMIO: 1030 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1031 break; 1032 case KVM_CAP_VAPIC: 1033 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1034 break; 1035 case KVM_CAP_NR_VCPUS: 1036 r = KVM_MAX_VCPUS; 1037 break; 1038 case KVM_CAP_NR_MEMSLOTS: 1039 r = KVM_MEMORY_SLOTS; 1040 break; 1041 case KVM_CAP_PV_MMU: 1042 r = !tdp_enabled; 1043 break; 1044 case KVM_CAP_IOMMU: 1045 r = iommu_found(); 1046 break; 1047 default: 1048 r = 0; 1049 break; 1050 } 1051 return r; 1052 1053 } 1054 1055 long kvm_arch_dev_ioctl(struct file *filp, 
1056 unsigned int ioctl, unsigned long arg) 1057 { 1058 void __user *argp = (void __user *)arg; 1059 long r; 1060 1061 switch (ioctl) { 1062 case KVM_GET_MSR_INDEX_LIST: { 1063 struct kvm_msr_list __user *user_msr_list = argp; 1064 struct kvm_msr_list msr_list; 1065 unsigned n; 1066 1067 r = -EFAULT; 1068 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1069 goto out; 1070 n = msr_list.nmsrs; 1071 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1072 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1073 goto out; 1074 r = -E2BIG; 1075 if (n < num_msrs_to_save) 1076 goto out; 1077 r = -EFAULT; 1078 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1079 num_msrs_to_save * sizeof(u32))) 1080 goto out; 1081 if (copy_to_user(user_msr_list->indices 1082 + num_msrs_to_save * sizeof(u32), 1083 &emulated_msrs, 1084 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1085 goto out; 1086 r = 0; 1087 break; 1088 } 1089 case KVM_GET_SUPPORTED_CPUID: { 1090 struct kvm_cpuid2 __user *cpuid_arg = argp; 1091 struct kvm_cpuid2 cpuid; 1092 1093 r = -EFAULT; 1094 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1095 goto out; 1096 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1097 cpuid_arg->entries); 1098 if (r) 1099 goto out; 1100 1101 r = -EFAULT; 1102 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1103 goto out; 1104 r = 0; 1105 break; 1106 } 1107 default: 1108 r = -EINVAL; 1109 } 1110 out: 1111 return r; 1112 } 1113 1114 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1115 { 1116 kvm_x86_ops->vcpu_load(vcpu, cpu); 1117 kvm_request_guest_time_update(vcpu); 1118 } 1119 1120 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1121 { 1122 kvm_x86_ops->vcpu_put(vcpu); 1123 kvm_put_guest_fpu(vcpu); 1124 } 1125 1126 static int is_efer_nx(void) 1127 { 1128 unsigned long long efer = 0; 1129 1130 rdmsrl_safe(MSR_EFER, &efer); 1131 return efer & EFER_NX; 1132 } 1133 1134 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1135 { 1136 int i; 1137 struct kvm_cpuid_entry2 *e, *entry; 1138 1139 entry = NULL; 1140 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1141 e = &vcpu->arch.cpuid_entries[i]; 1142 if (e->function == 0x80000001) { 1143 entry = e; 1144 break; 1145 } 1146 } 1147 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1148 entry->edx &= ~(1 << 20); 1149 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1150 } 1151 } 1152 1153 /* when an old userspace process fills a new kernel module */ 1154 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1155 struct kvm_cpuid *cpuid, 1156 struct kvm_cpuid_entry __user *entries) 1157 { 1158 int r, i; 1159 struct kvm_cpuid_entry *cpuid_entries; 1160 1161 r = -E2BIG; 1162 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1163 goto out; 1164 r = -ENOMEM; 1165 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1166 if (!cpuid_entries) 1167 goto out; 1168 r = -EFAULT; 1169 if (copy_from_user(cpuid_entries, entries, 1170 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1171 goto out_free; 1172 for (i = 0; i < cpuid->nent; i++) { 1173 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1174 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1175 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1176 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1177 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1178 vcpu->arch.cpuid_entries[i].index = 0; 1179 vcpu->arch.cpuid_entries[i].flags = 0; 1180 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1181 
vcpu->arch.cpuid_entries[i].padding[1] = 0; 1182 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1183 } 1184 vcpu->arch.cpuid_nent = cpuid->nent; 1185 cpuid_fix_nx_cap(vcpu); 1186 r = 0; 1187 1188 out_free: 1189 vfree(cpuid_entries); 1190 out: 1191 return r; 1192 } 1193 1194 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1195 struct kvm_cpuid2 *cpuid, 1196 struct kvm_cpuid_entry2 __user *entries) 1197 { 1198 int r; 1199 1200 r = -E2BIG; 1201 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1202 goto out; 1203 r = -EFAULT; 1204 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1205 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1206 goto out; 1207 vcpu->arch.cpuid_nent = cpuid->nent; 1208 return 0; 1209 1210 out: 1211 return r; 1212 } 1213 1214 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1215 struct kvm_cpuid2 *cpuid, 1216 struct kvm_cpuid_entry2 __user *entries) 1217 { 1218 int r; 1219 1220 r = -E2BIG; 1221 if (cpuid->nent < vcpu->arch.cpuid_nent) 1222 goto out; 1223 r = -EFAULT; 1224 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1225 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1226 goto out; 1227 return 0; 1228 1229 out: 1230 cpuid->nent = vcpu->arch.cpuid_nent; 1231 return r; 1232 } 1233 1234 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1235 u32 index) 1236 { 1237 entry->function = function; 1238 entry->index = index; 1239 cpuid_count(entry->function, entry->index, 1240 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1241 entry->flags = 0; 1242 } 1243 1244 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1245 u32 index, int *nent, int maxnent) 1246 { 1247 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1248 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1249 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1250 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1251 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1252 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | 1253 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1254 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | 1255 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | 1256 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); 1257 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | 1258 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1259 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1260 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1261 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1262 bit(X86_FEATURE_PGE) | 1263 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1264 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | 1265 bit(X86_FEATURE_SYSCALL) | 1266 (is_efer_nx() ? 
bit(X86_FEATURE_NX) : 0) | 1267 #ifdef CONFIG_X86_64 1268 bit(X86_FEATURE_LM) | 1269 #endif 1270 bit(X86_FEATURE_FXSR_OPT) | 1271 bit(X86_FEATURE_MMXEXT) | 1272 bit(X86_FEATURE_3DNOWEXT) | 1273 bit(X86_FEATURE_3DNOW); 1274 const u32 kvm_supported_word3_x86_features = 1275 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1276 const u32 kvm_supported_word6_x86_features = 1277 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | 1278 bit(X86_FEATURE_SVM); 1279 1280 /* all calls to cpuid_count() should be made on the same cpu */ 1281 get_cpu(); 1282 do_cpuid_1_ent(entry, function, index); 1283 ++*nent; 1284 1285 switch (function) { 1286 case 0: 1287 entry->eax = min(entry->eax, (u32)0xb); 1288 break; 1289 case 1: 1290 entry->edx &= kvm_supported_word0_x86_features; 1291 entry->ecx &= kvm_supported_word3_x86_features; 1292 break; 1293 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1294 * may return different values. This forces us to get_cpu() before 1295 * issuing the first command, and also to emulate this annoying behavior 1296 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1297 case 2: { 1298 int t, times = entry->eax & 0xff; 1299 1300 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1301 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1302 for (t = 1; t < times && *nent < maxnent; ++t) { 1303 do_cpuid_1_ent(&entry[t], function, 0); 1304 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1305 ++*nent; 1306 } 1307 break; 1308 } 1309 /* function 4 and 0xb have additional index. */ 1310 case 4: { 1311 int i, cache_type; 1312 1313 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1314 /* read more entries until cache_type is zero */ 1315 for (i = 1; *nent < maxnent; ++i) { 1316 cache_type = entry[i - 1].eax & 0x1f; 1317 if (!cache_type) 1318 break; 1319 do_cpuid_1_ent(&entry[i], function, i); 1320 entry[i].flags |= 1321 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1322 ++*nent; 1323 } 1324 break; 1325 } 1326 case 0xb: { 1327 int i, level_type; 1328 1329 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1330 /* read more entries until level_type is zero */ 1331 for (i = 1; *nent < maxnent; ++i) { 1332 level_type = entry[i - 1].ecx & 0xff00; 1333 if (!level_type) 1334 break; 1335 do_cpuid_1_ent(&entry[i], function, i); 1336 entry[i].flags |= 1337 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1338 ++*nent; 1339 } 1340 break; 1341 } 1342 case 0x80000000: 1343 entry->eax = min(entry->eax, 0x8000001a); 1344 break; 1345 case 0x80000001: 1346 entry->edx &= kvm_supported_word1_x86_features; 1347 entry->ecx &= kvm_supported_word6_x86_features; 1348 break; 1349 } 1350 put_cpu(); 1351 } 1352 1353 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1354 struct kvm_cpuid_entry2 __user *entries) 1355 { 1356 struct kvm_cpuid_entry2 *cpuid_entries; 1357 int limit, nent = 0, r = -E2BIG; 1358 u32 func; 1359 1360 if (cpuid->nent < 1) 1361 goto out; 1362 r = -ENOMEM; 1363 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1364 if (!cpuid_entries) 1365 goto out; 1366 1367 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1368 limit = cpuid_entries[0].eax; 1369 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1370 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1371 &nent, cpuid->nent); 1372 r = -E2BIG; 1373 if (nent >= cpuid->nent) 1374 goto out_free; 1375 1376 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1377 limit = cpuid_entries[nent - 1].eax; 1378 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1379 
do_cpuid_ent(&cpuid_entries[nent], func, 0, 1380 &nent, cpuid->nent); 1381 r = -EFAULT; 1382 if (copy_to_user(entries, cpuid_entries, 1383 nent * sizeof(struct kvm_cpuid_entry2))) 1384 goto out_free; 1385 cpuid->nent = nent; 1386 r = 0; 1387 1388 out_free: 1389 vfree(cpuid_entries); 1390 out: 1391 return r; 1392 } 1393 1394 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1395 struct kvm_lapic_state *s) 1396 { 1397 vcpu_load(vcpu); 1398 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1399 vcpu_put(vcpu); 1400 1401 return 0; 1402 } 1403 1404 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1405 struct kvm_lapic_state *s) 1406 { 1407 vcpu_load(vcpu); 1408 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1409 kvm_apic_post_state_restore(vcpu); 1410 vcpu_put(vcpu); 1411 1412 return 0; 1413 } 1414 1415 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1416 struct kvm_interrupt *irq) 1417 { 1418 if (irq->irq < 0 || irq->irq >= 256) 1419 return -EINVAL; 1420 if (irqchip_in_kernel(vcpu->kvm)) 1421 return -ENXIO; 1422 vcpu_load(vcpu); 1423 1424 set_bit(irq->irq, vcpu->arch.irq_pending); 1425 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 1426 1427 vcpu_put(vcpu); 1428 1429 return 0; 1430 } 1431 1432 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1433 { 1434 vcpu_load(vcpu); 1435 kvm_inject_nmi(vcpu); 1436 vcpu_put(vcpu); 1437 1438 return 0; 1439 } 1440 1441 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1442 struct kvm_tpr_access_ctl *tac) 1443 { 1444 if (tac->flags) 1445 return -EINVAL; 1446 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1447 return 0; 1448 } 1449 1450 long kvm_arch_vcpu_ioctl(struct file *filp, 1451 unsigned int ioctl, unsigned long arg) 1452 { 1453 struct kvm_vcpu *vcpu = filp->private_data; 1454 void __user *argp = (void __user *)arg; 1455 int r; 1456 struct kvm_lapic_state *lapic = NULL; 1457 1458 switch (ioctl) { 1459 case KVM_GET_LAPIC: { 1460 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1461 1462 r = -ENOMEM; 1463 if (!lapic) 1464 goto out; 1465 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1466 if (r) 1467 goto out; 1468 r = -EFAULT; 1469 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1470 goto out; 1471 r = 0; 1472 break; 1473 } 1474 case KVM_SET_LAPIC: { 1475 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1476 r = -ENOMEM; 1477 if (!lapic) 1478 goto out; 1479 r = -EFAULT; 1480 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1481 goto out; 1482 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1483 if (r) 1484 goto out; 1485 r = 0; 1486 break; 1487 } 1488 case KVM_INTERRUPT: { 1489 struct kvm_interrupt irq; 1490 1491 r = -EFAULT; 1492 if (copy_from_user(&irq, argp, sizeof irq)) 1493 goto out; 1494 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1495 if (r) 1496 goto out; 1497 r = 0; 1498 break; 1499 } 1500 case KVM_NMI: { 1501 r = kvm_vcpu_ioctl_nmi(vcpu); 1502 if (r) 1503 goto out; 1504 r = 0; 1505 break; 1506 } 1507 case KVM_SET_CPUID: { 1508 struct kvm_cpuid __user *cpuid_arg = argp; 1509 struct kvm_cpuid cpuid; 1510 1511 r = -EFAULT; 1512 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1513 goto out; 1514 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1515 if (r) 1516 goto out; 1517 break; 1518 } 1519 case KVM_SET_CPUID2: { 1520 struct kvm_cpuid2 __user *cpuid_arg = argp; 1521 struct kvm_cpuid2 cpuid; 1522 1523 r = -EFAULT; 1524 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1525 goto out; 1526 r = 
kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1527 cpuid_arg->entries); 1528 if (r) 1529 goto out; 1530 break; 1531 } 1532 case KVM_GET_CPUID2: { 1533 struct kvm_cpuid2 __user *cpuid_arg = argp; 1534 struct kvm_cpuid2 cpuid; 1535 1536 r = -EFAULT; 1537 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1538 goto out; 1539 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1540 cpuid_arg->entries); 1541 if (r) 1542 goto out; 1543 r = -EFAULT; 1544 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1545 goto out; 1546 r = 0; 1547 break; 1548 } 1549 case KVM_GET_MSRS: 1550 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1551 break; 1552 case KVM_SET_MSRS: 1553 r = msr_io(vcpu, argp, do_set_msr, 0); 1554 break; 1555 case KVM_TPR_ACCESS_REPORTING: { 1556 struct kvm_tpr_access_ctl tac; 1557 1558 r = -EFAULT; 1559 if (copy_from_user(&tac, argp, sizeof tac)) 1560 goto out; 1561 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1562 if (r) 1563 goto out; 1564 r = -EFAULT; 1565 if (copy_to_user(argp, &tac, sizeof tac)) 1566 goto out; 1567 r = 0; 1568 break; 1569 }; 1570 case KVM_SET_VAPIC_ADDR: { 1571 struct kvm_vapic_addr va; 1572 1573 r = -EINVAL; 1574 if (!irqchip_in_kernel(vcpu->kvm)) 1575 goto out; 1576 r = -EFAULT; 1577 if (copy_from_user(&va, argp, sizeof va)) 1578 goto out; 1579 r = 0; 1580 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1581 break; 1582 } 1583 default: 1584 r = -EINVAL; 1585 } 1586 out: 1587 if (lapic) 1588 kfree(lapic); 1589 return r; 1590 } 1591 1592 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1593 { 1594 int ret; 1595 1596 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1597 return -1; 1598 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1599 return ret; 1600 } 1601 1602 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1603 u32 kvm_nr_mmu_pages) 1604 { 1605 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1606 return -EINVAL; 1607 1608 down_write(&kvm->slots_lock); 1609 1610 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1611 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1612 1613 up_write(&kvm->slots_lock); 1614 return 0; 1615 } 1616 1617 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1618 { 1619 return kvm->arch.n_alloc_mmu_pages; 1620 } 1621 1622 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1623 { 1624 int i; 1625 struct kvm_mem_alias *alias; 1626 1627 for (i = 0; i < kvm->arch.naliases; ++i) { 1628 alias = &kvm->arch.aliases[i]; 1629 if (gfn >= alias->base_gfn 1630 && gfn < alias->base_gfn + alias->npages) 1631 return alias->target_gfn + gfn - alias->base_gfn; 1632 } 1633 return gfn; 1634 } 1635 1636 /* 1637 * Set a new alias region. Aliases map a portion of physical memory into 1638 * another portion. This is useful for memory windows, for example the PC 1639 * VGA region. 
1640 */ 1641 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1642 struct kvm_memory_alias *alias) 1643 { 1644 int r, n; 1645 struct kvm_mem_alias *p; 1646 1647 r = -EINVAL; 1648 /* General sanity checks */ 1649 if (alias->memory_size & (PAGE_SIZE - 1)) 1650 goto out; 1651 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1652 goto out; 1653 if (alias->slot >= KVM_ALIAS_SLOTS) 1654 goto out; 1655 if (alias->guest_phys_addr + alias->memory_size 1656 < alias->guest_phys_addr) 1657 goto out; 1658 if (alias->target_phys_addr + alias->memory_size 1659 < alias->target_phys_addr) 1660 goto out; 1661 1662 down_write(&kvm->slots_lock); 1663 spin_lock(&kvm->mmu_lock); 1664 1665 p = &kvm->arch.aliases[alias->slot]; 1666 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1667 p->npages = alias->memory_size >> PAGE_SHIFT; 1668 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1669 1670 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1671 if (kvm->arch.aliases[n - 1].npages) 1672 break; 1673 kvm->arch.naliases = n; 1674 1675 spin_unlock(&kvm->mmu_lock); 1676 kvm_mmu_zap_all(kvm); 1677 1678 up_write(&kvm->slots_lock); 1679 1680 return 0; 1681 1682 out: 1683 return r; 1684 } 1685 1686 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1687 { 1688 int r; 1689 1690 r = 0; 1691 switch (chip->chip_id) { 1692 case KVM_IRQCHIP_PIC_MASTER: 1693 memcpy(&chip->chip.pic, 1694 &pic_irqchip(kvm)->pics[0], 1695 sizeof(struct kvm_pic_state)); 1696 break; 1697 case KVM_IRQCHIP_PIC_SLAVE: 1698 memcpy(&chip->chip.pic, 1699 &pic_irqchip(kvm)->pics[1], 1700 sizeof(struct kvm_pic_state)); 1701 break; 1702 case KVM_IRQCHIP_IOAPIC: 1703 memcpy(&chip->chip.ioapic, 1704 ioapic_irqchip(kvm), 1705 sizeof(struct kvm_ioapic_state)); 1706 break; 1707 default: 1708 r = -EINVAL; 1709 break; 1710 } 1711 return r; 1712 } 1713 1714 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1715 { 1716 int r; 1717 1718 r = 0; 1719 switch (chip->chip_id) { 1720 case KVM_IRQCHIP_PIC_MASTER: 1721 memcpy(&pic_irqchip(kvm)->pics[0], 1722 &chip->chip.pic, 1723 sizeof(struct kvm_pic_state)); 1724 break; 1725 case KVM_IRQCHIP_PIC_SLAVE: 1726 memcpy(&pic_irqchip(kvm)->pics[1], 1727 &chip->chip.pic, 1728 sizeof(struct kvm_pic_state)); 1729 break; 1730 case KVM_IRQCHIP_IOAPIC: 1731 memcpy(ioapic_irqchip(kvm), 1732 &chip->chip.ioapic, 1733 sizeof(struct kvm_ioapic_state)); 1734 break; 1735 default: 1736 r = -EINVAL; 1737 break; 1738 } 1739 kvm_pic_update_irq(pic_irqchip(kvm)); 1740 return r; 1741 } 1742 1743 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1744 { 1745 int r = 0; 1746 1747 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1748 return r; 1749 } 1750 1751 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1752 { 1753 int r = 0; 1754 1755 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1756 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1757 return r; 1758 } 1759 1760 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 1761 struct kvm_reinject_control *control) 1762 { 1763 if (!kvm->arch.vpit) 1764 return -ENXIO; 1765 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 1766 return 0; 1767 } 1768 1769 /* 1770 * Get (and clear) the dirty memory log for a memory slot. 
1771 */ 1772 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1773 struct kvm_dirty_log *log) 1774 { 1775 int r; 1776 int n; 1777 struct kvm_memory_slot *memslot; 1778 int is_dirty = 0; 1779 1780 down_write(&kvm->slots_lock); 1781 1782 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1783 if (r) 1784 goto out; 1785 1786 /* If nothing is dirty, don't bother messing with page tables. */ 1787 if (is_dirty) { 1788 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1789 kvm_flush_remote_tlbs(kvm); 1790 memslot = &kvm->memslots[log->slot]; 1791 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1792 memset(memslot->dirty_bitmap, 0, n); 1793 } 1794 r = 0; 1795 out: 1796 up_write(&kvm->slots_lock); 1797 return r; 1798 } 1799 1800 long kvm_arch_vm_ioctl(struct file *filp, 1801 unsigned int ioctl, unsigned long arg) 1802 { 1803 struct kvm *kvm = filp->private_data; 1804 void __user *argp = (void __user *)arg; 1805 int r = -EINVAL; 1806 /* 1807 * This union makes it completely explicit to gcc-3.x 1808 * that these two variables' stack usage should be 1809 * combined, not added together. 1810 */ 1811 union { 1812 struct kvm_pit_state ps; 1813 struct kvm_memory_alias alias; 1814 } u; 1815 1816 switch (ioctl) { 1817 case KVM_SET_TSS_ADDR: 1818 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1819 if (r < 0) 1820 goto out; 1821 break; 1822 case KVM_SET_MEMORY_REGION: { 1823 struct kvm_memory_region kvm_mem; 1824 struct kvm_userspace_memory_region kvm_userspace_mem; 1825 1826 r = -EFAULT; 1827 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1828 goto out; 1829 kvm_userspace_mem.slot = kvm_mem.slot; 1830 kvm_userspace_mem.flags = kvm_mem.flags; 1831 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1832 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1833 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1834 if (r) 1835 goto out; 1836 break; 1837 } 1838 case KVM_SET_NR_MMU_PAGES: 1839 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1840 if (r) 1841 goto out; 1842 break; 1843 case KVM_GET_NR_MMU_PAGES: 1844 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1845 break; 1846 case KVM_SET_MEMORY_ALIAS: 1847 r = -EFAULT; 1848 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 1849 goto out; 1850 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 1851 if (r) 1852 goto out; 1853 break; 1854 case KVM_CREATE_IRQCHIP: 1855 r = -ENOMEM; 1856 kvm->arch.vpic = kvm_create_pic(kvm); 1857 if (kvm->arch.vpic) { 1858 r = kvm_ioapic_init(kvm); 1859 if (r) { 1860 kfree(kvm->arch.vpic); 1861 kvm->arch.vpic = NULL; 1862 goto out; 1863 } 1864 } else 1865 goto out; 1866 r = kvm_setup_default_irq_routing(kvm); 1867 if (r) { 1868 kfree(kvm->arch.vpic); 1869 kfree(kvm->arch.vioapic); 1870 goto out; 1871 } 1872 break; 1873 case KVM_CREATE_PIT: 1874 mutex_lock(&kvm->lock); 1875 r = -EEXIST; 1876 if (kvm->arch.vpit) 1877 goto create_pit_unlock; 1878 r = -ENOMEM; 1879 kvm->arch.vpit = kvm_create_pit(kvm); 1880 if (kvm->arch.vpit) 1881 r = 0; 1882 create_pit_unlock: 1883 mutex_unlock(&kvm->lock); 1884 break; 1885 case KVM_IRQ_LINE_STATUS: 1886 case KVM_IRQ_LINE: { 1887 struct kvm_irq_level irq_event; 1888 1889 r = -EFAULT; 1890 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1891 goto out; 1892 if (irqchip_in_kernel(kvm)) { 1893 __s32 status; 1894 mutex_lock(&kvm->lock); 1895 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1896 irq_event.irq, irq_event.level); 1897 mutex_unlock(&kvm->lock); 1898 if (ioctl == KVM_IRQ_LINE_STATUS) { 1899 irq_event.status = status; 1900 if (copy_to_user(argp, &irq_event, 
1901 sizeof irq_event)) 1902 goto out; 1903 } 1904 r = 0; 1905 } 1906 break; 1907 } 1908 case KVM_GET_IRQCHIP: { 1909 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1910 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1911 1912 r = -ENOMEM; 1913 if (!chip) 1914 goto out; 1915 r = -EFAULT; 1916 if (copy_from_user(chip, argp, sizeof *chip)) 1917 goto get_irqchip_out; 1918 r = -ENXIO; 1919 if (!irqchip_in_kernel(kvm)) 1920 goto get_irqchip_out; 1921 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 1922 if (r) 1923 goto get_irqchip_out; 1924 r = -EFAULT; 1925 if (copy_to_user(argp, chip, sizeof *chip)) 1926 goto get_irqchip_out; 1927 r = 0; 1928 get_irqchip_out: 1929 kfree(chip); 1930 if (r) 1931 goto out; 1932 break; 1933 } 1934 case KVM_SET_IRQCHIP: { 1935 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1936 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1937 1938 r = -ENOMEM; 1939 if (!chip) 1940 goto out; 1941 r = -EFAULT; 1942 if (copy_from_user(chip, argp, sizeof *chip)) 1943 goto set_irqchip_out; 1944 r = -ENXIO; 1945 if (!irqchip_in_kernel(kvm)) 1946 goto set_irqchip_out; 1947 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 1948 if (r) 1949 goto set_irqchip_out; 1950 r = 0; 1951 set_irqchip_out: 1952 kfree(chip); 1953 if (r) 1954 goto out; 1955 break; 1956 } 1957 case KVM_GET_PIT: { 1958 r = -EFAULT; 1959 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 1960 goto out; 1961 r = -ENXIO; 1962 if (!kvm->arch.vpit) 1963 goto out; 1964 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 1965 if (r) 1966 goto out; 1967 r = -EFAULT; 1968 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 1969 goto out; 1970 r = 0; 1971 break; 1972 } 1973 case KVM_SET_PIT: { 1974 r = -EFAULT; 1975 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 1976 goto out; 1977 r = -ENXIO; 1978 if (!kvm->arch.vpit) 1979 goto out; 1980 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 1981 if (r) 1982 goto out; 1983 r = 0; 1984 break; 1985 } 1986 case KVM_REINJECT_CONTROL: { 1987 struct kvm_reinject_control control; 1988 r = -EFAULT; 1989 if (copy_from_user(&control, argp, sizeof(control))) 1990 goto out; 1991 r = kvm_vm_ioctl_reinject(kvm, &control); 1992 if (r) 1993 goto out; 1994 r = 0; 1995 break; 1996 } 1997 default: 1998 ; 1999 } 2000 out: 2001 return r; 2002 } 2003 2004 static void kvm_init_msr_list(void) 2005 { 2006 u32 dummy[2]; 2007 unsigned i, j; 2008 2009 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2010 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2011 continue; 2012 if (j < i) 2013 msrs_to_save[j] = msrs_to_save[i]; 2014 j++; 2015 } 2016 num_msrs_to_save = j; 2017 } 2018 2019 /* 2020 * Only apic need an MMIO device hook, so shortcut now.. 
2021 */ 2022 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 2023 gpa_t addr, int len, 2024 int is_write) 2025 { 2026 struct kvm_io_device *dev; 2027 2028 if (vcpu->arch.apic) { 2029 dev = &vcpu->arch.apic->dev; 2030 if (dev->in_range(dev, addr, len, is_write)) 2031 return dev; 2032 } 2033 return NULL; 2034 } 2035 2036 2037 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 2038 gpa_t addr, int len, 2039 int is_write) 2040 { 2041 struct kvm_io_device *dev; 2042 2043 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 2044 if (dev == NULL) 2045 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, 2046 is_write); 2047 return dev; 2048 } 2049 2050 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2051 struct kvm_vcpu *vcpu) 2052 { 2053 void *data = val; 2054 int r = X86EMUL_CONTINUE; 2055 2056 while (bytes) { 2057 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2058 unsigned offset = addr & (PAGE_SIZE-1); 2059 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2060 int ret; 2061 2062 if (gpa == UNMAPPED_GVA) { 2063 r = X86EMUL_PROPAGATE_FAULT; 2064 goto out; 2065 } 2066 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2067 if (ret < 0) { 2068 r = X86EMUL_UNHANDLEABLE; 2069 goto out; 2070 } 2071 2072 bytes -= toread; 2073 data += toread; 2074 addr += toread; 2075 } 2076 out: 2077 return r; 2078 } 2079 2080 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2081 struct kvm_vcpu *vcpu) 2082 { 2083 void *data = val; 2084 int r = X86EMUL_CONTINUE; 2085 2086 while (bytes) { 2087 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2088 unsigned offset = addr & (PAGE_SIZE-1); 2089 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2090 int ret; 2091 2092 if (gpa == UNMAPPED_GVA) { 2093 r = X86EMUL_PROPAGATE_FAULT; 2094 goto out; 2095 } 2096 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2097 if (ret < 0) { 2098 r = X86EMUL_UNHANDLEABLE; 2099 goto out; 2100 } 2101 2102 bytes -= towrite; 2103 data += towrite; 2104 addr += towrite; 2105 } 2106 out: 2107 return r; 2108 } 2109 2110 2111 static int emulator_read_emulated(unsigned long addr, 2112 void *val, 2113 unsigned int bytes, 2114 struct kvm_vcpu *vcpu) 2115 { 2116 struct kvm_io_device *mmio_dev; 2117 gpa_t gpa; 2118 2119 if (vcpu->mmio_read_completed) { 2120 memcpy(val, vcpu->mmio_data, bytes); 2121 vcpu->mmio_read_completed = 0; 2122 return X86EMUL_CONTINUE; 2123 } 2124 2125 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2126 2127 /* For APIC access vmexit */ 2128 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2129 goto mmio; 2130 2131 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2132 == X86EMUL_CONTINUE) 2133 return X86EMUL_CONTINUE; 2134 if (gpa == UNMAPPED_GVA) 2135 return X86EMUL_PROPAGATE_FAULT; 2136 2137 mmio: 2138 /* 2139 * Is this MMIO handled locally? 
2140 */ 2141 mutex_lock(&vcpu->kvm->lock); 2142 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2143 if (mmio_dev) { 2144 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 2145 mutex_unlock(&vcpu->kvm->lock); 2146 return X86EMUL_CONTINUE; 2147 } 2148 mutex_unlock(&vcpu->kvm->lock); 2149 2150 vcpu->mmio_needed = 1; 2151 vcpu->mmio_phys_addr = gpa; 2152 vcpu->mmio_size = bytes; 2153 vcpu->mmio_is_write = 0; 2154 2155 return X86EMUL_UNHANDLEABLE; 2156 } 2157 2158 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2159 const void *val, int bytes) 2160 { 2161 int ret; 2162 2163 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2164 if (ret < 0) 2165 return 0; 2166 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2167 return 1; 2168 } 2169 2170 static int emulator_write_emulated_onepage(unsigned long addr, 2171 const void *val, 2172 unsigned int bytes, 2173 struct kvm_vcpu *vcpu) 2174 { 2175 struct kvm_io_device *mmio_dev; 2176 gpa_t gpa; 2177 2178 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2179 2180 if (gpa == UNMAPPED_GVA) { 2181 kvm_inject_page_fault(vcpu, addr, 2); 2182 return X86EMUL_PROPAGATE_FAULT; 2183 } 2184 2185 /* For APIC access vmexit */ 2186 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2187 goto mmio; 2188 2189 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2190 return X86EMUL_CONTINUE; 2191 2192 mmio: 2193 /* 2194 * Is this MMIO handled locally? 2195 */ 2196 mutex_lock(&vcpu->kvm->lock); 2197 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); 2198 if (mmio_dev) { 2199 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2200 mutex_unlock(&vcpu->kvm->lock); 2201 return X86EMUL_CONTINUE; 2202 } 2203 mutex_unlock(&vcpu->kvm->lock); 2204 2205 vcpu->mmio_needed = 1; 2206 vcpu->mmio_phys_addr = gpa; 2207 vcpu->mmio_size = bytes; 2208 vcpu->mmio_is_write = 1; 2209 memcpy(vcpu->mmio_data, val, bytes); 2210 2211 return X86EMUL_CONTINUE; 2212 } 2213 2214 int emulator_write_emulated(unsigned long addr, 2215 const void *val, 2216 unsigned int bytes, 2217 struct kvm_vcpu *vcpu) 2218 { 2219 /* Crossing a page boundary? 
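 *
 * Example: a 4-byte write at addr 0x1ffe with 4K pages crosses a page,
 * since ((0x1ffe + 3) ^ 0x1ffe) & PAGE_MASK != 0.  The first chunk is
 * now = -0x1ffe & ~PAGE_MASK = 2 bytes (0x1ffe-0x1fff), and the
 * remaining 2 bytes are written separately starting at 0x2000.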
*/ 2220 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2221 int rc, now; 2222 2223 now = -addr & ~PAGE_MASK; 2224 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2225 if (rc != X86EMUL_CONTINUE) 2226 return rc; 2227 addr += now; 2228 val += now; 2229 bytes -= now; 2230 } 2231 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2232 } 2233 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2234 2235 static int emulator_cmpxchg_emulated(unsigned long addr, 2236 const void *old, 2237 const void *new, 2238 unsigned int bytes, 2239 struct kvm_vcpu *vcpu) 2240 { 2241 static int reported; 2242 2243 if (!reported) { 2244 reported = 1; 2245 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 2246 } 2247 #ifndef CONFIG_X86_64 2248 /* guests cmpxchg8b have to be emulated atomically */ 2249 if (bytes == 8) { 2250 gpa_t gpa; 2251 struct page *page; 2252 char *kaddr; 2253 u64 val; 2254 2255 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2256 2257 if (gpa == UNMAPPED_GVA || 2258 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2259 goto emul_write; 2260 2261 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2262 goto emul_write; 2263 2264 val = *(u64 *)new; 2265 2266 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2267 2268 kaddr = kmap_atomic(page, KM_USER0); 2269 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2270 kunmap_atomic(kaddr, KM_USER0); 2271 kvm_release_page_dirty(page); 2272 } 2273 emul_write: 2274 #endif 2275 2276 return emulator_write_emulated(addr, new, bytes, vcpu); 2277 } 2278 2279 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2280 { 2281 return kvm_x86_ops->get_segment_base(vcpu, seg); 2282 } 2283 2284 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2285 { 2286 kvm_mmu_invlpg(vcpu, address); 2287 return X86EMUL_CONTINUE; 2288 } 2289 2290 int emulate_clts(struct kvm_vcpu *vcpu) 2291 { 2292 KVMTRACE_0D(CLTS, vcpu, handler); 2293 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2294 return X86EMUL_CONTINUE; 2295 } 2296 2297 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2298 { 2299 struct kvm_vcpu *vcpu = ctxt->vcpu; 2300 2301 switch (dr) { 2302 case 0 ... 3: 2303 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2304 return X86EMUL_CONTINUE; 2305 default: 2306 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2307 return X86EMUL_UNHANDLEABLE; 2308 } 2309 } 2310 2311 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2312 { 2313 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
						      ~0ULL : ~0U;
	int exception;

	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
	u8 opcodes[4];
	unsigned long rip = kvm_rip_read(vcpu);
	unsigned long rip_linear;

	if (!printk_ratelimit())
		return;

	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);

	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

static struct x86_emulate_ops emulate_ops = {
	.read_std		= kvm_read_guest_virt,
	.read_emulated		= emulator_read_emulated,
	.write_emulated		= emulator_write_emulated,
	.cmpxchg_emulated	= emulator_cmpxchg_emulated,
};

static void cache_all_regs(struct kvm_vcpu *vcpu)
{
	kvm_register_read(vcpu, VCPU_REGS_RAX);
	kvm_register_read(vcpu, VCPU_REGS_RSP);
	kvm_register_read(vcpu, VCPU_REGS_RIP);
	vcpu->arch.regs_dirty = ~0;
}

int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code,
			int emulation_type)
{
	int r;
	struct decode_cache *c;

	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
	 * TODO: fix x86_emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses; that can save a few hundred
	 * cycles on Intel for instructions that don't read or change RSP,
	 * for example.
	 */
	cache_all_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->arch.pio.string = 0;

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->arch.emulate_ctxt.vcpu = vcpu;
		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
		vcpu->arch.emulate_ctxt.mode =
			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 : cs_db
			?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2390 2391 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2392 2393 /* Reject the instructions other than VMCALL/VMMCALL when 2394 * try to emulate invalid opcode */ 2395 c = &vcpu->arch.emulate_ctxt.decode; 2396 if ((emulation_type & EMULTYPE_TRAP_UD) && 2397 (!(c->twobyte && c->b == 0x01 && 2398 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2399 c->modrm_mod == 3 && c->modrm_rm == 1))) 2400 return EMULATE_FAIL; 2401 2402 ++vcpu->stat.insn_emulation; 2403 if (r) { 2404 ++vcpu->stat.insn_emulation_fail; 2405 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2406 return EMULATE_DONE; 2407 return EMULATE_FAIL; 2408 } 2409 } 2410 2411 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2412 2413 if (vcpu->arch.pio.string) 2414 return EMULATE_DO_MMIO; 2415 2416 if ((r || vcpu->mmio_is_write) && run) { 2417 run->exit_reason = KVM_EXIT_MMIO; 2418 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2419 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2420 run->mmio.len = vcpu->mmio_size; 2421 run->mmio.is_write = vcpu->mmio_is_write; 2422 } 2423 2424 if (r) { 2425 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2426 return EMULATE_DONE; 2427 if (!vcpu->mmio_needed) { 2428 kvm_report_emulation_failure(vcpu, "mmio"); 2429 return EMULATE_FAIL; 2430 } 2431 return EMULATE_DO_MMIO; 2432 } 2433 2434 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2435 2436 if (vcpu->mmio_is_write) { 2437 vcpu->mmio_needed = 0; 2438 return EMULATE_DO_MMIO; 2439 } 2440 2441 return EMULATE_DONE; 2442 } 2443 EXPORT_SYMBOL_GPL(emulate_instruction); 2444 2445 static int pio_copy_data(struct kvm_vcpu *vcpu) 2446 { 2447 void *p = vcpu->arch.pio_data; 2448 gva_t q = vcpu->arch.pio.guest_gva; 2449 unsigned bytes; 2450 int ret; 2451 2452 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2453 if (vcpu->arch.pio.in) 2454 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2455 else 2456 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2457 return ret; 2458 } 2459 2460 int complete_pio(struct kvm_vcpu *vcpu) 2461 { 2462 struct kvm_pio_request *io = &vcpu->arch.pio; 2463 long delta; 2464 int r; 2465 unsigned long val; 2466 2467 if (!io->string) { 2468 if (io->in) { 2469 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2470 memcpy(&val, vcpu->arch.pio_data, io->size); 2471 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2472 } 2473 } else { 2474 if (io->in) { 2475 r = pio_copy_data(vcpu); 2476 if (r) 2477 return r; 2478 } 2479 2480 delta = 1; 2481 if (io->rep) { 2482 delta *= io->cur_count; 2483 /* 2484 * The size of the register should really depend on 2485 * current address size. 
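 *
 * (As written, the full RCX is read back and rewritten below; on real
 * hardware a 16-bit "rep outsb", for instance, would only update CX and
 * leave the upper bits of the register untouched.)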
2486 */ 2487 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2488 val -= delta; 2489 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2490 } 2491 if (io->down) 2492 delta = -delta; 2493 delta *= io->size; 2494 if (io->in) { 2495 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2496 val += delta; 2497 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2498 } else { 2499 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2500 val += delta; 2501 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2502 } 2503 } 2504 2505 io->count -= io->cur_count; 2506 io->cur_count = 0; 2507 2508 return 0; 2509 } 2510 2511 static void kernel_pio(struct kvm_io_device *pio_dev, 2512 struct kvm_vcpu *vcpu, 2513 void *pd) 2514 { 2515 /* TODO: String I/O for in kernel device */ 2516 2517 mutex_lock(&vcpu->kvm->lock); 2518 if (vcpu->arch.pio.in) 2519 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2520 vcpu->arch.pio.size, 2521 pd); 2522 else 2523 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2524 vcpu->arch.pio.size, 2525 pd); 2526 mutex_unlock(&vcpu->kvm->lock); 2527 } 2528 2529 static void pio_string_write(struct kvm_io_device *pio_dev, 2530 struct kvm_vcpu *vcpu) 2531 { 2532 struct kvm_pio_request *io = &vcpu->arch.pio; 2533 void *pd = vcpu->arch.pio_data; 2534 int i; 2535 2536 mutex_lock(&vcpu->kvm->lock); 2537 for (i = 0; i < io->cur_count; i++) { 2538 kvm_iodevice_write(pio_dev, io->port, 2539 io->size, 2540 pd); 2541 pd += io->size; 2542 } 2543 mutex_unlock(&vcpu->kvm->lock); 2544 } 2545 2546 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2547 gpa_t addr, int len, 2548 int is_write) 2549 { 2550 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); 2551 } 2552 2553 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2554 int size, unsigned port) 2555 { 2556 struct kvm_io_device *pio_dev; 2557 unsigned long val; 2558 2559 vcpu->run->exit_reason = KVM_EXIT_IO; 2560 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2561 vcpu->run->io.size = vcpu->arch.pio.size = size; 2562 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2563 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2564 vcpu->run->io.port = vcpu->arch.pio.port = port; 2565 vcpu->arch.pio.in = in; 2566 vcpu->arch.pio.string = 0; 2567 vcpu->arch.pio.down = 0; 2568 vcpu->arch.pio.rep = 0; 2569 2570 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2571 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2572 handler); 2573 else 2574 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2575 handler); 2576 2577 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2578 memcpy(vcpu->arch.pio_data, &val, 4); 2579 2580 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2581 if (pio_dev) { 2582 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2583 complete_pio(vcpu); 2584 return 1; 2585 } 2586 return 0; 2587 } 2588 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2589 2590 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2591 int size, unsigned long count, int down, 2592 gva_t address, int rep, unsigned port) 2593 { 2594 unsigned now, in_page; 2595 int ret = 0; 2596 struct kvm_io_device *pio_dev; 2597 2598 vcpu->run->exit_reason = KVM_EXIT_IO; 2599 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2600 vcpu->run->io.size = vcpu->arch.pio.size = size; 2601 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2602 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2603 vcpu->run->io.port = vcpu->arch.pio.port = port; 2604 vcpu->arch.pio.in = in; 2605 vcpu->arch.pio.string = 1; 2606 vcpu->arch.pio.down = down; 2607 vcpu->arch.pio.rep = rep; 2608 2609 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2610 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2611 handler); 2612 else 2613 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2614 handler); 2615 2616 if (!count) { 2617 kvm_x86_ops->skip_emulated_instruction(vcpu); 2618 return 1; 2619 } 2620 2621 if (!down) 2622 in_page = PAGE_SIZE - offset_in_page(address); 2623 else 2624 in_page = offset_in_page(address) + size; 2625 now = min(count, (unsigned long)in_page / size); 2626 if (!now) 2627 now = 1; 2628 if (down) { 2629 /* 2630 * String I/O in reverse. Yuck. Kill the guest, fix later. 2631 */ 2632 pr_unimpl(vcpu, "guest string pio down\n"); 2633 kvm_inject_gp(vcpu, 0); 2634 return 1; 2635 } 2636 vcpu->run->io.count = now; 2637 vcpu->arch.pio.cur_count = now; 2638 2639 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2640 kvm_x86_ops->skip_emulated_instruction(vcpu); 2641 2642 vcpu->arch.pio.guest_gva = address; 2643 2644 pio_dev = vcpu_find_pio_dev(vcpu, port, 2645 vcpu->arch.pio.cur_count, 2646 !vcpu->arch.pio.in); 2647 if (!vcpu->arch.pio.in) { 2648 /* string PIO write */ 2649 ret = pio_copy_data(vcpu); 2650 if (ret == X86EMUL_PROPAGATE_FAULT) { 2651 kvm_inject_gp(vcpu, 0); 2652 return 1; 2653 } 2654 if (ret == 0 && pio_dev) { 2655 pio_string_write(pio_dev, vcpu); 2656 complete_pio(vcpu); 2657 if (vcpu->arch.pio.count == 0) 2658 ret = 1; 2659 } 2660 } else if (pio_dev) 2661 pr_unimpl(vcpu, "no string pio read support yet, " 2662 "port %x size %d count %ld\n", 2663 port, size, count); 2664 2665 return ret; 2666 } 2667 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2668 2669 static void bounce_off(void *info) 2670 { 2671 /* nothing */ 2672 } 2673 2674 static unsigned int ref_freq; 2675 static unsigned long tsc_khz_ref; 2676 2677 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 2678 void *data) 2679 { 2680 struct cpufreq_freqs *freq = data; 2681 struct kvm *kvm; 2682 struct kvm_vcpu *vcpu; 2683 int i, send_ipi = 0; 2684 2685 if (!ref_freq) 2686 ref_freq = freq->old; 2687 2688 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 2689 return 0; 2690 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 2691 return 0; 2692 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 2693 2694 spin_lock(&kvm_lock); 2695 list_for_each_entry(kvm, &vm_list, vm_list) { 2696 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2697 vcpu = kvm->vcpus[i]; 2698 if (!vcpu) 2699 continue; 2700 if (vcpu->cpu != freq->cpu) 2701 continue; 2702 if (!kvm_request_guest_time_update(vcpu)) 2703 continue; 2704 if (vcpu->cpu != smp_processor_id()) 2705 send_ipi++; 2706 } 2707 } 2708 spin_unlock(&kvm_lock); 2709 2710 if (freq->old < freq->new && send_ipi) { 2711 /* 2712 * We upscale the frequency. Must make the guest 2713 * doesn't see old kvmclock values while running with 2714 * the new frequency, otherwise we risk the guest sees 2715 * time go backwards. 2716 * 2717 * In case we update the frequency for another cpu 2718 * (which might be in guest context) send an interrupt 2719 * to kick the cpu out of guest context. 
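 * (The IPI handler, bounce_off(), is deliberately empty; the interrupt
 * only exists to force the target cpu out of guest mode.  The per-cpu
 * rate itself was already rescaled above as
 * tsc_khz_ref * freq->new / ref_freq, e.g. a 2000000 kHz reference TSC
 * becomes 3000000 kHz when the cpu moves from 2000000 kHz to
 * 3000000 kHz.)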
Next time 2720 * guest context is entered kvmclock will be updated, 2721 * so the guest will not see stale values. 2722 */ 2723 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 2724 } 2725 return 0; 2726 } 2727 2728 static struct notifier_block kvmclock_cpufreq_notifier_block = { 2729 .notifier_call = kvmclock_cpufreq_notifier 2730 }; 2731 2732 int kvm_arch_init(void *opaque) 2733 { 2734 int r, cpu; 2735 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2736 2737 if (kvm_x86_ops) { 2738 printk(KERN_ERR "kvm: already loaded the other module\n"); 2739 r = -EEXIST; 2740 goto out; 2741 } 2742 2743 if (!ops->cpu_has_kvm_support()) { 2744 printk(KERN_ERR "kvm: no hardware support\n"); 2745 r = -EOPNOTSUPP; 2746 goto out; 2747 } 2748 if (ops->disabled_by_bios()) { 2749 printk(KERN_ERR "kvm: disabled by bios\n"); 2750 r = -EOPNOTSUPP; 2751 goto out; 2752 } 2753 2754 r = kvm_mmu_module_init(); 2755 if (r) 2756 goto out; 2757 2758 kvm_init_msr_list(); 2759 2760 kvm_x86_ops = ops; 2761 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2762 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2763 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2764 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2765 2766 for_each_possible_cpu(cpu) 2767 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2768 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 2769 tsc_khz_ref = tsc_khz; 2770 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 2771 CPUFREQ_TRANSITION_NOTIFIER); 2772 } 2773 2774 return 0; 2775 2776 out: 2777 return r; 2778 } 2779 2780 void kvm_arch_exit(void) 2781 { 2782 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 2783 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 2784 CPUFREQ_TRANSITION_NOTIFIER); 2785 kvm_x86_ops = NULL; 2786 kvm_mmu_module_exit(); 2787 } 2788 2789 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2790 { 2791 ++vcpu->stat.halt_exits; 2792 KVMTRACE_0D(HLT, vcpu, handler); 2793 if (irqchip_in_kernel(vcpu->kvm)) { 2794 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2795 return 1; 2796 } else { 2797 vcpu->run->exit_reason = KVM_EXIT_HLT; 2798 return 0; 2799 } 2800 } 2801 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2802 2803 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2804 unsigned long a1) 2805 { 2806 if (is_long_mode(vcpu)) 2807 return a0; 2808 else 2809 return a0 | ((gpa_t)a1 << 32); 2810 } 2811 2812 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2813 { 2814 unsigned long nr, a0, a1, a2, a3, ret; 2815 int r = 1; 2816 2817 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 2818 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 2819 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 2820 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 2821 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 2822 2823 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2824 2825 if (!is_long_mode(vcpu)) { 2826 nr &= 0xFFFFFFFF; 2827 a0 &= 0xFFFFFFFF; 2828 a1 &= 0xFFFFFFFF; 2829 a2 &= 0xFFFFFFFF; 2830 a3 &= 0xFFFFFFFF; 2831 } 2832 2833 switch (nr) { 2834 case KVM_HC_VAPIC_POLL_IRQ: 2835 ret = 0; 2836 break; 2837 case KVM_HC_MMU_OP: 2838 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2839 break; 2840 default: 2841 ret = -KVM_ENOSYS; 2842 break; 2843 } 2844 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 2845 ++vcpu->stat.hypercalls; 2846 return r; 2847 } 2848 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2849 2850 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2851 { 2852 char instruction[3]; 2853 int ret = 0; 2854 unsigned long rip = kvm_rip_read(vcpu); 2855 2856 2857 /* 2858 * Blow out the MMU to ensure that no other VCPU has an 
active mapping 2859 * to ensure that the updated hypercall appears atomically across all 2860 * VCPUs. 2861 */ 2862 kvm_mmu_zap_all(vcpu->kvm); 2863 2864 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2865 if (emulator_write_emulated(rip, instruction, 3, vcpu) 2866 != X86EMUL_CONTINUE) 2867 ret = -EFAULT; 2868 2869 return ret; 2870 } 2871 2872 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2873 { 2874 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2875 } 2876 2877 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2878 { 2879 struct descriptor_table dt = { limit, base }; 2880 2881 kvm_x86_ops->set_gdt(vcpu, &dt); 2882 } 2883 2884 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2885 { 2886 struct descriptor_table dt = { limit, base }; 2887 2888 kvm_x86_ops->set_idt(vcpu, &dt); 2889 } 2890 2891 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2892 unsigned long *rflags) 2893 { 2894 kvm_lmsw(vcpu, msw); 2895 *rflags = kvm_x86_ops->get_rflags(vcpu); 2896 } 2897 2898 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2899 { 2900 unsigned long value; 2901 2902 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2903 switch (cr) { 2904 case 0: 2905 value = vcpu->arch.cr0; 2906 break; 2907 case 2: 2908 value = vcpu->arch.cr2; 2909 break; 2910 case 3: 2911 value = vcpu->arch.cr3; 2912 break; 2913 case 4: 2914 value = vcpu->arch.cr4; 2915 break; 2916 case 8: 2917 value = kvm_get_cr8(vcpu); 2918 break; 2919 default: 2920 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2921 return 0; 2922 } 2923 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, 2924 (u32)((u64)value >> 32), handler); 2925 2926 return value; 2927 } 2928 2929 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2930 unsigned long *rflags) 2931 { 2932 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, 2933 (u32)((u64)val >> 32), handler); 2934 2935 switch (cr) { 2936 case 0: 2937 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2938 *rflags = kvm_x86_ops->get_rflags(vcpu); 2939 break; 2940 case 2: 2941 vcpu->arch.cr2 = val; 2942 break; 2943 case 3: 2944 kvm_set_cr3(vcpu, val); 2945 break; 2946 case 4: 2947 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2948 break; 2949 case 8: 2950 kvm_set_cr8(vcpu, val & 0xfUL); 2951 break; 2952 default: 2953 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2954 } 2955 } 2956 2957 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 2958 { 2959 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 2960 int j, nent = vcpu->arch.cpuid_nent; 2961 2962 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2963 /* when no next entry is found, the current entry[i] is reselected */ 2964 for (j = i + 1; ; j = (j + 1) % nent) { 2965 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2966 if (ej->function == e->function) { 2967 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2968 return j; 2969 } 2970 } 2971 return 0; /* silence gcc, even though control never reaches here */ 2972 } 2973 2974 /* find an entry with matching function, matching index (if needed), and that 2975 * should be read next (if it's stateful) */ 2976 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 2977 u32 function, u32 index) 2978 { 2979 if (e->function != function) 2980 return 0; 2981 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 2982 return 0; 2983 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 2984 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 2985 return 0; 2986 return 1; 
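	/*
	 * Note on stateful entries: when several cpuid_entries share a
	 * function (e.g. a multi-part leaf such as CPUID 2) and carry
	 * KVM_CPUID_FLAG_STATEFUL_FUNC, only the one currently marked
	 * KVM_CPUID_FLAG_STATE_READ_NEXT matches above, and the caller
	 * (kvm_find_cpuid_entry() below) then rotates the mark to the
	 * next entry with the same function, so repeated CPUID
	 * executions walk the parts in order.
	 */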
}

struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
					      u32 function, u32 index)
{
	int i;
	struct kvm_cpuid_entry2 *best = NULL;

	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		struct kvm_cpuid_entry2 *e;

		e = &vcpu->arch.cpuid_entries[i];
		if (is_matching_cpuid_entry(e, function, index)) {
			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
				move_to_next_stateful_cpuid_entry(vcpu, i);
			best = e;
			break;
		}
		/*
		 * Both basic or both extended?
		 */
		if (((e->function ^ function) & 0x80000000) == 0)
			if (!best || e->function > best->function)
				best = e;
	}
	return best;
}

void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	u32 function, index;
	struct kvm_cpuid_entry2 *best;

	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
	best = kvm_find_cpuid_entry(vcpu, function, index);
	if (best) {
		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
	}
	kvm_x86_ops->skip_emulated_instruction(vcpu);
	KVMTRACE_5D(CPUID, vcpu, function,
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
		    (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);

/*
 * Check if userspace requested an interrupt window, and that the
 * interrupt window is open.
 *
 * No need to exit to userspace if we already have an interrupt queued.
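 *
 * With a userspace irqchip the VMM typically asks for the window by
 * setting run->request_interrupt_window before KVM_RUN and, once the
 * resulting exit reports the window open, injects the vector itself.
 * A rough sketch (illustrative only; "vcpu_fd" and
 * "pick_pending_vector()" are placeholder names):
 *
 *	if (run->ready_for_interrupt_injection && run->if_flag) {
 *		struct kvm_interrupt intr = {
 *			.irq = pick_pending_vector(),
 *		};
 *		ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
 *	}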
3047 */ 3048 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3049 struct kvm_run *kvm_run) 3050 { 3051 return (!vcpu->arch.irq_summary && 3052 kvm_run->request_interrupt_window && 3053 vcpu->arch.interrupt_window_open && 3054 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 3055 } 3056 3057 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3058 struct kvm_run *kvm_run) 3059 { 3060 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3061 kvm_run->cr8 = kvm_get_cr8(vcpu); 3062 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3063 if (irqchip_in_kernel(vcpu->kvm)) 3064 kvm_run->ready_for_interrupt_injection = 1; 3065 else 3066 kvm_run->ready_for_interrupt_injection = 3067 (vcpu->arch.interrupt_window_open && 3068 vcpu->arch.irq_summary == 0); 3069 } 3070 3071 static void vapic_enter(struct kvm_vcpu *vcpu) 3072 { 3073 struct kvm_lapic *apic = vcpu->arch.apic; 3074 struct page *page; 3075 3076 if (!apic || !apic->vapic_addr) 3077 return; 3078 3079 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3080 3081 vcpu->arch.apic->vapic_page = page; 3082 } 3083 3084 static void vapic_exit(struct kvm_vcpu *vcpu) 3085 { 3086 struct kvm_lapic *apic = vcpu->arch.apic; 3087 3088 if (!apic || !apic->vapic_addr) 3089 return; 3090 3091 down_read(&vcpu->kvm->slots_lock); 3092 kvm_release_page_dirty(apic->vapic_page); 3093 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3094 up_read(&vcpu->kvm->slots_lock); 3095 } 3096 3097 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3098 { 3099 int r; 3100 3101 if (vcpu->requests) 3102 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3103 kvm_mmu_unload(vcpu); 3104 3105 r = kvm_mmu_reload(vcpu); 3106 if (unlikely(r)) 3107 goto out; 3108 3109 if (vcpu->requests) { 3110 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3111 __kvm_migrate_timers(vcpu); 3112 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3113 kvm_write_guest_time(vcpu); 3114 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3115 kvm_mmu_sync_roots(vcpu); 3116 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3117 kvm_x86_ops->tlb_flush(vcpu); 3118 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3119 &vcpu->requests)) { 3120 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3121 r = 0; 3122 goto out; 3123 } 3124 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3125 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3126 r = 0; 3127 goto out; 3128 } 3129 } 3130 3131 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3132 kvm_inject_pending_timer_irqs(vcpu); 3133 3134 preempt_disable(); 3135 3136 kvm_x86_ops->prepare_guest_switch(vcpu); 3137 kvm_load_guest_fpu(vcpu); 3138 3139 local_irq_disable(); 3140 3141 if (vcpu->requests || need_resched() || signal_pending(current)) { 3142 local_irq_enable(); 3143 preempt_enable(); 3144 r = 1; 3145 goto out; 3146 } 3147 3148 vcpu->guest_mode = 1; 3149 /* 3150 * Make sure that guest_mode assignment won't happen after 3151 * testing the pending IRQ vector bitmap. 
3152 */ 3153 smp_wmb(); 3154 3155 if (vcpu->arch.exception.pending) 3156 __queue_exception(vcpu); 3157 else if (irqchip_in_kernel(vcpu->kvm)) 3158 kvm_x86_ops->inject_pending_irq(vcpu); 3159 else 3160 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 3161 3162 kvm_lapic_sync_to_vapic(vcpu); 3163 3164 up_read(&vcpu->kvm->slots_lock); 3165 3166 kvm_guest_enter(); 3167 3168 get_debugreg(vcpu->arch.host_dr6, 6); 3169 get_debugreg(vcpu->arch.host_dr7, 7); 3170 if (unlikely(vcpu->arch.switch_db_regs)) { 3171 get_debugreg(vcpu->arch.host_db[0], 0); 3172 get_debugreg(vcpu->arch.host_db[1], 1); 3173 get_debugreg(vcpu->arch.host_db[2], 2); 3174 get_debugreg(vcpu->arch.host_db[3], 3); 3175 3176 set_debugreg(0, 7); 3177 set_debugreg(vcpu->arch.eff_db[0], 0); 3178 set_debugreg(vcpu->arch.eff_db[1], 1); 3179 set_debugreg(vcpu->arch.eff_db[2], 2); 3180 set_debugreg(vcpu->arch.eff_db[3], 3); 3181 } 3182 3183 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3184 kvm_x86_ops->run(vcpu, kvm_run); 3185 3186 if (unlikely(vcpu->arch.switch_db_regs)) { 3187 set_debugreg(0, 7); 3188 set_debugreg(vcpu->arch.host_db[0], 0); 3189 set_debugreg(vcpu->arch.host_db[1], 1); 3190 set_debugreg(vcpu->arch.host_db[2], 2); 3191 set_debugreg(vcpu->arch.host_db[3], 3); 3192 } 3193 set_debugreg(vcpu->arch.host_dr6, 6); 3194 set_debugreg(vcpu->arch.host_dr7, 7); 3195 3196 vcpu->guest_mode = 0; 3197 local_irq_enable(); 3198 3199 ++vcpu->stat.exits; 3200 3201 /* 3202 * We must have an instruction between local_irq_enable() and 3203 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3204 * the interrupt shadow. The stat.exits increment will do nicely. 3205 * But we need to prevent reordering, hence this barrier(): 3206 */ 3207 barrier(); 3208 3209 kvm_guest_exit(); 3210 3211 preempt_enable(); 3212 3213 down_read(&vcpu->kvm->slots_lock); 3214 3215 /* 3216 * Profile KVM exit RIPs: 3217 */ 3218 if (unlikely(prof_on == KVM_PROFILING)) { 3219 unsigned long rip = kvm_rip_read(vcpu); 3220 profile_hit(KVM_PROFILING, (void *)rip); 3221 } 3222 3223 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 3224 vcpu->arch.exception.pending = false; 3225 3226 kvm_lapic_sync_from_vapic(vcpu); 3227 3228 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3229 out: 3230 return r; 3231 } 3232 3233 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3234 { 3235 int r; 3236 3237 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3238 pr_debug("vcpu %d received sipi with vector # %x\n", 3239 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3240 kvm_lapic_reset(vcpu); 3241 r = kvm_arch_vcpu_reset(vcpu); 3242 if (r) 3243 return r; 3244 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3245 } 3246 3247 down_read(&vcpu->kvm->slots_lock); 3248 vapic_enter(vcpu); 3249 3250 r = 1; 3251 while (r > 0) { 3252 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3253 r = vcpu_enter_guest(vcpu, kvm_run); 3254 else { 3255 up_read(&vcpu->kvm->slots_lock); 3256 kvm_vcpu_block(vcpu); 3257 down_read(&vcpu->kvm->slots_lock); 3258 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3259 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3260 vcpu->arch.mp_state = 3261 KVM_MP_STATE_RUNNABLE; 3262 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3263 r = -EINTR; 3264 } 3265 3266 if (r > 0) { 3267 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3268 r = -EINTR; 3269 kvm_run->exit_reason = KVM_EXIT_INTR; 3270 ++vcpu->stat.request_irq_exits; 3271 } 3272 if (signal_pending(current)) { 3273 r = -EINTR; 3274 kvm_run->exit_reason = 
KVM_EXIT_INTR; 3275 ++vcpu->stat.signal_exits; 3276 } 3277 if (need_resched()) { 3278 up_read(&vcpu->kvm->slots_lock); 3279 kvm_resched(vcpu); 3280 down_read(&vcpu->kvm->slots_lock); 3281 } 3282 } 3283 } 3284 3285 up_read(&vcpu->kvm->slots_lock); 3286 post_kvm_run_save(vcpu, kvm_run); 3287 3288 vapic_exit(vcpu); 3289 3290 return r; 3291 } 3292 3293 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3294 { 3295 int r; 3296 sigset_t sigsaved; 3297 3298 vcpu_load(vcpu); 3299 3300 if (vcpu->sigset_active) 3301 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3302 3303 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3304 kvm_vcpu_block(vcpu); 3305 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3306 r = -EAGAIN; 3307 goto out; 3308 } 3309 3310 /* re-sync apic's tpr */ 3311 if (!irqchip_in_kernel(vcpu->kvm)) 3312 kvm_set_cr8(vcpu, kvm_run->cr8); 3313 3314 if (vcpu->arch.pio.cur_count) { 3315 r = complete_pio(vcpu); 3316 if (r) 3317 goto out; 3318 } 3319 #if CONFIG_HAS_IOMEM 3320 if (vcpu->mmio_needed) { 3321 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3322 vcpu->mmio_read_completed = 1; 3323 vcpu->mmio_needed = 0; 3324 3325 down_read(&vcpu->kvm->slots_lock); 3326 r = emulate_instruction(vcpu, kvm_run, 3327 vcpu->arch.mmio_fault_cr2, 0, 3328 EMULTYPE_NO_DECODE); 3329 up_read(&vcpu->kvm->slots_lock); 3330 if (r == EMULATE_DO_MMIO) { 3331 /* 3332 * Read-modify-write. Back to userspace. 3333 */ 3334 r = 0; 3335 goto out; 3336 } 3337 } 3338 #endif 3339 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3340 kvm_register_write(vcpu, VCPU_REGS_RAX, 3341 kvm_run->hypercall.ret); 3342 3343 r = __vcpu_run(vcpu, kvm_run); 3344 3345 out: 3346 if (vcpu->sigset_active) 3347 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3348 3349 vcpu_put(vcpu); 3350 return r; 3351 } 3352 3353 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3354 { 3355 vcpu_load(vcpu); 3356 3357 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3358 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3359 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3360 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3361 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3362 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3363 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3364 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3365 #ifdef CONFIG_X86_64 3366 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3367 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3368 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3369 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3370 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3371 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3372 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3373 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3374 #endif 3375 3376 regs->rip = kvm_rip_read(vcpu); 3377 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3378 3379 /* 3380 * Don't leak debug flags in case they were set for guest debugging 3381 */ 3382 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3383 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3384 3385 vcpu_put(vcpu); 3386 3387 return 0; 3388 } 3389 3390 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3391 { 3392 vcpu_load(vcpu); 3393 3394 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3395 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3396 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3397 
kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3398 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3399 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3400 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3401 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3402 #ifdef CONFIG_X86_64 3403 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3404 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3405 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3406 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3407 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3408 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3409 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3410 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3411 3412 #endif 3413 3414 kvm_rip_write(vcpu, regs->rip); 3415 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3416 3417 3418 vcpu->arch.exception.pending = false; 3419 3420 vcpu_put(vcpu); 3421 3422 return 0; 3423 } 3424 3425 void kvm_get_segment(struct kvm_vcpu *vcpu, 3426 struct kvm_segment *var, int seg) 3427 { 3428 kvm_x86_ops->get_segment(vcpu, var, seg); 3429 } 3430 3431 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3432 { 3433 struct kvm_segment cs; 3434 3435 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3436 *db = cs.db; 3437 *l = cs.l; 3438 } 3439 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3440 3441 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3442 struct kvm_sregs *sregs) 3443 { 3444 struct descriptor_table dt; 3445 int pending_vec; 3446 3447 vcpu_load(vcpu); 3448 3449 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3450 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3451 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3452 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3453 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3454 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3455 3456 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3457 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3458 3459 kvm_x86_ops->get_idt(vcpu, &dt); 3460 sregs->idt.limit = dt.limit; 3461 sregs->idt.base = dt.base; 3462 kvm_x86_ops->get_gdt(vcpu, &dt); 3463 sregs->gdt.limit = dt.limit; 3464 sregs->gdt.base = dt.base; 3465 3466 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3467 sregs->cr0 = vcpu->arch.cr0; 3468 sregs->cr2 = vcpu->arch.cr2; 3469 sregs->cr3 = vcpu->arch.cr3; 3470 sregs->cr4 = vcpu->arch.cr4; 3471 sregs->cr8 = kvm_get_cr8(vcpu); 3472 sregs->efer = vcpu->arch.shadow_efer; 3473 sregs->apic_base = kvm_get_apic_base(vcpu); 3474 3475 if (irqchip_in_kernel(vcpu->kvm)) { 3476 memset(sregs->interrupt_bitmap, 0, 3477 sizeof sregs->interrupt_bitmap); 3478 pending_vec = kvm_x86_ops->get_irq(vcpu); 3479 if (pending_vec >= 0) 3480 set_bit(pending_vec, 3481 (unsigned long *)sregs->interrupt_bitmap); 3482 } else 3483 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, 3484 sizeof sregs->interrupt_bitmap); 3485 3486 vcpu_put(vcpu); 3487 3488 return 0; 3489 } 3490 3491 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3492 struct kvm_mp_state *mp_state) 3493 { 3494 vcpu_load(vcpu); 3495 mp_state->mp_state = vcpu->arch.mp_state; 3496 vcpu_put(vcpu); 3497 return 0; 3498 } 3499 3500 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3501 struct kvm_mp_state *mp_state) 3502 { 3503 vcpu_load(vcpu); 3504 vcpu->arch.mp_state = mp_state->mp_state; 3505 vcpu_put(vcpu); 3506 return 0; 3507 } 3508 3509 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3510 struct kvm_segment *var, int seg) 3511 { 3512 
kvm_x86_ops->set_segment(vcpu, var, seg); 3513 } 3514 3515 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3516 struct kvm_segment *kvm_desct) 3517 { 3518 kvm_desct->base = seg_desc->base0; 3519 kvm_desct->base |= seg_desc->base1 << 16; 3520 kvm_desct->base |= seg_desc->base2 << 24; 3521 kvm_desct->limit = seg_desc->limit0; 3522 kvm_desct->limit |= seg_desc->limit << 16; 3523 if (seg_desc->g) { 3524 kvm_desct->limit <<= 12; 3525 kvm_desct->limit |= 0xfff; 3526 } 3527 kvm_desct->selector = selector; 3528 kvm_desct->type = seg_desc->type; 3529 kvm_desct->present = seg_desc->p; 3530 kvm_desct->dpl = seg_desc->dpl; 3531 kvm_desct->db = seg_desc->d; 3532 kvm_desct->s = seg_desc->s; 3533 kvm_desct->l = seg_desc->l; 3534 kvm_desct->g = seg_desc->g; 3535 kvm_desct->avl = seg_desc->avl; 3536 if (!selector) 3537 kvm_desct->unusable = 1; 3538 else 3539 kvm_desct->unusable = 0; 3540 kvm_desct->padding = 0; 3541 } 3542 3543 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 3544 u16 selector, 3545 struct descriptor_table *dtable) 3546 { 3547 if (selector & 1 << 2) { 3548 struct kvm_segment kvm_seg; 3549 3550 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3551 3552 if (kvm_seg.unusable) 3553 dtable->limit = 0; 3554 else 3555 dtable->limit = kvm_seg.limit; 3556 dtable->base = kvm_seg.base; 3557 } 3558 else 3559 kvm_x86_ops->get_gdt(vcpu, dtable); 3560 } 3561 3562 /* allowed just for 8 bytes segments */ 3563 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3564 struct desc_struct *seg_desc) 3565 { 3566 gpa_t gpa; 3567 struct descriptor_table dtable; 3568 u16 index = selector >> 3; 3569 3570 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3571 3572 if (dtable.limit < index * 8 + 7) { 3573 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3574 return 1; 3575 } 3576 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3577 gpa += index * 8; 3578 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); 3579 } 3580 3581 /* allowed just for 8 bytes segments */ 3582 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3583 struct desc_struct *seg_desc) 3584 { 3585 gpa_t gpa; 3586 struct descriptor_table dtable; 3587 u16 index = selector >> 3; 3588 3589 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3590 3591 if (dtable.limit < index * 8 + 7) 3592 return 1; 3593 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3594 gpa += index * 8; 3595 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); 3596 } 3597 3598 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3599 struct desc_struct *seg_desc) 3600 { 3601 u32 base_addr; 3602 3603 base_addr = seg_desc->base0; 3604 base_addr |= (seg_desc->base1 << 16); 3605 base_addr |= (seg_desc->base2 << 24); 3606 3607 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 3608 } 3609 3610 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3611 { 3612 struct kvm_segment kvm_seg; 3613 3614 kvm_get_segment(vcpu, &kvm_seg, seg); 3615 return kvm_seg.selector; 3616 } 3617 3618 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3619 u16 selector, 3620 struct kvm_segment *kvm_seg) 3621 { 3622 struct desc_struct seg_desc; 3623 3624 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3625 return 1; 3626 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3627 return 0; 3628 } 3629 3630 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 3631 { 3632 struct kvm_segment segvar = { 3633 .base = 
selector << 4, 3634 .limit = 0xffff, 3635 .selector = selector, 3636 .type = 3, 3637 .present = 1, 3638 .dpl = 3, 3639 .db = 0, 3640 .s = 1, 3641 .l = 0, 3642 .g = 0, 3643 .avl = 0, 3644 .unusable = 0, 3645 }; 3646 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 3647 return 0; 3648 } 3649 3650 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3651 int type_bits, int seg) 3652 { 3653 struct kvm_segment kvm_seg; 3654 3655 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 3656 return kvm_load_realmode_segment(vcpu, selector, seg); 3657 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3658 return 1; 3659 kvm_seg.type |= type_bits; 3660 3661 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3662 seg != VCPU_SREG_LDTR) 3663 if (!kvm_seg.s) 3664 kvm_seg.unusable = 1; 3665 3666 kvm_set_segment(vcpu, &kvm_seg, seg); 3667 return 0; 3668 } 3669 3670 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 3671 struct tss_segment_32 *tss) 3672 { 3673 tss->cr3 = vcpu->arch.cr3; 3674 tss->eip = kvm_rip_read(vcpu); 3675 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3676 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3677 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3678 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3679 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3680 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3681 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3682 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3683 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3684 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3685 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3686 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3687 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3688 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3689 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3690 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3691 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3692 } 3693 3694 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3695 struct tss_segment_32 *tss) 3696 { 3697 kvm_set_cr3(vcpu, tss->cr3); 3698 3699 kvm_rip_write(vcpu, tss->eip); 3700 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3701 3702 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 3703 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 3704 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 3705 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 3706 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 3707 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 3708 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 3709 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 3710 3711 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3712 return 1; 3713 3714 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3715 return 1; 3716 3717 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3718 return 1; 3719 3720 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3721 return 1; 3722 3723 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3724 return 1; 3725 3726 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3727 return 1; 3728 3729 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3730 return 1; 3731 return 0; 3732 } 3733 3734 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3735 struct tss_segment_16 *tss) 3736 { 3737 tss->ip = kvm_rip_read(vcpu); 3738 tss->flag = 
kvm_x86_ops->get_rflags(vcpu); 3739 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3740 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3741 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3742 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3743 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3744 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3745 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 3746 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 3747 3748 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3749 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3750 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3751 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3752 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3753 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3754 } 3755 3756 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3757 struct tss_segment_16 *tss) 3758 { 3759 kvm_rip_write(vcpu, tss->ip); 3760 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3761 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 3762 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 3763 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 3764 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 3765 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 3766 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 3767 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 3768 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 3769 3770 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3771 return 1; 3772 3773 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3774 return 1; 3775 3776 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3777 return 1; 3778 3779 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3780 return 1; 3781 3782 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3783 return 1; 3784 return 0; 3785 } 3786 3787 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3788 u32 old_tss_base, 3789 struct desc_struct *nseg_desc) 3790 { 3791 struct tss_segment_16 tss_segment_16; 3792 int ret = 0; 3793 3794 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3795 sizeof tss_segment_16)) 3796 goto out; 3797 3798 save_state_to_tss16(vcpu, &tss_segment_16); 3799 3800 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3801 sizeof tss_segment_16)) 3802 goto out; 3803 3804 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3805 &tss_segment_16, sizeof tss_segment_16)) 3806 goto out; 3807 3808 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3809 goto out; 3810 3811 ret = 1; 3812 out: 3813 return ret; 3814 } 3815 3816 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3817 u32 old_tss_base, 3818 struct desc_struct *nseg_desc) 3819 { 3820 struct tss_segment_32 tss_segment_32; 3821 int ret = 0; 3822 3823 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3824 sizeof tss_segment_32)) 3825 goto out; 3826 3827 save_state_to_tss32(vcpu, &tss_segment_32); 3828 3829 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3830 sizeof tss_segment_32)) 3831 goto out; 3832 3833 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3834 &tss_segment_32, sizeof tss_segment_32)) 3835 goto out; 3836 3837 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3838 goto out; 3839 3840 ret = 1; 3841 out: 3842 return ret; 3843 } 3844 3845 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3846 { 
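	/*
	 * Rough flow, mirroring a hardware task switch: read the old and
	 * new TSS descriptors, check the new descriptor's presence, limit
	 * and privilege, save current state into the old TSS, load state
	 * from the new one (16- or 32-bit layout depending on descriptor
	 * type), and fix up EFLAGS.NT and the descriptor busy bit
	 * according to whether we got here via IRET, JMP, CALL or a task
	 * gate.
	 */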
3847 struct kvm_segment tr_seg; 3848 struct desc_struct cseg_desc; 3849 struct desc_struct nseg_desc; 3850 int ret = 0; 3851 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 3852 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 3853 3854 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 3855 3856 /* FIXME: Handle errors. Failure to read either TSS or their 3857 * descriptors should generate a pagefault. 3858 */ 3859 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3860 goto out; 3861 3862 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 3863 goto out; 3864 3865 if (reason != TASK_SWITCH_IRET) { 3866 int cpl; 3867 3868 cpl = kvm_x86_ops->get_cpl(vcpu); 3869 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3870 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3871 return 1; 3872 } 3873 } 3874 3875 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3876 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3877 return 1; 3878 } 3879 3880 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3881 cseg_desc.type &= ~(1 << 1); //clear the B flag 3882 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 3883 } 3884 3885 if (reason == TASK_SWITCH_IRET) { 3886 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3887 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 3888 } 3889 3890 kvm_x86_ops->skip_emulated_instruction(vcpu); 3891 3892 if (nseg_desc.type & 8) 3893 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3894 &nseg_desc); 3895 else 3896 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 3897 &nseg_desc); 3898 3899 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3900 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3901 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 3902 } 3903 3904 if (reason != TASK_SWITCH_IRET) { 3905 nseg_desc.type |= (1 << 1); 3906 save_guest_segment_descriptor(vcpu, tss_selector, 3907 &nseg_desc); 3908 } 3909 3910 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3911 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3912 tr_seg.type = 11; 3913 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3914 out: 3915 return ret; 3916 } 3917 EXPORT_SYMBOL_GPL(kvm_task_switch); 3918 3919 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 3920 struct kvm_sregs *sregs) 3921 { 3922 int mmu_reset_needed = 0; 3923 int i, pending_vec, max_bits; 3924 struct descriptor_table dt; 3925 3926 vcpu_load(vcpu); 3927 3928 dt.limit = sregs->idt.limit; 3929 dt.base = sregs->idt.base; 3930 kvm_x86_ops->set_idt(vcpu, &dt); 3931 dt.limit = sregs->gdt.limit; 3932 dt.base = sregs->gdt.base; 3933 kvm_x86_ops->set_gdt(vcpu, &dt); 3934 3935 vcpu->arch.cr2 = sregs->cr2; 3936 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 3937 vcpu->arch.cr3 = sregs->cr3; 3938 3939 kvm_set_cr8(vcpu, sregs->cr8); 3940 3941 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 3942 kvm_x86_ops->set_efer(vcpu, sregs->efer); 3943 kvm_set_apic_base(vcpu, sregs->apic_base); 3944 3945 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3946 3947 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 3948 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 3949 vcpu->arch.cr0 = sregs->cr0; 3950 3951 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 3952 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 3953 if (!is_long_mode(vcpu) && is_pae(vcpu)) 3954 load_pdptrs(vcpu, vcpu->arch.cr3); 3955 3956 if (mmu_reset_needed) 3957 kvm_mmu_reset_context(vcpu); 3958 3959 if 
(!irqchip_in_kernel(vcpu->kvm)) { 3960 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 3961 sizeof vcpu->arch.irq_pending); 3962 vcpu->arch.irq_summary = 0; 3963 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 3964 if (vcpu->arch.irq_pending[i]) 3965 __set_bit(i, &vcpu->arch.irq_summary); 3966 } else { 3967 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 3968 pending_vec = find_first_bit( 3969 (const unsigned long *)sregs->interrupt_bitmap, 3970 max_bits); 3971 /* Only pending external irq is handled here */ 3972 if (pending_vec < max_bits) { 3973 kvm_x86_ops->set_irq(vcpu, pending_vec); 3974 pr_debug("Set back pending irq %d\n", 3975 pending_vec); 3976 } 3977 kvm_pic_clear_isr_ack(vcpu->kvm); 3978 } 3979 3980 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3981 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3982 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3983 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3984 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3985 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3986 3987 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3988 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3989 3990 /* Older userspace won't unhalt the vcpu on reset. */ 3991 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 3992 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 3993 !(vcpu->arch.cr0 & X86_CR0_PE)) 3994 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3995 3996 vcpu_put(vcpu); 3997 3998 return 0; 3999 } 4000 4001 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4002 struct kvm_guest_debug *dbg) 4003 { 4004 int i, r; 4005 4006 vcpu_load(vcpu); 4007 4008 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4009 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4010 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4011 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4012 vcpu->arch.switch_db_regs = 4013 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4014 } else { 4015 for (i = 0; i < KVM_NR_DB_REGS; i++) 4016 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4017 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4018 } 4019 4020 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4021 4022 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4023 kvm_queue_exception(vcpu, DB_VECTOR); 4024 else if (dbg->control & KVM_GUESTDBG_INJECT_BP) 4025 kvm_queue_exception(vcpu, BP_VECTOR); 4026 4027 vcpu_put(vcpu); 4028 4029 return r; 4030 } 4031 4032 /* 4033 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 4034 * we have asm/x86/processor.h 4035 */ 4036 struct fxsave { 4037 u16 cwd; 4038 u16 swd; 4039 u16 twd; 4040 u16 fop; 4041 u64 rip; 4042 u64 rdp; 4043 u32 mxcsr; 4044 u32 mxcsr_mask; 4045 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4046 #ifdef CONFIG_X86_64 4047 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4048 #else 4049 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4050 #endif 4051 }; 4052 4053 /* 4054 * Translate a guest virtual address to a guest physical address. 
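 *
 * This backs the KVM_TRANSLATE vcpu ioctl.  Userspace usage sketch
 * (illustrative only; "vcpu_fd" stands for the vcpu file descriptor):
 *
 *	struct kvm_translation tr = { .linear_address = gva };
 *
 *	if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
 *		printf("gva %llx -> gpa %llx\n", gva, tr.physical_address);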
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				  struct kvm_translation *tr)
{
	unsigned long vaddr = tr->linear_address;
	gpa_t gpa;

	vcpu_load(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
	up_read(&vcpu->kvm->slots_lock);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	tr->writeable = 1;
	tr->usermode = 0;
	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fpu->fpr, fxsave->st_space, 128);
	fpu->fcw = fxsave->cwd;
	fpu->fsw = fxsave->swd;
	fpu->ftwx = fxsave->twd;
	fpu->last_opcode = fxsave->fop;
	fpu->last_ip = fxsave->rip;
	fpu->last_dp = fxsave->rdp;
	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fxsave->st_space, fpu->fpr, 128);
	fxsave->cwd = fpu->fcw;
	fxsave->swd = fpu->fsw;
	fxsave->twd = fpu->ftwx;
	fxsave->fop = fpu->last_opcode;
	fxsave->rip = fpu->last_ip;
	fxsave->rdp = fpu->last_dp;
	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

void fx_init(struct kvm_vcpu *vcpu)
{
	unsigned after_mxcsr_mask;

	/*
	 * Touch the FPU the first time in a non-atomic context: if this is
	 * the first FPU instruction, the exception handler will fire before
	 * the instruction returns, and it will have to allocate RAM with
	 * GFP_KERNEL.
void fx_init(struct kvm_vcpu *vcpu)
{
	unsigned after_mxcsr_mask;

	/*
	 * Touch the fpu the first time in a non-atomic context: if this is
	 * the first fpu instruction, the exception handler fires before the
	 * instruction returns and may have to allocate ram with GFP_KERNEL.
	 */
	if (!used_math())
		kvm_fx_save(&vcpu->arch.host_fx_image);

	/* Initialize guest FPU by resetting ours and saving into guest's */
	preempt_disable();
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_finit();
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	preempt_enable();

	vcpu->arch.cr0 |= X86_CR0_ET;
	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);

void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 1;
	kvm_fx_save(&vcpu->arch.host_fx_image);
	kvm_fx_restore(&vcpu->arch.guest_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 0;
	kvm_fx_save(&vcpu->arch.guest_fx_image);
	kvm_fx_restore(&vcpu->arch.host_fx_image);
	++vcpu->stat.fpu_reload;
}
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);

void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.time_page) {
		kvm_release_page_dirty(vcpu->arch.time_page);
		vcpu->arch.time_page = NULL;
	}

	kvm_x86_ops->vcpu_free(vcpu);
}

struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
				      unsigned int id)
{
	return kvm_x86_ops->vcpu_create(kvm, id);
}
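/*
 * Illustrative sketch, not part of the original file: the expected pairing
 * of kvm_load_guest_fpu()/kvm_put_guest_fpu() above around a guest
 * entry/exit cycle, as a hypothetical caller would use them:
 *
 *	kvm_load_guest_fpu(vcpu);	// save host FPU, load guest image
 *	... enter and run the guest ...
 *	kvm_put_guest_fpu(vcpu);	// save guest image, restore host FPU
 *
 * Both helpers return early when the guest FPU is inactive or already in
 * the requested state, so redundant calls are harmless.
 */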
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
	int r;

	/* We do fxsave: this must be aligned. */
	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);

	vcpu->arch.mtrr_state.have_fixed = 1;
	vcpu_load(vcpu);
	r = kvm_arch_vcpu_reset(vcpu);
	if (r == 0)
		r = kvm_mmu_setup(vcpu);
	vcpu_put(vcpu);
	if (r < 0)
		goto free_vcpu;

	return 0;
free_vcpu:
	kvm_x86_ops->vcpu_free(vcpu);
	return r;
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);

	kvm_x86_ops->vcpu_free(vcpu);
}

int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = false;
	vcpu->arch.nmi_injected = false;

	vcpu->arch.switch_db_regs = 0;
	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
	vcpu->arch.dr6 = DR6_FIXED_1;
	vcpu->arch.dr7 = DR7_FIXED_1;

	return kvm_x86_ops->vcpu_reset(vcpu);
}

void kvm_arch_hardware_enable(void *garbage)
{
	kvm_x86_ops->hardware_enable(garbage);
}

void kvm_arch_hardware_disable(void *garbage)
{
	kvm_x86_ops->hardware_disable(garbage);
}

int kvm_arch_hardware_setup(void)
{
	return kvm_x86_ops->hardware_setup();
}

void kvm_arch_hardware_unsetup(void)
{
	kvm_x86_ops->hardware_unsetup();
}

void kvm_arch_check_processor_compat(void *rtn)
{
	kvm_x86_ops->check_processor_compatibility(rtn);
}

int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
	struct page *page;
	struct kvm *kvm;
	int r;

	BUG_ON(vcpu->kvm == NULL);
	kvm = vcpu->kvm;

	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	else
		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->arch.pio_data = page_address(page);

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto fail_free_pio_data;

	if (irqchip_in_kernel(kvm)) {
		r = kvm_create_lapic(vcpu);
		if (r < 0)
			goto fail_mmu_destroy;
	}

	return 0;

fail_mmu_destroy:
	kvm_mmu_destroy(vcpu);
fail_free_pio_data:
	free_page((unsigned long)vcpu->arch.pio_data);
fail:
	return r;
}

void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_free_lapic(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	kvm_mmu_destroy(vcpu);
	up_read(&vcpu->kvm->slots_lock);
	free_page((unsigned long)vcpu->arch.pio_data);
}

struct kvm *kvm_arch_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

	rdtscll(kvm->arch.vm_init_tsc);

	return kvm;
}

static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
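/*
 * Illustrative only, not part of the original file: kvm_arch_create_vm()
 * above reports allocation failure with ERR_PTR(-ENOMEM) rather than NULL,
 * so a hypothetical caller checks the result like this:
 *
 *	struct kvm *kvm = kvm_arch_create_vm();
 *
 *	if (IS_ERR(kvm))
 *		return PTR_ERR(kvm);
 */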
static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;

	/*
	 * Unpin any mmu pages first.
	 */
	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		if (kvm->vcpus[i]) {
			kvm_arch_vcpu_free(kvm->vcpus[i]);
			kvm->vcpus[i] = NULL;
		}
	}
}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kvm_free_pit(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
	kfree(kvm);
}

int kvm_arch_set_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem,
				struct kvm_memory_slot old,
				int user_alloc)
{
	int npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

	/*
	 * To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	kvm_flush_remote_tlbs(kvm);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending;
}

static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int ipi_pcpu = vcpu->cpu;
	int cpu = get_cpu();

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}
	/*
	 * We may be called synchronously with irqs disabled in guest mode,
	 * so there is no need to call smp_call_function_single() in that case.
	 */
	if (vcpu->guest_mode && vcpu->cpu != cpu)
		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
	put_cpu();
}
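/*
 * Illustrative only, not part of the original file: the slot update path
 * handled by kvm_arch_set_memory_region() above is normally driven from
 * userspace via KVM_SET_USER_MEMORY_REGION.  A minimal sketch, assuming
 * vm_fd is an open VM file descriptor and backing points to size bytes of
 * anonymous memory obtained with mmap():
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = size,
 *		.userspace_addr  = (__u64)(unsigned long)backing,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
 *		perror("KVM_SET_USER_MEMORY_REGION");
 */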