1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * derived from drivers/kvm/kvm_main.c 5 * 6 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc. 8 * Copyright IBM Corporation, 2008 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Amit Shah <amit.shah@qumranet.com> 14 * Ben-Ami Yassour <benami@il.ibm.com> 15 * 16 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * the COPYING file in the top-level directory. 18 * 19 */ 20 21 #include <linux/kvm_host.h> 22 #include "irq.h" 23 #include "mmu.h" 24 #include "i8254.h" 25 #include "tss.h" 26 #include "kvm_cache_regs.h" 27 #include "x86.h" 28 29 #include <linux/clocksource.h> 30 #include <linux/interrupt.h> 31 #include <linux/kvm.h> 32 #include <linux/fs.h> 33 #include <linux/vmalloc.h> 34 #include <linux/module.h> 35 #include <linux/mman.h> 36 #include <linux/highmem.h> 37 #include <linux/iommu.h> 38 #include <linux/intel-iommu.h> 39 #include <linux/cpufreq.h> 40 41 #include <asm/uaccess.h> 42 #include <asm/msr.h> 43 #include <asm/desc.h> 44 #include <asm/mtrr.h> 45 46 #define MAX_IO_MSRS 256 47 #define CR0_RESERVED_BITS \ 48 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 49 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 50 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 51 #define CR4_RESERVED_BITS \ 52 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 53 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 54 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 55 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 56 57 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 58 /* EFER defaults: 59 * - enable syscall per default because its emulated by KVM 60 * - enable LME and LMA per default on 64 bit KVM 61 */ 62 #ifdef CONFIG_X86_64 63 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 64 #else 65 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 66 #endif 67 
68 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 69 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 70 71 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 72 struct kvm_cpuid_entry2 __user *entries); 73 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 74 u32 function, u32 index); 75 76 struct kvm_x86_ops *kvm_x86_ops; 77 EXPORT_SYMBOL_GPL(kvm_x86_ops); 78 79 struct kvm_stats_debugfs_item debugfs_entries[] = { 80 { "pf_fixed", VCPU_STAT(pf_fixed) }, 81 { "pf_guest", VCPU_STAT(pf_guest) }, 82 { "tlb_flush", VCPU_STAT(tlb_flush) }, 83 { "invlpg", VCPU_STAT(invlpg) }, 84 { "exits", VCPU_STAT(exits) }, 85 { "io_exits", VCPU_STAT(io_exits) }, 86 { "mmio_exits", VCPU_STAT(mmio_exits) }, 87 { "signal_exits", VCPU_STAT(signal_exits) }, 88 { "irq_window", VCPU_STAT(irq_window_exits) }, 89 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 90 { "halt_exits", VCPU_STAT(halt_exits) }, 91 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 92 { "hypercalls", VCPU_STAT(hypercalls) }, 93 { "request_irq", VCPU_STAT(request_irq_exits) }, 94 { "irq_exits", VCPU_STAT(irq_exits) }, 95 { "host_state_reload", VCPU_STAT(host_state_reload) }, 96 { "efer_reload", VCPU_STAT(efer_reload) }, 97 { "fpu_reload", VCPU_STAT(fpu_reload) }, 98 { "insn_emulation", VCPU_STAT(insn_emulation) }, 99 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 100 { "irq_injections", VCPU_STAT(irq_injections) }, 101 { "nmi_injections", VCPU_STAT(nmi_injections) }, 102 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 103 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 104 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 105 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 106 { "mmu_flooded", VM_STAT(mmu_flooded) }, 107 { "mmu_recycled", VM_STAT(mmu_recycled) }, 108 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 109 { "mmu_unsync", VM_STAT(mmu_unsync) }, 110 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 111 { "largepages", VM_STAT(lpages) 
}, 112 { NULL } 113 }; 114 115 unsigned long segment_base(u16 selector) 116 { 117 struct descriptor_table gdt; 118 struct desc_struct *d; 119 unsigned long table_base; 120 unsigned long v; 121 122 if (selector == 0) 123 return 0; 124 125 asm("sgdt %0" : "=m"(gdt)); 126 table_base = gdt.base; 127 128 if (selector & 4) { /* from ldt */ 129 u16 ldt_selector; 130 131 asm("sldt %0" : "=g"(ldt_selector)); 132 table_base = segment_base(ldt_selector); 133 } 134 d = (struct desc_struct *)(table_base + (selector & ~7)); 135 v = d->base0 | ((unsigned long)d->base1 << 16) | 136 ((unsigned long)d->base2 << 24); 137 #ifdef CONFIG_X86_64 138 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 139 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 140 #endif 141 return v; 142 } 143 EXPORT_SYMBOL_GPL(segment_base); 144 145 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 146 { 147 if (irqchip_in_kernel(vcpu->kvm)) 148 return vcpu->arch.apic_base; 149 else 150 return vcpu->arch.apic_base; 151 } 152 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 153 154 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 155 { 156 /* TODO: reserve bits check */ 157 if (irqchip_in_kernel(vcpu->kvm)) 158 kvm_lapic_set_base(vcpu, data); 159 else 160 vcpu->arch.apic_base = data; 161 } 162 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 163 164 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 165 { 166 WARN_ON(vcpu->arch.exception.pending); 167 vcpu->arch.exception.pending = true; 168 vcpu->arch.exception.has_error_code = false; 169 vcpu->arch.exception.nr = nr; 170 } 171 EXPORT_SYMBOL_GPL(kvm_queue_exception); 172 173 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 174 u32 error_code) 175 { 176 ++vcpu->stat.pf_guest; 177 178 if (vcpu->arch.exception.pending) { 179 if (vcpu->arch.exception.nr == PF_VECTOR) { 180 printk(KERN_DEBUG "kvm: inject_page_fault:" 181 " double fault 0x%lx\n", addr); 182 vcpu->arch.exception.nr = DF_VECTOR; 183 
vcpu->arch.exception.error_code = 0; 184 } else if (vcpu->arch.exception.nr == DF_VECTOR) { 185 /* triple fault -> shutdown */ 186 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 187 } 188 return; 189 } 190 vcpu->arch.cr2 = addr; 191 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 192 } 193 194 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 195 { 196 vcpu->arch.nmi_pending = 1; 197 } 198 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 199 200 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 201 { 202 WARN_ON(vcpu->arch.exception.pending); 203 vcpu->arch.exception.pending = true; 204 vcpu->arch.exception.has_error_code = true; 205 vcpu->arch.exception.nr = nr; 206 vcpu->arch.exception.error_code = error_code; 207 } 208 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 209 210 static void __queue_exception(struct kvm_vcpu *vcpu) 211 { 212 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 213 vcpu->arch.exception.has_error_code, 214 vcpu->arch.exception.error_code); 215 } 216 217 /* 218 * Load the pae pdptrs. Return true is they are all valid. 
219 */ 220 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 221 { 222 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 223 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 224 int i; 225 int ret; 226 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 227 228 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 229 offset * sizeof(u64), sizeof(pdpte)); 230 if (ret < 0) { 231 ret = 0; 232 goto out; 233 } 234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 235 if (is_present_pte(pdpte[i]) && 236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 237 ret = 0; 238 goto out; 239 } 240 } 241 ret = 1; 242 243 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 244 out: 245 246 return ret; 247 } 248 EXPORT_SYMBOL_GPL(load_pdptrs); 249 250 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 251 { 252 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 253 bool changed = true; 254 int r; 255 256 if (is_long_mode(vcpu) || !is_pae(vcpu)) 257 return false; 258 259 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 260 if (r < 0) 261 goto out; 262 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 263 out: 264 265 return changed; 266 } 267 268 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 269 { 270 if (cr0 & CR0_RESERVED_BITS) { 271 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 272 cr0, vcpu->arch.cr0); 273 kvm_inject_gp(vcpu, 0); 274 return; 275 } 276 277 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 278 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 279 kvm_inject_gp(vcpu, 0); 280 return; 281 } 282 283 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 284 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 285 "and a clear PE flag\n"); 286 kvm_inject_gp(vcpu, 0); 287 return; 288 } 289 290 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 291 #ifdef CONFIG_X86_64 292 if ((vcpu->arch.shadow_efer & EFER_LME)) { 293 int cs_db, cs_l; 294 295 if (!is_pae(vcpu)) { 296 printk(KERN_DEBUG "set_cr0: #GP, start paging " 297 "in long 
mode while PAE is disabled\n"); 298 kvm_inject_gp(vcpu, 0); 299 return; 300 } 301 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 302 if (cs_l) { 303 printk(KERN_DEBUG "set_cr0: #GP, start paging " 304 "in long mode while CS.L == 1\n"); 305 kvm_inject_gp(vcpu, 0); 306 return; 307 308 } 309 } else 310 #endif 311 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 312 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 313 "reserved bits\n"); 314 kvm_inject_gp(vcpu, 0); 315 return; 316 } 317 318 } 319 320 kvm_x86_ops->set_cr0(vcpu, cr0); 321 vcpu->arch.cr0 = cr0; 322 323 kvm_mmu_reset_context(vcpu); 324 return; 325 } 326 EXPORT_SYMBOL_GPL(kvm_set_cr0); 327 328 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 329 { 330 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 331 KVMTRACE_1D(LMSW, vcpu, 332 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 333 handler); 334 } 335 EXPORT_SYMBOL_GPL(kvm_lmsw); 336 337 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 338 { 339 unsigned long old_cr4 = vcpu->arch.cr4; 340 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 341 342 if (cr4 & CR4_RESERVED_BITS) { 343 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 344 kvm_inject_gp(vcpu, 0); 345 return; 346 } 347 348 if (is_long_mode(vcpu)) { 349 if (!(cr4 & X86_CR4_PAE)) { 350 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 351 "in long mode\n"); 352 kvm_inject_gp(vcpu, 0); 353 return; 354 } 355 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 356 && ((cr4 ^ old_cr4) & pdptr_bits) 357 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 358 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 359 kvm_inject_gp(vcpu, 0); 360 return; 361 } 362 363 if (cr4 & X86_CR4_VMXE) { 364 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 365 kvm_inject_gp(vcpu, 0); 366 return; 367 } 368 kvm_x86_ops->set_cr4(vcpu, cr4); 369 vcpu->arch.cr4 = cr4; 370 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 371 
kvm_mmu_reset_context(vcpu); 372 } 373 EXPORT_SYMBOL_GPL(kvm_set_cr4); 374 375 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 376 { 377 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 378 kvm_mmu_sync_roots(vcpu); 379 kvm_mmu_flush_tlb(vcpu); 380 return; 381 } 382 383 if (is_long_mode(vcpu)) { 384 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 385 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 386 kvm_inject_gp(vcpu, 0); 387 return; 388 } 389 } else { 390 if (is_pae(vcpu)) { 391 if (cr3 & CR3_PAE_RESERVED_BITS) { 392 printk(KERN_DEBUG 393 "set_cr3: #GP, reserved bits\n"); 394 kvm_inject_gp(vcpu, 0); 395 return; 396 } 397 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 398 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 399 "reserved bits\n"); 400 kvm_inject_gp(vcpu, 0); 401 return; 402 } 403 } 404 /* 405 * We don't check reserved bits in nonpae mode, because 406 * this isn't enforced, and VMware depends on this. 407 */ 408 } 409 410 /* 411 * Does the new cr3 value map to physical memory? (Note, we 412 * catch an invalid cr3 even in real-mode, because it would 413 * cause trouble later on when we turn on paging anyway.) 414 * 415 * A real CPU would silently accept an invalid cr3 and would 416 * attempt to use it - with largely undefined (and often hard 417 * to debug) behavior on the guest side. 
418 */ 419 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 420 kvm_inject_gp(vcpu, 0); 421 else { 422 vcpu->arch.cr3 = cr3; 423 vcpu->arch.mmu.new_cr3(vcpu); 424 } 425 } 426 EXPORT_SYMBOL_GPL(kvm_set_cr3); 427 428 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 429 { 430 if (cr8 & CR8_RESERVED_BITS) { 431 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 432 kvm_inject_gp(vcpu, 0); 433 return; 434 } 435 if (irqchip_in_kernel(vcpu->kvm)) 436 kvm_lapic_set_tpr(vcpu, cr8); 437 else 438 vcpu->arch.cr8 = cr8; 439 } 440 EXPORT_SYMBOL_GPL(kvm_set_cr8); 441 442 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 443 { 444 if (irqchip_in_kernel(vcpu->kvm)) 445 return kvm_lapic_get_cr8(vcpu); 446 else 447 return vcpu->arch.cr8; 448 } 449 EXPORT_SYMBOL_GPL(kvm_get_cr8); 450 451 static inline u32 bit(int bitno) 452 { 453 return 1 << (bitno & 31); 454 } 455 456 /* 457 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 458 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 459 * 460 * This list is modified at module load time to reflect the 461 * capabilities of the host cpu. 
462 */ 463 static u32 msrs_to_save[] = { 464 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 465 MSR_K6_STAR, 466 #ifdef CONFIG_X86_64 467 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 468 #endif 469 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 470 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 471 }; 472 473 static unsigned num_msrs_to_save; 474 475 static u32 emulated_msrs[] = { 476 MSR_IA32_MISC_ENABLE, 477 }; 478 479 static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 480 { 481 if (efer & efer_reserved_bits) { 482 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 483 efer); 484 kvm_inject_gp(vcpu, 0); 485 return; 486 } 487 488 if (is_paging(vcpu) 489 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 490 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 491 kvm_inject_gp(vcpu, 0); 492 return; 493 } 494 495 if (efer & EFER_FFXSR) { 496 struct kvm_cpuid_entry2 *feat; 497 498 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 499 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 500 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); 501 kvm_inject_gp(vcpu, 0); 502 return; 503 } 504 } 505 506 if (efer & EFER_SVME) { 507 struct kvm_cpuid_entry2 *feat; 508 509 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 510 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 511 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); 512 kvm_inject_gp(vcpu, 0); 513 return; 514 } 515 } 516 517 kvm_x86_ops->set_efer(vcpu, efer); 518 519 efer &= ~EFER_LMA; 520 efer |= vcpu->arch.shadow_efer & EFER_LMA; 521 522 vcpu->arch.shadow_efer = efer; 523 524 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 525 kvm_mmu_reset_context(vcpu); 526 } 527 528 void kvm_enable_efer_bits(u64 mask) 529 { 530 efer_reserved_bits &= ~mask; 531 } 532 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 533 534 535 /* 536 * Writes msr value into into the appropriate 
"register". 537 * Returns 0 on success, non-0 otherwise. 538 * Assumes vcpu_load() was already called. 539 */ 540 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 541 { 542 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 543 } 544 545 /* 546 * Adapt set_msr() to msr_io()'s calling convention 547 */ 548 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 549 { 550 return kvm_set_msr(vcpu, index, *data); 551 } 552 553 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 554 { 555 static int version; 556 struct pvclock_wall_clock wc; 557 struct timespec now, sys, boot; 558 559 if (!wall_clock) 560 return; 561 562 version++; 563 564 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 565 566 /* 567 * The guest calculates current wall clock time by adding 568 * system time (updated by kvm_write_guest_time below) to the 569 * wall clock specified here. guest system time equals host 570 * system time for us, thus we must fill in host boot time here. 
571 */ 572 now = current_kernel_time(); 573 ktime_get_ts(&sys); 574 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 575 576 wc.sec = boot.tv_sec; 577 wc.nsec = boot.tv_nsec; 578 wc.version = version; 579 580 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 581 582 version++; 583 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 584 } 585 586 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 587 { 588 uint32_t quotient, remainder; 589 590 /* Don't try to replace with do_div(), this one calculates 591 * "(dividend << 32) / divisor" */ 592 __asm__ ( "divl %4" 593 : "=a" (quotient), "=d" (remainder) 594 : "0" (0), "1" (dividend), "r" (divisor) ); 595 return quotient; 596 } 597 598 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 599 { 600 uint64_t nsecs = 1000000000LL; 601 int32_t shift = 0; 602 uint64_t tps64; 603 uint32_t tps32; 604 605 tps64 = tsc_khz * 1000LL; 606 while (tps64 > nsecs*2) { 607 tps64 >>= 1; 608 shift--; 609 } 610 611 tps32 = (uint32_t)tps64; 612 while (tps32 <= (uint32_t)nsecs) { 613 tps32 <<= 1; 614 shift++; 615 } 616 617 hv_clock->tsc_shift = shift; 618 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 619 620 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 621 __func__, tsc_khz, hv_clock->tsc_shift, 622 hv_clock->tsc_to_system_mul); 623 } 624 625 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 626 627 static void kvm_write_guest_time(struct kvm_vcpu *v) 628 { 629 struct timespec ts; 630 unsigned long flags; 631 struct kvm_vcpu_arch *vcpu = &v->arch; 632 void *shared_kaddr; 633 unsigned long this_tsc_khz; 634 635 if ((!vcpu->time_page)) 636 return; 637 638 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 639 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 640 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 641 vcpu->hv_clock_tsc_khz = this_tsc_khz; 642 } 643 put_cpu_var(cpu_tsc_khz); 644 645 /* Keep irq disabled to prevent changes to the clock */ 
646 local_irq_save(flags); 647 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 648 &vcpu->hv_clock.tsc_timestamp); 649 ktime_get_ts(&ts); 650 local_irq_restore(flags); 651 652 /* With all the info we got, fill in the values */ 653 654 vcpu->hv_clock.system_time = ts.tv_nsec + 655 (NSEC_PER_SEC * (u64)ts.tv_sec); 656 /* 657 * The interface expects us to write an even number signaling that the 658 * update is finished. Since the guest won't see the intermediate 659 * state, we just increase by 2 at the end. 660 */ 661 vcpu->hv_clock.version += 2; 662 663 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 664 665 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 666 sizeof(vcpu->hv_clock)); 667 668 kunmap_atomic(shared_kaddr, KM_USER0); 669 670 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 671 } 672 673 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 674 { 675 struct kvm_vcpu_arch *vcpu = &v->arch; 676 677 if (!vcpu->time_page) 678 return 0; 679 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 680 return 1; 681 } 682 683 static bool msr_mtrr_valid(unsigned msr) 684 { 685 switch (msr) { 686 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 687 case MSR_MTRRfix64K_00000: 688 case MSR_MTRRfix16K_80000: 689 case MSR_MTRRfix16K_A0000: 690 case MSR_MTRRfix4K_C0000: 691 case MSR_MTRRfix4K_C8000: 692 case MSR_MTRRfix4K_D0000: 693 case MSR_MTRRfix4K_D8000: 694 case MSR_MTRRfix4K_E0000: 695 case MSR_MTRRfix4K_E8000: 696 case MSR_MTRRfix4K_F0000: 697 case MSR_MTRRfix4K_F8000: 698 case MSR_MTRRdefType: 699 case MSR_IA32_CR_PAT: 700 return true; 701 case 0x2f8: 702 return true; 703 } 704 return false; 705 } 706 707 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 708 { 709 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 710 711 if (!msr_mtrr_valid(msr)) 712 return 1; 713 714 if (msr == MSR_MTRRdefType) { 715 vcpu->arch.mtrr_state.def_type = data; 716 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 717 } else if (msr == MSR_MTRRfix64K_00000) 718 p[0] = data; 719 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 720 p[1 + msr - MSR_MTRRfix16K_80000] = data; 721 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 722 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 723 else if (msr == MSR_IA32_CR_PAT) 724 vcpu->arch.pat = data; 725 else { /* Variable MTRRs */ 726 int idx, is_mtrr_mask; 727 u64 *pt; 728 729 idx = (msr - 0x200) / 2; 730 is_mtrr_mask = msr - 0x200 - 2 * idx; 731 if (!is_mtrr_mask) 732 pt = 733 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 734 else 735 pt = 736 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 737 *pt = data; 738 } 739 740 kvm_mmu_reset_context(vcpu); 741 return 0; 742 } 743 744 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 745 { 746 switch (msr) { 747 case MSR_EFER: 748 set_efer(vcpu, data); 749 break; 750 case MSR_IA32_MC0_STATUS: 751 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 752 __func__, data); 753 break; 754 case MSR_IA32_MCG_STATUS: 755 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 756 __func__, data); 757 break; 758 case 
MSR_IA32_MCG_CTL: 759 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 760 __func__, data); 761 break; 762 case MSR_IA32_DEBUGCTLMSR: 763 if (!data) { 764 /* We support the non-activated case already */ 765 break; 766 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 767 /* Values other than LBR and BTF are vendor-specific, 768 thus reserved and should throw a #GP */ 769 return 1; 770 } 771 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 772 __func__, data); 773 break; 774 case MSR_IA32_UCODE_REV: 775 case MSR_IA32_UCODE_WRITE: 776 case MSR_VM_HSAVE_PA: 777 break; 778 case 0x200 ... 0x2ff: 779 return set_msr_mtrr(vcpu, msr, data); 780 case MSR_IA32_APICBASE: 781 kvm_set_apic_base(vcpu, data); 782 break; 783 case MSR_IA32_MISC_ENABLE: 784 vcpu->arch.ia32_misc_enable_msr = data; 785 break; 786 case MSR_KVM_WALL_CLOCK: 787 vcpu->kvm->arch.wall_clock = data; 788 kvm_write_wall_clock(vcpu->kvm, data); 789 break; 790 case MSR_KVM_SYSTEM_TIME: { 791 if (vcpu->arch.time_page) { 792 kvm_release_page_dirty(vcpu->arch.time_page); 793 vcpu->arch.time_page = NULL; 794 } 795 796 vcpu->arch.time = data; 797 798 /* we verify if the enable bit is set... */ 799 if (!(data & 1)) 800 break; 801 802 /* ...but clean it before doing the actual write */ 803 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 804 805 vcpu->arch.time_page = 806 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 807 808 if (is_error_page(vcpu->arch.time_page)) { 809 kvm_release_page_clean(vcpu->arch.time_page); 810 vcpu->arch.time_page = NULL; 811 } 812 813 kvm_request_guest_time_update(vcpu); 814 break; 815 } 816 default: 817 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 818 return 1; 819 } 820 return 0; 821 } 822 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 823 824 825 /* 826 * Reads an msr value (of 'msr_index') into 'pdata'. 827 * Returns 0 on success, non-0 otherwise. 828 * Assumes vcpu_load() was already called. 
829 */ 830 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 831 { 832 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 833 } 834 835 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 836 { 837 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 838 839 if (!msr_mtrr_valid(msr)) 840 return 1; 841 842 if (msr == MSR_MTRRdefType) 843 *pdata = vcpu->arch.mtrr_state.def_type + 844 (vcpu->arch.mtrr_state.enabled << 10); 845 else if (msr == MSR_MTRRfix64K_00000) 846 *pdata = p[0]; 847 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 848 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 849 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 850 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 851 else if (msr == MSR_IA32_CR_PAT) 852 *pdata = vcpu->arch.pat; 853 else { /* Variable MTRRs */ 854 int idx, is_mtrr_mask; 855 u64 *pt; 856 857 idx = (msr - 0x200) / 2; 858 is_mtrr_mask = msr - 0x200 - 2 * idx; 859 if (!is_mtrr_mask) 860 pt = 861 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 862 else 863 pt = 864 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 865 *pdata = *pt; 866 } 867 868 return 0; 869 } 870 871 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 872 { 873 u64 data; 874 875 switch (msr) { 876 case 0xc0010010: /* SYSCFG */ 877 case 0xc0010015: /* HWCR */ 878 case MSR_IA32_PLATFORM_ID: 879 case MSR_IA32_P5_MC_ADDR: 880 case MSR_IA32_P5_MC_TYPE: 881 case MSR_IA32_MC0_CTL: 882 case MSR_IA32_MCG_STATUS: 883 case MSR_IA32_MCG_CAP: 884 case MSR_IA32_MCG_CTL: 885 case MSR_IA32_MC0_MISC: 886 case MSR_IA32_MC0_MISC+4: 887 case MSR_IA32_MC0_MISC+8: 888 case MSR_IA32_MC0_MISC+12: 889 case MSR_IA32_MC0_MISC+16: 890 case MSR_IA32_MC0_MISC+20: 891 case MSR_IA32_UCODE_REV: 892 case MSR_IA32_EBL_CR_POWERON: 893 case MSR_IA32_DEBUGCTLMSR: 894 case MSR_IA32_LASTBRANCHFROMIP: 895 case MSR_IA32_LASTBRANCHTOIP: 896 case MSR_IA32_LASTINTFROMIP: 897 case MSR_IA32_LASTINTTOIP: 898 case 
MSR_VM_HSAVE_PA: 899 case MSR_P6_EVNTSEL0: 900 case MSR_P6_EVNTSEL1: 901 case MSR_K7_EVNTSEL0: 902 data = 0; 903 break; 904 case MSR_MTRRcap: 905 data = 0x500 | KVM_NR_VAR_MTRR; 906 break; 907 case 0x200 ... 0x2ff: 908 return get_msr_mtrr(vcpu, msr, pdata); 909 case 0xcd: /* fsb frequency */ 910 data = 3; 911 break; 912 case MSR_IA32_APICBASE: 913 data = kvm_get_apic_base(vcpu); 914 break; 915 case MSR_IA32_MISC_ENABLE: 916 data = vcpu->arch.ia32_misc_enable_msr; 917 break; 918 case MSR_IA32_PERF_STATUS: 919 /* TSC increment by tick */ 920 data = 1000ULL; 921 /* CPU multiplier */ 922 data |= (((uint64_t)4ULL) << 40); 923 break; 924 case MSR_EFER: 925 data = vcpu->arch.shadow_efer; 926 break; 927 case MSR_KVM_WALL_CLOCK: 928 data = vcpu->kvm->arch.wall_clock; 929 break; 930 case MSR_KVM_SYSTEM_TIME: 931 data = vcpu->arch.time; 932 break; 933 default: 934 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 935 return 1; 936 } 937 *pdata = data; 938 return 0; 939 } 940 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 941 942 /* 943 * Read or write a bunch of msrs. All parameters are kernel addresses. 944 * 945 * @return number of msrs set successfully. 946 */ 947 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 948 struct kvm_msr_entry *entries, 949 int (*do_msr)(struct kvm_vcpu *vcpu, 950 unsigned index, u64 *data)) 951 { 952 int i; 953 954 vcpu_load(vcpu); 955 956 down_read(&vcpu->kvm->slots_lock); 957 for (i = 0; i < msrs->nmsrs; ++i) 958 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 959 break; 960 up_read(&vcpu->kvm->slots_lock); 961 962 vcpu_put(vcpu); 963 964 return i; 965 } 966 967 /* 968 * Read or write a bunch of msrs. Parameters are user addresses. 969 * 970 * @return number of msrs set successfully. 
971 */ 972 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 973 int (*do_msr)(struct kvm_vcpu *vcpu, 974 unsigned index, u64 *data), 975 int writeback) 976 { 977 struct kvm_msrs msrs; 978 struct kvm_msr_entry *entries; 979 int r, n; 980 unsigned size; 981 982 r = -EFAULT; 983 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 984 goto out; 985 986 r = -E2BIG; 987 if (msrs.nmsrs >= MAX_IO_MSRS) 988 goto out; 989 990 r = -ENOMEM; 991 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 992 entries = vmalloc(size); 993 if (!entries) 994 goto out; 995 996 r = -EFAULT; 997 if (copy_from_user(entries, user_msrs->entries, size)) 998 goto out_free; 999 1000 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1001 if (r < 0) 1002 goto out_free; 1003 1004 r = -EFAULT; 1005 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1006 goto out_free; 1007 1008 r = n; 1009 1010 out_free: 1011 vfree(entries); 1012 out: 1013 return r; 1014 } 1015 1016 int kvm_dev_ioctl_check_extension(long ext) 1017 { 1018 int r; 1019 1020 switch (ext) { 1021 case KVM_CAP_IRQCHIP: 1022 case KVM_CAP_HLT: 1023 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1024 case KVM_CAP_SET_TSS_ADDR: 1025 case KVM_CAP_EXT_CPUID: 1026 case KVM_CAP_CLOCKSOURCE: 1027 case KVM_CAP_PIT: 1028 case KVM_CAP_NOP_IO_DELAY: 1029 case KVM_CAP_MP_STATE: 1030 case KVM_CAP_SYNC_MMU: 1031 case KVM_CAP_REINJECT_CONTROL: 1032 case KVM_CAP_IRQ_INJECT_STATUS: 1033 case KVM_CAP_ASSIGN_DEV_IRQ: 1034 r = 1; 1035 break; 1036 case KVM_CAP_COALESCED_MMIO: 1037 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1038 break; 1039 case KVM_CAP_VAPIC: 1040 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1041 break; 1042 case KVM_CAP_NR_VCPUS: 1043 r = KVM_MAX_VCPUS; 1044 break; 1045 case KVM_CAP_NR_MEMSLOTS: 1046 r = KVM_MEMORY_SLOTS; 1047 break; 1048 case KVM_CAP_PV_MMU: 1049 r = !tdp_enabled; 1050 break; 1051 case KVM_CAP_IOMMU: 1052 r = iommu_found(); 1053 break; 1054 default: 1055 r = 0; 1056 break; 1057 } 1058 return r; 1059 1060 } 
1061 1062 long kvm_arch_dev_ioctl(struct file *filp, 1063 unsigned int ioctl, unsigned long arg) 1064 { 1065 void __user *argp = (void __user *)arg; 1066 long r; 1067 1068 switch (ioctl) { 1069 case KVM_GET_MSR_INDEX_LIST: { 1070 struct kvm_msr_list __user *user_msr_list = argp; 1071 struct kvm_msr_list msr_list; 1072 unsigned n; 1073 1074 r = -EFAULT; 1075 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1076 goto out; 1077 n = msr_list.nmsrs; 1078 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1079 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1080 goto out; 1081 r = -E2BIG; 1082 if (n < num_msrs_to_save) 1083 goto out; 1084 r = -EFAULT; 1085 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1086 num_msrs_to_save * sizeof(u32))) 1087 goto out; 1088 if (copy_to_user(user_msr_list->indices 1089 + num_msrs_to_save * sizeof(u32), 1090 &emulated_msrs, 1091 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1092 goto out; 1093 r = 0; 1094 break; 1095 } 1096 case KVM_GET_SUPPORTED_CPUID: { 1097 struct kvm_cpuid2 __user *cpuid_arg = argp; 1098 struct kvm_cpuid2 cpuid; 1099 1100 r = -EFAULT; 1101 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1102 goto out; 1103 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1104 cpuid_arg->entries); 1105 if (r) 1106 goto out; 1107 1108 r = -EFAULT; 1109 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1110 goto out; 1111 r = 0; 1112 break; 1113 } 1114 default: 1115 r = -EINVAL; 1116 } 1117 out: 1118 return r; 1119 } 1120 1121 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1122 { 1123 kvm_x86_ops->vcpu_load(vcpu, cpu); 1124 kvm_request_guest_time_update(vcpu); 1125 } 1126 1127 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1128 { 1129 kvm_x86_ops->vcpu_put(vcpu); 1130 kvm_put_guest_fpu(vcpu); 1131 } 1132 1133 static int is_efer_nx(void) 1134 { 1135 unsigned long long efer = 0; 1136 1137 rdmsrl_safe(MSR_EFER, &efer); 1138 return efer & EFER_NX; 1139 } 1140 1141 static void 
cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
{
	/*
	 * Look up the guest's CPUID leaf 0x80000001 and, if it advertises
	 * bit 20 of edx while the host's EFER.NX is off, clear that bit
	 * and log that the guest's NX capability was removed.
	 */
	int i;
	struct kvm_cpuid_entry2 *e, *entry;

	entry = NULL;
	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];
		if (e->function == 0x80000001) {
			entry = e;
			break;
		}
	}
	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
		entry->edx &= ~(1 << 20);
		printk(KERN_INFO "kvm: guest NX capability removed\n");
	}
}

/*
 * when an old userspace process fills a new kernel module
 *
 * Install a CPUID table supplied in the older kvm_cpuid_entry layout.
 * The entries are copied from userspace into a temporary buffer and
 * converted one by one into the vcpu's kvm_cpuid_entry2 array, with the
 * fields the old layout lacks (index, flags, padding) zeroed.  The NX
 * capability is then fixed up.
 *
 * Returns 0 on success, -E2BIG when nent exceeds KVM_MAX_CPUID_ENTRIES,
 * -ENOMEM when the temporary buffer cannot be allocated, -EFAULT when the
 * user copy fails.
 */
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *cpuid_entries;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -ENOMEM;
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
	if (!cpuid_entries)
		goto out;
	r = -EFAULT;
	if (copy_from_user(cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
		goto out_free;
	for (i = 0; i < cpuid->nent; i++) {
		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
		vcpu->arch.cpuid_entries[i].index = 0;
		vcpu->arch.cpuid_entries[i].flags = 0;
		vcpu->arch.cpuid_entries[i].padding[0] = 0;
		vcpu->arch.cpuid_entries[i].padding[1] = 0;
		vcpu->arch.cpuid_entries[i].padding[2] = 0;
	}
	vcpu->arch.cpuid_nent = cpuid->nent;
	cpuid_fix_nx_cap(vcpu);
	r = 0;

	/* the temporary buffer is released on success as well */
out_free:
	vfree(cpuid_entries);
out:
	return r;
}

/*
 * Install a CPUID table supplied in the kvm_cpuid_entry2 layout by
 * copying it from userspace straight into the vcpu's array.
 *
 * Returns 0 on success, -E2BIG when nent exceeds KVM_MAX_CPUID_ENTRIES,
 * -EFAULT when the user copy fails (cpuid_nent is not updated then).
 */
static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	vcpu->arch.cpuid_nent = cpuid->nent;
	return 0;

out:
	return r;
}

/*
 * Copy the vcpu's CPUID table out to userspace.
 *
 * Returns 0 on success.  On failure (-E2BIG when the caller's nent is
 * smaller than the number of stored entries, -EFAULT when the user copy
 * fails) cpuid->nent is set to the number of stored entries; on success
 * it is left as the caller passed it.
 */
static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}

/*
 * Fill 'entry' with the result of executing CPUID for 'function' /
 * 'index' on the current CPU; flags are cleared.
 */
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			   u32 index)
{
	entry->function = function;
	entry->index = index;
	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
	entry->flags = 0;
}

/* Mask for feature X86_FEATURE_<x> within its 32-bit CPUID word. */
#define F(x) bit(X86_FEATURE_##x)

static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			 u32 index, int *nent, int maxnent)
{
	unsigned f_nx = is_efer_nx() ?
F(NX) : 0; 1257 #ifdef CONFIG_X86_64 1258 unsigned f_lm = F(LM); 1259 #else 1260 unsigned f_lm = 0; 1261 #endif 1262 1263 /* cpuid 1.edx */ 1264 const u32 kvm_supported_word0_x86_features = 1265 F(FPU) | F(VME) | F(DE) | F(PSE) | 1266 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1267 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 1268 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1269 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 1270 0 /* Reserved, DS, ACPI */ | F(MMX) | 1271 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 1272 0 /* HTT, TM, Reserved, PBE */; 1273 /* cpuid 0x80000001.edx */ 1274 const u32 kvm_supported_word1_x86_features = 1275 F(FPU) | F(VME) | F(DE) | F(PSE) | 1276 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1277 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 1278 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1279 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1280 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1281 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | 1282 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1283 /* cpuid 1.ecx */ 1284 const u32 kvm_supported_word4_x86_features = 1285 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1286 0 /* DS-CPL, VMX, SMX, EST */ | 1287 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1288 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1289 0 /* Reserved, DCA */ | F(XMM4_1) | 1290 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | 1291 0 /* Reserved, XSAVE, OSXSAVE */; 1292 /* cpuid 0x80000001.ecx */ 1293 const u32 kvm_supported_word6_x86_features = 1294 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1295 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1296 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 1297 0 /* SKINIT */ | 0 /* WDT */; 1298 1299 /* all calls to cpuid_count() should be made on the same cpu */ 1300 get_cpu(); 1301 do_cpuid_1_ent(entry, function, index); 1302 ++*nent; 1303 1304 switch (function) { 1305 case 0: 1306 entry->eax = min(entry->eax, (u32)0xb); 1307 
break; 1308 case 1: 1309 entry->edx &= kvm_supported_word0_x86_features; 1310 entry->ecx &= kvm_supported_word4_x86_features; 1311 break; 1312 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1313 * may return different values. This forces us to get_cpu() before 1314 * issuing the first command, and also to emulate this annoying behavior 1315 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1316 case 2: { 1317 int t, times = entry->eax & 0xff; 1318 1319 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1320 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1321 for (t = 1; t < times && *nent < maxnent; ++t) { 1322 do_cpuid_1_ent(&entry[t], function, 0); 1323 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1324 ++*nent; 1325 } 1326 break; 1327 } 1328 /* function 4 and 0xb have additional index. */ 1329 case 4: { 1330 int i, cache_type; 1331 1332 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1333 /* read more entries until cache_type is zero */ 1334 for (i = 1; *nent < maxnent; ++i) { 1335 cache_type = entry[i - 1].eax & 0x1f; 1336 if (!cache_type) 1337 break; 1338 do_cpuid_1_ent(&entry[i], function, i); 1339 entry[i].flags |= 1340 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1341 ++*nent; 1342 } 1343 break; 1344 } 1345 case 0xb: { 1346 int i, level_type; 1347 1348 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1349 /* read more entries until level_type is zero */ 1350 for (i = 1; *nent < maxnent; ++i) { 1351 level_type = entry[i - 1].ecx & 0xff00; 1352 if (!level_type) 1353 break; 1354 do_cpuid_1_ent(&entry[i], function, i); 1355 entry[i].flags |= 1356 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1357 ++*nent; 1358 } 1359 break; 1360 } 1361 case 0x80000000: 1362 entry->eax = min(entry->eax, 0x8000001a); 1363 break; 1364 case 0x80000001: 1365 entry->edx &= kvm_supported_word1_x86_features; 1366 entry->ecx &= kvm_supported_word6_x86_features; 1367 break; 1368 } 1369 put_cpu(); 1370 } 1371 1372 #undef F 1373 1374 static int 
kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1375 struct kvm_cpuid_entry2 __user *entries) 1376 { 1377 struct kvm_cpuid_entry2 *cpuid_entries; 1378 int limit, nent = 0, r = -E2BIG; 1379 u32 func; 1380 1381 if (cpuid->nent < 1) 1382 goto out; 1383 r = -ENOMEM; 1384 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1385 if (!cpuid_entries) 1386 goto out; 1387 1388 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1389 limit = cpuid_entries[0].eax; 1390 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1391 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1392 &nent, cpuid->nent); 1393 r = -E2BIG; 1394 if (nent >= cpuid->nent) 1395 goto out_free; 1396 1397 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1398 limit = cpuid_entries[nent - 1].eax; 1399 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1400 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1401 &nent, cpuid->nent); 1402 r = -EFAULT; 1403 if (copy_to_user(entries, cpuid_entries, 1404 nent * sizeof(struct kvm_cpuid_entry2))) 1405 goto out_free; 1406 cpuid->nent = nent; 1407 r = 0; 1408 1409 out_free: 1410 vfree(cpuid_entries); 1411 out: 1412 return r; 1413 } 1414 1415 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1416 struct kvm_lapic_state *s) 1417 { 1418 vcpu_load(vcpu); 1419 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1420 vcpu_put(vcpu); 1421 1422 return 0; 1423 } 1424 1425 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1426 struct kvm_lapic_state *s) 1427 { 1428 vcpu_load(vcpu); 1429 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1430 kvm_apic_post_state_restore(vcpu); 1431 vcpu_put(vcpu); 1432 1433 return 0; 1434 } 1435 1436 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1437 struct kvm_interrupt *irq) 1438 { 1439 if (irq->irq < 0 || irq->irq >= 256) 1440 return -EINVAL; 1441 if (irqchip_in_kernel(vcpu->kvm)) 1442 return -ENXIO; 1443 vcpu_load(vcpu); 1444 1445 
kvm_queue_interrupt(vcpu, irq->irq, false); 1446 1447 vcpu_put(vcpu); 1448 1449 return 0; 1450 } 1451 1452 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1453 { 1454 vcpu_load(vcpu); 1455 kvm_inject_nmi(vcpu); 1456 vcpu_put(vcpu); 1457 1458 return 0; 1459 } 1460 1461 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1462 struct kvm_tpr_access_ctl *tac) 1463 { 1464 if (tac->flags) 1465 return -EINVAL; 1466 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1467 return 0; 1468 } 1469 1470 long kvm_arch_vcpu_ioctl(struct file *filp, 1471 unsigned int ioctl, unsigned long arg) 1472 { 1473 struct kvm_vcpu *vcpu = filp->private_data; 1474 void __user *argp = (void __user *)arg; 1475 int r; 1476 struct kvm_lapic_state *lapic = NULL; 1477 1478 switch (ioctl) { 1479 case KVM_GET_LAPIC: { 1480 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1481 1482 r = -ENOMEM; 1483 if (!lapic) 1484 goto out; 1485 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1486 if (r) 1487 goto out; 1488 r = -EFAULT; 1489 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1490 goto out; 1491 r = 0; 1492 break; 1493 } 1494 case KVM_SET_LAPIC: { 1495 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1496 r = -ENOMEM; 1497 if (!lapic) 1498 goto out; 1499 r = -EFAULT; 1500 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1501 goto out; 1502 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1503 if (r) 1504 goto out; 1505 r = 0; 1506 break; 1507 } 1508 case KVM_INTERRUPT: { 1509 struct kvm_interrupt irq; 1510 1511 r = -EFAULT; 1512 if (copy_from_user(&irq, argp, sizeof irq)) 1513 goto out; 1514 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1515 if (r) 1516 goto out; 1517 r = 0; 1518 break; 1519 } 1520 case KVM_NMI: { 1521 r = kvm_vcpu_ioctl_nmi(vcpu); 1522 if (r) 1523 goto out; 1524 r = 0; 1525 break; 1526 } 1527 case KVM_SET_CPUID: { 1528 struct kvm_cpuid __user *cpuid_arg = argp; 1529 struct kvm_cpuid cpuid; 1530 1531 r = -EFAULT; 1532 if 
(copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1533 goto out; 1534 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1535 if (r) 1536 goto out; 1537 break; 1538 } 1539 case KVM_SET_CPUID2: { 1540 struct kvm_cpuid2 __user *cpuid_arg = argp; 1541 struct kvm_cpuid2 cpuid; 1542 1543 r = -EFAULT; 1544 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1545 goto out; 1546 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1547 cpuid_arg->entries); 1548 if (r) 1549 goto out; 1550 break; 1551 } 1552 case KVM_GET_CPUID2: { 1553 struct kvm_cpuid2 __user *cpuid_arg = argp; 1554 struct kvm_cpuid2 cpuid; 1555 1556 r = -EFAULT; 1557 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1558 goto out; 1559 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1560 cpuid_arg->entries); 1561 if (r) 1562 goto out; 1563 r = -EFAULT; 1564 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1565 goto out; 1566 r = 0; 1567 break; 1568 } 1569 case KVM_GET_MSRS: 1570 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1571 break; 1572 case KVM_SET_MSRS: 1573 r = msr_io(vcpu, argp, do_set_msr, 0); 1574 break; 1575 case KVM_TPR_ACCESS_REPORTING: { 1576 struct kvm_tpr_access_ctl tac; 1577 1578 r = -EFAULT; 1579 if (copy_from_user(&tac, argp, sizeof tac)) 1580 goto out; 1581 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1582 if (r) 1583 goto out; 1584 r = -EFAULT; 1585 if (copy_to_user(argp, &tac, sizeof tac)) 1586 goto out; 1587 r = 0; 1588 break; 1589 }; 1590 case KVM_SET_VAPIC_ADDR: { 1591 struct kvm_vapic_addr va; 1592 1593 r = -EINVAL; 1594 if (!irqchip_in_kernel(vcpu->kvm)) 1595 goto out; 1596 r = -EFAULT; 1597 if (copy_from_user(&va, argp, sizeof va)) 1598 goto out; 1599 r = 0; 1600 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1601 break; 1602 } 1603 default: 1604 r = -EINVAL; 1605 } 1606 out: 1607 kfree(lapic); 1608 return r; 1609 } 1610 1611 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1612 { 1613 int ret; 1614 1615 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 
1616 return -1; 1617 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1618 return ret; 1619 } 1620 1621 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1622 u32 kvm_nr_mmu_pages) 1623 { 1624 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1625 return -EINVAL; 1626 1627 down_write(&kvm->slots_lock); 1628 spin_lock(&kvm->mmu_lock); 1629 1630 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1631 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1632 1633 spin_unlock(&kvm->mmu_lock); 1634 up_write(&kvm->slots_lock); 1635 return 0; 1636 } 1637 1638 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1639 { 1640 return kvm->arch.n_alloc_mmu_pages; 1641 } 1642 1643 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1644 { 1645 int i; 1646 struct kvm_mem_alias *alias; 1647 1648 for (i = 0; i < kvm->arch.naliases; ++i) { 1649 alias = &kvm->arch.aliases[i]; 1650 if (gfn >= alias->base_gfn 1651 && gfn < alias->base_gfn + alias->npages) 1652 return alias->target_gfn + gfn - alias->base_gfn; 1653 } 1654 return gfn; 1655 } 1656 1657 /* 1658 * Set a new alias region. Aliases map a portion of physical memory into 1659 * another portion. This is useful for memory windows, for example the PC 1660 * VGA region. 
1661 */ 1662 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1663 struct kvm_memory_alias *alias) 1664 { 1665 int r, n; 1666 struct kvm_mem_alias *p; 1667 1668 r = -EINVAL; 1669 /* General sanity checks */ 1670 if (alias->memory_size & (PAGE_SIZE - 1)) 1671 goto out; 1672 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1673 goto out; 1674 if (alias->slot >= KVM_ALIAS_SLOTS) 1675 goto out; 1676 if (alias->guest_phys_addr + alias->memory_size 1677 < alias->guest_phys_addr) 1678 goto out; 1679 if (alias->target_phys_addr + alias->memory_size 1680 < alias->target_phys_addr) 1681 goto out; 1682 1683 down_write(&kvm->slots_lock); 1684 spin_lock(&kvm->mmu_lock); 1685 1686 p = &kvm->arch.aliases[alias->slot]; 1687 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1688 p->npages = alias->memory_size >> PAGE_SHIFT; 1689 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1690 1691 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1692 if (kvm->arch.aliases[n - 1].npages) 1693 break; 1694 kvm->arch.naliases = n; 1695 1696 spin_unlock(&kvm->mmu_lock); 1697 kvm_mmu_zap_all(kvm); 1698 1699 up_write(&kvm->slots_lock); 1700 1701 return 0; 1702 1703 out: 1704 return r; 1705 } 1706 1707 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1708 { 1709 int r; 1710 1711 r = 0; 1712 switch (chip->chip_id) { 1713 case KVM_IRQCHIP_PIC_MASTER: 1714 memcpy(&chip->chip.pic, 1715 &pic_irqchip(kvm)->pics[0], 1716 sizeof(struct kvm_pic_state)); 1717 break; 1718 case KVM_IRQCHIP_PIC_SLAVE: 1719 memcpy(&chip->chip.pic, 1720 &pic_irqchip(kvm)->pics[1], 1721 sizeof(struct kvm_pic_state)); 1722 break; 1723 case KVM_IRQCHIP_IOAPIC: 1724 memcpy(&chip->chip.ioapic, 1725 ioapic_irqchip(kvm), 1726 sizeof(struct kvm_ioapic_state)); 1727 break; 1728 default: 1729 r = -EINVAL; 1730 break; 1731 } 1732 return r; 1733 } 1734 1735 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1736 { 1737 int r; 1738 1739 r = 0; 1740 switch (chip->chip_id) { 1741 case 
KVM_IRQCHIP_PIC_MASTER: 1742 memcpy(&pic_irqchip(kvm)->pics[0], 1743 &chip->chip.pic, 1744 sizeof(struct kvm_pic_state)); 1745 break; 1746 case KVM_IRQCHIP_PIC_SLAVE: 1747 memcpy(&pic_irqchip(kvm)->pics[1], 1748 &chip->chip.pic, 1749 sizeof(struct kvm_pic_state)); 1750 break; 1751 case KVM_IRQCHIP_IOAPIC: 1752 memcpy(ioapic_irqchip(kvm), 1753 &chip->chip.ioapic, 1754 sizeof(struct kvm_ioapic_state)); 1755 break; 1756 default: 1757 r = -EINVAL; 1758 break; 1759 } 1760 kvm_pic_update_irq(pic_irqchip(kvm)); 1761 return r; 1762 } 1763 1764 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1765 { 1766 int r = 0; 1767 1768 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1769 return r; 1770 } 1771 1772 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1773 { 1774 int r = 0; 1775 1776 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1777 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1778 return r; 1779 } 1780 1781 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 1782 struct kvm_reinject_control *control) 1783 { 1784 if (!kvm->arch.vpit) 1785 return -ENXIO; 1786 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 1787 return 0; 1788 } 1789 1790 /* 1791 * Get (and clear) the dirty memory log for a memory slot. 1792 */ 1793 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1794 struct kvm_dirty_log *log) 1795 { 1796 int r; 1797 int n; 1798 struct kvm_memory_slot *memslot; 1799 int is_dirty = 0; 1800 1801 down_write(&kvm->slots_lock); 1802 1803 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1804 if (r) 1805 goto out; 1806 1807 /* If nothing is dirty, don't bother messing with page tables. 
*/ 1808 if (is_dirty) { 1809 spin_lock(&kvm->mmu_lock); 1810 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1811 spin_unlock(&kvm->mmu_lock); 1812 kvm_flush_remote_tlbs(kvm); 1813 memslot = &kvm->memslots[log->slot]; 1814 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1815 memset(memslot->dirty_bitmap, 0, n); 1816 } 1817 r = 0; 1818 out: 1819 up_write(&kvm->slots_lock); 1820 return r; 1821 } 1822 1823 long kvm_arch_vm_ioctl(struct file *filp, 1824 unsigned int ioctl, unsigned long arg) 1825 { 1826 struct kvm *kvm = filp->private_data; 1827 void __user *argp = (void __user *)arg; 1828 int r = -EINVAL; 1829 /* 1830 * This union makes it completely explicit to gcc-3.x 1831 * that these two variables' stack usage should be 1832 * combined, not added together. 1833 */ 1834 union { 1835 struct kvm_pit_state ps; 1836 struct kvm_memory_alias alias; 1837 } u; 1838 1839 switch (ioctl) { 1840 case KVM_SET_TSS_ADDR: 1841 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1842 if (r < 0) 1843 goto out; 1844 break; 1845 case KVM_SET_MEMORY_REGION: { 1846 struct kvm_memory_region kvm_mem; 1847 struct kvm_userspace_memory_region kvm_userspace_mem; 1848 1849 r = -EFAULT; 1850 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1851 goto out; 1852 kvm_userspace_mem.slot = kvm_mem.slot; 1853 kvm_userspace_mem.flags = kvm_mem.flags; 1854 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1855 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1856 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1857 if (r) 1858 goto out; 1859 break; 1860 } 1861 case KVM_SET_NR_MMU_PAGES: 1862 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1863 if (r) 1864 goto out; 1865 break; 1866 case KVM_GET_NR_MMU_PAGES: 1867 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1868 break; 1869 case KVM_SET_MEMORY_ALIAS: 1870 r = -EFAULT; 1871 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 1872 goto out; 1873 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 1874 if (r) 1875 goto out; 
1876 break; 1877 case KVM_CREATE_IRQCHIP: 1878 r = -ENOMEM; 1879 kvm->arch.vpic = kvm_create_pic(kvm); 1880 if (kvm->arch.vpic) { 1881 r = kvm_ioapic_init(kvm); 1882 if (r) { 1883 kfree(kvm->arch.vpic); 1884 kvm->arch.vpic = NULL; 1885 goto out; 1886 } 1887 } else 1888 goto out; 1889 r = kvm_setup_default_irq_routing(kvm); 1890 if (r) { 1891 kfree(kvm->arch.vpic); 1892 kfree(kvm->arch.vioapic); 1893 goto out; 1894 } 1895 break; 1896 case KVM_CREATE_PIT: 1897 mutex_lock(&kvm->lock); 1898 r = -EEXIST; 1899 if (kvm->arch.vpit) 1900 goto create_pit_unlock; 1901 r = -ENOMEM; 1902 kvm->arch.vpit = kvm_create_pit(kvm); 1903 if (kvm->arch.vpit) 1904 r = 0; 1905 create_pit_unlock: 1906 mutex_unlock(&kvm->lock); 1907 break; 1908 case KVM_IRQ_LINE_STATUS: 1909 case KVM_IRQ_LINE: { 1910 struct kvm_irq_level irq_event; 1911 1912 r = -EFAULT; 1913 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1914 goto out; 1915 if (irqchip_in_kernel(kvm)) { 1916 __s32 status; 1917 mutex_lock(&kvm->lock); 1918 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1919 irq_event.irq, irq_event.level); 1920 mutex_unlock(&kvm->lock); 1921 if (ioctl == KVM_IRQ_LINE_STATUS) { 1922 irq_event.status = status; 1923 if (copy_to_user(argp, &irq_event, 1924 sizeof irq_event)) 1925 goto out; 1926 } 1927 r = 0; 1928 } 1929 break; 1930 } 1931 case KVM_GET_IRQCHIP: { 1932 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1933 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1934 1935 r = -ENOMEM; 1936 if (!chip) 1937 goto out; 1938 r = -EFAULT; 1939 if (copy_from_user(chip, argp, sizeof *chip)) 1940 goto get_irqchip_out; 1941 r = -ENXIO; 1942 if (!irqchip_in_kernel(kvm)) 1943 goto get_irqchip_out; 1944 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 1945 if (r) 1946 goto get_irqchip_out; 1947 r = -EFAULT; 1948 if (copy_to_user(argp, chip, sizeof *chip)) 1949 goto get_irqchip_out; 1950 r = 0; 1951 get_irqchip_out: 1952 kfree(chip); 1953 if (r) 1954 goto out; 1955 break; 1956 } 1957 case 
KVM_SET_IRQCHIP: { 1958 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1959 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1960 1961 r = -ENOMEM; 1962 if (!chip) 1963 goto out; 1964 r = -EFAULT; 1965 if (copy_from_user(chip, argp, sizeof *chip)) 1966 goto set_irqchip_out; 1967 r = -ENXIO; 1968 if (!irqchip_in_kernel(kvm)) 1969 goto set_irqchip_out; 1970 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 1971 if (r) 1972 goto set_irqchip_out; 1973 r = 0; 1974 set_irqchip_out: 1975 kfree(chip); 1976 if (r) 1977 goto out; 1978 break; 1979 } 1980 case KVM_GET_PIT: { 1981 r = -EFAULT; 1982 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 1983 goto out; 1984 r = -ENXIO; 1985 if (!kvm->arch.vpit) 1986 goto out; 1987 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 1988 if (r) 1989 goto out; 1990 r = -EFAULT; 1991 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 1992 goto out; 1993 r = 0; 1994 break; 1995 } 1996 case KVM_SET_PIT: { 1997 r = -EFAULT; 1998 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 1999 goto out; 2000 r = -ENXIO; 2001 if (!kvm->arch.vpit) 2002 goto out; 2003 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2004 if (r) 2005 goto out; 2006 r = 0; 2007 break; 2008 } 2009 case KVM_REINJECT_CONTROL: { 2010 struct kvm_reinject_control control; 2011 r = -EFAULT; 2012 if (copy_from_user(&control, argp, sizeof(control))) 2013 goto out; 2014 r = kvm_vm_ioctl_reinject(kvm, &control); 2015 if (r) 2016 goto out; 2017 r = 0; 2018 break; 2019 } 2020 default: 2021 ; 2022 } 2023 out: 2024 return r; 2025 } 2026 2027 static void kvm_init_msr_list(void) 2028 { 2029 u32 dummy[2]; 2030 unsigned i, j; 2031 2032 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2033 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2034 continue; 2035 if (j < i) 2036 msrs_to_save[j] = msrs_to_save[i]; 2037 j++; 2038 } 2039 num_msrs_to_save = j; 2040 } 2041 2042 /* 2043 * Only apic need an MMIO device hook, so shortcut now.. 
2044 */ 2045 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 2046 gpa_t addr, int len, 2047 int is_write) 2048 { 2049 struct kvm_io_device *dev; 2050 2051 if (vcpu->arch.apic) { 2052 dev = &vcpu->arch.apic->dev; 2053 if (dev->in_range(dev, addr, len, is_write)) 2054 return dev; 2055 } 2056 return NULL; 2057 } 2058 2059 2060 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 2061 gpa_t addr, int len, 2062 int is_write) 2063 { 2064 struct kvm_io_device *dev; 2065 2066 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 2067 if (dev == NULL) 2068 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, 2069 is_write); 2070 return dev; 2071 } 2072 2073 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2074 struct kvm_vcpu *vcpu) 2075 { 2076 void *data = val; 2077 int r = X86EMUL_CONTINUE; 2078 2079 while (bytes) { 2080 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2081 unsigned offset = addr & (PAGE_SIZE-1); 2082 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2083 int ret; 2084 2085 if (gpa == UNMAPPED_GVA) { 2086 r = X86EMUL_PROPAGATE_FAULT; 2087 goto out; 2088 } 2089 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2090 if (ret < 0) { 2091 r = X86EMUL_UNHANDLEABLE; 2092 goto out; 2093 } 2094 2095 bytes -= toread; 2096 data += toread; 2097 addr += toread; 2098 } 2099 out: 2100 return r; 2101 } 2102 2103 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2104 struct kvm_vcpu *vcpu) 2105 { 2106 void *data = val; 2107 int r = X86EMUL_CONTINUE; 2108 2109 while (bytes) { 2110 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2111 unsigned offset = addr & (PAGE_SIZE-1); 2112 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2113 int ret; 2114 2115 if (gpa == UNMAPPED_GVA) { 2116 r = X86EMUL_PROPAGATE_FAULT; 2117 goto out; 2118 } 2119 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 2120 if (ret < 0) { 2121 r = X86EMUL_UNHANDLEABLE; 2122 
goto out; 2123 } 2124 2125 bytes -= towrite; 2126 data += towrite; 2127 addr += towrite; 2128 } 2129 out: 2130 return r; 2131 } 2132 2133 2134 static int emulator_read_emulated(unsigned long addr, 2135 void *val, 2136 unsigned int bytes, 2137 struct kvm_vcpu *vcpu) 2138 { 2139 struct kvm_io_device *mmio_dev; 2140 gpa_t gpa; 2141 2142 if (vcpu->mmio_read_completed) { 2143 memcpy(val, vcpu->mmio_data, bytes); 2144 vcpu->mmio_read_completed = 0; 2145 return X86EMUL_CONTINUE; 2146 } 2147 2148 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2149 2150 /* For APIC access vmexit */ 2151 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2152 goto mmio; 2153 2154 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2155 == X86EMUL_CONTINUE) 2156 return X86EMUL_CONTINUE; 2157 if (gpa == UNMAPPED_GVA) 2158 return X86EMUL_PROPAGATE_FAULT; 2159 2160 mmio: 2161 /* 2162 * Is this MMIO handled locally? 2163 */ 2164 mutex_lock(&vcpu->kvm->lock); 2165 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2166 if (mmio_dev) { 2167 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 2168 mutex_unlock(&vcpu->kvm->lock); 2169 return X86EMUL_CONTINUE; 2170 } 2171 mutex_unlock(&vcpu->kvm->lock); 2172 2173 vcpu->mmio_needed = 1; 2174 vcpu->mmio_phys_addr = gpa; 2175 vcpu->mmio_size = bytes; 2176 vcpu->mmio_is_write = 0; 2177 2178 return X86EMUL_UNHANDLEABLE; 2179 } 2180 2181 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2182 const void *val, int bytes) 2183 { 2184 int ret; 2185 2186 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2187 if (ret < 0) 2188 return 0; 2189 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2190 return 1; 2191 } 2192 2193 static int emulator_write_emulated_onepage(unsigned long addr, 2194 const void *val, 2195 unsigned int bytes, 2196 struct kvm_vcpu *vcpu) 2197 { 2198 struct kvm_io_device *mmio_dev; 2199 gpa_t gpa; 2200 2201 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2202 2203 if (gpa == UNMAPPED_GVA) { 2204 kvm_inject_page_fault(vcpu, addr, 2); 2205 return 
X86EMUL_PROPAGATE_FAULT; 2206 } 2207 2208 /* For APIC access vmexit */ 2209 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2210 goto mmio; 2211 2212 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2213 return X86EMUL_CONTINUE; 2214 2215 mmio: 2216 /* 2217 * Is this MMIO handled locally? 2218 */ 2219 mutex_lock(&vcpu->kvm->lock); 2220 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); 2221 if (mmio_dev) { 2222 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2223 mutex_unlock(&vcpu->kvm->lock); 2224 return X86EMUL_CONTINUE; 2225 } 2226 mutex_unlock(&vcpu->kvm->lock); 2227 2228 vcpu->mmio_needed = 1; 2229 vcpu->mmio_phys_addr = gpa; 2230 vcpu->mmio_size = bytes; 2231 vcpu->mmio_is_write = 1; 2232 memcpy(vcpu->mmio_data, val, bytes); 2233 2234 return X86EMUL_CONTINUE; 2235 } 2236 2237 int emulator_write_emulated(unsigned long addr, 2238 const void *val, 2239 unsigned int bytes, 2240 struct kvm_vcpu *vcpu) 2241 { 2242 /* Crossing a page boundary? */ 2243 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2244 int rc, now; 2245 2246 now = -addr & ~PAGE_MASK; 2247 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2248 if (rc != X86EMUL_CONTINUE) 2249 return rc; 2250 addr += now; 2251 val += now; 2252 bytes -= now; 2253 } 2254 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2255 } 2256 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2257 2258 static int emulator_cmpxchg_emulated(unsigned long addr, 2259 const void *old, 2260 const void *new, 2261 unsigned int bytes, 2262 struct kvm_vcpu *vcpu) 2263 { 2264 static int reported; 2265 2266 if (!reported) { 2267 reported = 1; 2268 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 2269 } 2270 #ifndef CONFIG_X86_64 2271 /* guests cmpxchg8b have to be emulated atomically */ 2272 if (bytes == 8) { 2273 gpa_t gpa; 2274 struct page *page; 2275 char *kaddr; 2276 u64 val; 2277 2278 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2279 2280 if (gpa == UNMAPPED_GVA || 2281 (gpa & PAGE_MASK) == 
APIC_DEFAULT_PHYS_BASE) 2282 goto emul_write; 2283 2284 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2285 goto emul_write; 2286 2287 val = *(u64 *)new; 2288 2289 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2290 2291 kaddr = kmap_atomic(page, KM_USER0); 2292 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2293 kunmap_atomic(kaddr, KM_USER0); 2294 kvm_release_page_dirty(page); 2295 } 2296 emul_write: 2297 #endif 2298 2299 return emulator_write_emulated(addr, new, bytes, vcpu); 2300 } 2301 2302 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2303 { 2304 return kvm_x86_ops->get_segment_base(vcpu, seg); 2305 } 2306 2307 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2308 { 2309 kvm_mmu_invlpg(vcpu, address); 2310 return X86EMUL_CONTINUE; 2311 } 2312 2313 int emulate_clts(struct kvm_vcpu *vcpu) 2314 { 2315 KVMTRACE_0D(CLTS, vcpu, handler); 2316 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2317 return X86EMUL_CONTINUE; 2318 } 2319 2320 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2321 { 2322 struct kvm_vcpu *vcpu = ctxt->vcpu; 2323 2324 switch (dr) { 2325 case 0 ... 3: 2326 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2327 return X86EMUL_CONTINUE; 2328 default: 2329 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2330 return X86EMUL_UNHANDLEABLE; 2331 } 2332 } 2333 2334 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2335 { 2336 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; 2337 int exception; 2338 2339 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 2340 if (exception) { 2341 /* FIXME: better handling */ 2342 return X86EMUL_UNHANDLEABLE; 2343 } 2344 return X86EMUL_CONTINUE; 2345 } 2346 2347 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2348 { 2349 u8 opcodes[4]; 2350 unsigned long rip = kvm_rip_read(vcpu); 2351 unsigned long rip_linear; 2352 2353 if (!printk_ratelimit()) 2354 return; 2355 2356 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2357 2358 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 2359 2360 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2361 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2362 } 2363 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2364 2365 static struct x86_emulate_ops emulate_ops = { 2366 .read_std = kvm_read_guest_virt, 2367 .read_emulated = emulator_read_emulated, 2368 .write_emulated = emulator_write_emulated, 2369 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2370 }; 2371 2372 static void cache_all_regs(struct kvm_vcpu *vcpu) 2373 { 2374 kvm_register_read(vcpu, VCPU_REGS_RAX); 2375 kvm_register_read(vcpu, VCPU_REGS_RSP); 2376 kvm_register_read(vcpu, VCPU_REGS_RIP); 2377 vcpu->arch.regs_dirty = ~0; 2378 } 2379 2380 int emulate_instruction(struct kvm_vcpu *vcpu, 2381 struct kvm_run *run, 2382 unsigned long cr2, 2383 u16 error_code, 2384 int emulation_type) 2385 { 2386 int r, shadow_mask; 2387 struct decode_cache *c; 2388 2389 kvm_clear_exception_queue(vcpu); 2390 vcpu->arch.mmio_fault_cr2 = cr2; 2391 /* 2392 * TODO: fix x86_emulate.c to use guest_read/write_register 2393 * instead of direct ->regs accesses, can save hundred cycles 2394 * on Intel for instructions that don't read/change RSP, for 2395 * for example. 
2396 */ 2397 cache_all_regs(vcpu); 2398 2399 vcpu->mmio_is_write = 0; 2400 vcpu->arch.pio.string = 0; 2401 2402 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 2403 int cs_db, cs_l; 2404 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 2405 2406 vcpu->arch.emulate_ctxt.vcpu = vcpu; 2407 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 2408 vcpu->arch.emulate_ctxt.mode = 2409 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 2410 ? X86EMUL_MODE_REAL : cs_l 2411 ? X86EMUL_MODE_PROT64 : cs_db 2412 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2413 2414 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2415 2416 /* Reject the instructions other than VMCALL/VMMCALL when 2417 * try to emulate invalid opcode */ 2418 c = &vcpu->arch.emulate_ctxt.decode; 2419 if ((emulation_type & EMULTYPE_TRAP_UD) && 2420 (!(c->twobyte && c->b == 0x01 && 2421 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2422 c->modrm_mod == 3 && c->modrm_rm == 1))) 2423 return EMULATE_FAIL; 2424 2425 ++vcpu->stat.insn_emulation; 2426 if (r) { 2427 ++vcpu->stat.insn_emulation_fail; 2428 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2429 return EMULATE_DONE; 2430 return EMULATE_FAIL; 2431 } 2432 } 2433 2434 if (emulation_type & EMULTYPE_SKIP) { 2435 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 2436 return EMULATE_DONE; 2437 } 2438 2439 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2440 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 2441 2442 if (r == 0) 2443 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 2444 2445 if (vcpu->arch.pio.string) 2446 return EMULATE_DO_MMIO; 2447 2448 if ((r || vcpu->mmio_is_write) && run) { 2449 run->exit_reason = KVM_EXIT_MMIO; 2450 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2451 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2452 run->mmio.len = vcpu->mmio_size; 2453 run->mmio.is_write = vcpu->mmio_is_write; 2454 } 2455 2456 if (r) { 2457 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2458 return EMULATE_DONE; 2459 if 
(!vcpu->mmio_needed) { 2460 kvm_report_emulation_failure(vcpu, "mmio"); 2461 return EMULATE_FAIL; 2462 } 2463 return EMULATE_DO_MMIO; 2464 } 2465 2466 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2467 2468 if (vcpu->mmio_is_write) { 2469 vcpu->mmio_needed = 0; 2470 return EMULATE_DO_MMIO; 2471 } 2472 2473 return EMULATE_DONE; 2474 } 2475 EXPORT_SYMBOL_GPL(emulate_instruction); 2476 2477 static int pio_copy_data(struct kvm_vcpu *vcpu) 2478 { 2479 void *p = vcpu->arch.pio_data; 2480 gva_t q = vcpu->arch.pio.guest_gva; 2481 unsigned bytes; 2482 int ret; 2483 2484 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2485 if (vcpu->arch.pio.in) 2486 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2487 else 2488 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2489 return ret; 2490 } 2491 2492 int complete_pio(struct kvm_vcpu *vcpu) 2493 { 2494 struct kvm_pio_request *io = &vcpu->arch.pio; 2495 long delta; 2496 int r; 2497 unsigned long val; 2498 2499 if (!io->string) { 2500 if (io->in) { 2501 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2502 memcpy(&val, vcpu->arch.pio_data, io->size); 2503 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2504 } 2505 } else { 2506 if (io->in) { 2507 r = pio_copy_data(vcpu); 2508 if (r) 2509 return r; 2510 } 2511 2512 delta = 1; 2513 if (io->rep) { 2514 delta *= io->cur_count; 2515 /* 2516 * The size of the register should really depend on 2517 * current address size. 
2518 */ 2519 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2520 val -= delta; 2521 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2522 } 2523 if (io->down) 2524 delta = -delta; 2525 delta *= io->size; 2526 if (io->in) { 2527 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2528 val += delta; 2529 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2530 } else { 2531 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2532 val += delta; 2533 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2534 } 2535 } 2536 2537 io->count -= io->cur_count; 2538 io->cur_count = 0; 2539 2540 return 0; 2541 } 2542 2543 static void kernel_pio(struct kvm_io_device *pio_dev, 2544 struct kvm_vcpu *vcpu, 2545 void *pd) 2546 { 2547 /* TODO: String I/O for in kernel device */ 2548 2549 mutex_lock(&vcpu->kvm->lock); 2550 if (vcpu->arch.pio.in) 2551 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2552 vcpu->arch.pio.size, 2553 pd); 2554 else 2555 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2556 vcpu->arch.pio.size, 2557 pd); 2558 mutex_unlock(&vcpu->kvm->lock); 2559 } 2560 2561 static void pio_string_write(struct kvm_io_device *pio_dev, 2562 struct kvm_vcpu *vcpu) 2563 { 2564 struct kvm_pio_request *io = &vcpu->arch.pio; 2565 void *pd = vcpu->arch.pio_data; 2566 int i; 2567 2568 mutex_lock(&vcpu->kvm->lock); 2569 for (i = 0; i < io->cur_count; i++) { 2570 kvm_iodevice_write(pio_dev, io->port, 2571 io->size, 2572 pd); 2573 pd += io->size; 2574 } 2575 mutex_unlock(&vcpu->kvm->lock); 2576 } 2577 2578 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2579 gpa_t addr, int len, 2580 int is_write) 2581 { 2582 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); 2583 } 2584 2585 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2586 int size, unsigned port) 2587 { 2588 struct kvm_io_device *pio_dev; 2589 unsigned long val; 2590 2591 vcpu->run->exit_reason = KVM_EXIT_IO; 2592 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2593 vcpu->run->io.size = vcpu->arch.pio.size = size; 2594 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2595 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2596 vcpu->run->io.port = vcpu->arch.pio.port = port; 2597 vcpu->arch.pio.in = in; 2598 vcpu->arch.pio.string = 0; 2599 vcpu->arch.pio.down = 0; 2600 vcpu->arch.pio.rep = 0; 2601 2602 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2603 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2604 handler); 2605 else 2606 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2607 handler); 2608 2609 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2610 memcpy(vcpu->arch.pio_data, &val, 4); 2611 2612 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2613 if (pio_dev) { 2614 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2615 complete_pio(vcpu); 2616 return 1; 2617 } 2618 return 0; 2619 } 2620 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2621 2622 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2623 int size, unsigned long count, int down, 2624 gva_t address, int rep, unsigned port) 2625 { 2626 unsigned now, in_page; 2627 int ret = 0; 2628 struct kvm_io_device *pio_dev; 2629 2630 vcpu->run->exit_reason = KVM_EXIT_IO; 2631 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2632 vcpu->run->io.size = vcpu->arch.pio.size = size; 2633 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2634 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2635 vcpu->run->io.port = vcpu->arch.pio.port = port; 2636 vcpu->arch.pio.in = in; 2637 vcpu->arch.pio.string = 1; 2638 vcpu->arch.pio.down = down; 2639 vcpu->arch.pio.rep = rep; 2640 2641 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2642 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2643 handler); 2644 else 2645 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2646 handler); 2647 2648 if (!count) { 2649 kvm_x86_ops->skip_emulated_instruction(vcpu); 2650 return 1; 2651 } 2652 2653 if (!down) 2654 in_page = PAGE_SIZE - offset_in_page(address); 2655 else 2656 in_page = offset_in_page(address) + size; 2657 now = min(count, (unsigned long)in_page / size); 2658 if (!now) 2659 now = 1; 2660 if (down) { 2661 /* 2662 * String I/O in reverse. Yuck. Kill the guest, fix later. 
2663 */ 2664 pr_unimpl(vcpu, "guest string pio down\n"); 2665 kvm_inject_gp(vcpu, 0); 2666 return 1; 2667 } 2668 vcpu->run->io.count = now; 2669 vcpu->arch.pio.cur_count = now; 2670 2671 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2672 kvm_x86_ops->skip_emulated_instruction(vcpu); 2673 2674 vcpu->arch.pio.guest_gva = address; 2675 2676 pio_dev = vcpu_find_pio_dev(vcpu, port, 2677 vcpu->arch.pio.cur_count, 2678 !vcpu->arch.pio.in); 2679 if (!vcpu->arch.pio.in) { 2680 /* string PIO write */ 2681 ret = pio_copy_data(vcpu); 2682 if (ret == X86EMUL_PROPAGATE_FAULT) { 2683 kvm_inject_gp(vcpu, 0); 2684 return 1; 2685 } 2686 if (ret == 0 && pio_dev) { 2687 pio_string_write(pio_dev, vcpu); 2688 complete_pio(vcpu); 2689 if (vcpu->arch.pio.count == 0) 2690 ret = 1; 2691 } 2692 } else if (pio_dev) 2693 pr_unimpl(vcpu, "no string pio read support yet, " 2694 "port %x size %d count %ld\n", 2695 port, size, count); 2696 2697 return ret; 2698 } 2699 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2700 2701 static void bounce_off(void *info) 2702 { 2703 /* nothing */ 2704 } 2705 2706 static unsigned int ref_freq; 2707 static unsigned long tsc_khz_ref; 2708 2709 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 2710 void *data) 2711 { 2712 struct cpufreq_freqs *freq = data; 2713 struct kvm *kvm; 2714 struct kvm_vcpu *vcpu; 2715 int i, send_ipi = 0; 2716 2717 if (!ref_freq) 2718 ref_freq = freq->old; 2719 2720 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 2721 return 0; 2722 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 2723 return 0; 2724 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 2725 2726 spin_lock(&kvm_lock); 2727 list_for_each_entry(kvm, &vm_list, vm_list) { 2728 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2729 vcpu = kvm->vcpus[i]; 2730 if (!vcpu) 2731 continue; 2732 if (vcpu->cpu != freq->cpu) 2733 continue; 2734 if (!kvm_request_guest_time_update(vcpu)) 2735 continue; 2736 if (vcpu->cpu 
!= smp_processor_id()) 2737 send_ipi++; 2738 } 2739 } 2740 spin_unlock(&kvm_lock); 2741 2742 if (freq->old < freq->new && send_ipi) { 2743 /* 2744 * We upscale the frequency. Must make the guest 2745 * doesn't see old kvmclock values while running with 2746 * the new frequency, otherwise we risk the guest sees 2747 * time go backwards. 2748 * 2749 * In case we update the frequency for another cpu 2750 * (which might be in guest context) send an interrupt 2751 * to kick the cpu out of guest context. Next time 2752 * guest context is entered kvmclock will be updated, 2753 * so the guest will not see stale values. 2754 */ 2755 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 2756 } 2757 return 0; 2758 } 2759 2760 static struct notifier_block kvmclock_cpufreq_notifier_block = { 2761 .notifier_call = kvmclock_cpufreq_notifier 2762 }; 2763 2764 int kvm_arch_init(void *opaque) 2765 { 2766 int r, cpu; 2767 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2768 2769 if (kvm_x86_ops) { 2770 printk(KERN_ERR "kvm: already loaded the other module\n"); 2771 r = -EEXIST; 2772 goto out; 2773 } 2774 2775 if (!ops->cpu_has_kvm_support()) { 2776 printk(KERN_ERR "kvm: no hardware support\n"); 2777 r = -EOPNOTSUPP; 2778 goto out; 2779 } 2780 if (ops->disabled_by_bios()) { 2781 printk(KERN_ERR "kvm: disabled by bios\n"); 2782 r = -EOPNOTSUPP; 2783 goto out; 2784 } 2785 2786 r = kvm_mmu_module_init(); 2787 if (r) 2788 goto out; 2789 2790 kvm_init_msr_list(); 2791 2792 kvm_x86_ops = ops; 2793 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2794 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2795 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2796 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2797 2798 for_each_possible_cpu(cpu) 2799 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2800 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 2801 tsc_khz_ref = tsc_khz; 2802 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 2803 CPUFREQ_TRANSITION_NOTIFIER); 2804 } 2805 2806 return 0; 2807 2808 out: 2809 
return r; 2810 } 2811 2812 void kvm_arch_exit(void) 2813 { 2814 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 2815 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 2816 CPUFREQ_TRANSITION_NOTIFIER); 2817 kvm_x86_ops = NULL; 2818 kvm_mmu_module_exit(); 2819 } 2820 2821 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2822 { 2823 ++vcpu->stat.halt_exits; 2824 KVMTRACE_0D(HLT, vcpu, handler); 2825 if (irqchip_in_kernel(vcpu->kvm)) { 2826 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2827 return 1; 2828 } else { 2829 vcpu->run->exit_reason = KVM_EXIT_HLT; 2830 return 0; 2831 } 2832 } 2833 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2834 2835 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2836 unsigned long a1) 2837 { 2838 if (is_long_mode(vcpu)) 2839 return a0; 2840 else 2841 return a0 | ((gpa_t)a1 << 32); 2842 } 2843 2844 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2845 { 2846 unsigned long nr, a0, a1, a2, a3, ret; 2847 int r = 1; 2848 2849 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 2850 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 2851 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 2852 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 2853 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 2854 2855 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2856 2857 if (!is_long_mode(vcpu)) { 2858 nr &= 0xFFFFFFFF; 2859 a0 &= 0xFFFFFFFF; 2860 a1 &= 0xFFFFFFFF; 2861 a2 &= 0xFFFFFFFF; 2862 a3 &= 0xFFFFFFFF; 2863 } 2864 2865 switch (nr) { 2866 case KVM_HC_VAPIC_POLL_IRQ: 2867 ret = 0; 2868 break; 2869 case KVM_HC_MMU_OP: 2870 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2871 break; 2872 default: 2873 ret = -KVM_ENOSYS; 2874 break; 2875 } 2876 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 2877 ++vcpu->stat.hypercalls; 2878 return r; 2879 } 2880 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2881 2882 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2883 { 2884 char instruction[3]; 2885 int ret = 0; 2886 unsigned long rip = kvm_rip_read(vcpu); 2887 2888 2889 /* 2890 * 
Blow out the MMU to ensure that no other VCPU has an active mapping 2891 * to ensure that the updated hypercall appears atomically across all 2892 * VCPUs. 2893 */ 2894 kvm_mmu_zap_all(vcpu->kvm); 2895 2896 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2897 if (emulator_write_emulated(rip, instruction, 3, vcpu) 2898 != X86EMUL_CONTINUE) 2899 ret = -EFAULT; 2900 2901 return ret; 2902 } 2903 2904 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2905 { 2906 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2907 } 2908 2909 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2910 { 2911 struct descriptor_table dt = { limit, base }; 2912 2913 kvm_x86_ops->set_gdt(vcpu, &dt); 2914 } 2915 2916 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2917 { 2918 struct descriptor_table dt = { limit, base }; 2919 2920 kvm_x86_ops->set_idt(vcpu, &dt); 2921 } 2922 2923 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2924 unsigned long *rflags) 2925 { 2926 kvm_lmsw(vcpu, msw); 2927 *rflags = kvm_x86_ops->get_rflags(vcpu); 2928 } 2929 2930 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2931 { 2932 unsigned long value; 2933 2934 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2935 switch (cr) { 2936 case 0: 2937 value = vcpu->arch.cr0; 2938 break; 2939 case 2: 2940 value = vcpu->arch.cr2; 2941 break; 2942 case 3: 2943 value = vcpu->arch.cr3; 2944 break; 2945 case 4: 2946 value = vcpu->arch.cr4; 2947 break; 2948 case 8: 2949 value = kvm_get_cr8(vcpu); 2950 break; 2951 default: 2952 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2953 return 0; 2954 } 2955 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, 2956 (u32)((u64)value >> 32), handler); 2957 2958 return value; 2959 } 2960 2961 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2962 unsigned long *rflags) 2963 { 2964 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, 2965 (u32)((u64)val >> 32), handler); 2966 2967 switch (cr) { 2968 case 0: 
2969 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2970 *rflags = kvm_x86_ops->get_rflags(vcpu); 2971 break; 2972 case 2: 2973 vcpu->arch.cr2 = val; 2974 break; 2975 case 3: 2976 kvm_set_cr3(vcpu, val); 2977 break; 2978 case 4: 2979 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2980 break; 2981 case 8: 2982 kvm_set_cr8(vcpu, val & 0xfUL); 2983 break; 2984 default: 2985 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2986 } 2987 } 2988 2989 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 2990 { 2991 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 2992 int j, nent = vcpu->arch.cpuid_nent; 2993 2994 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2995 /* when no next entry is found, the current entry[i] is reselected */ 2996 for (j = i + 1; ; j = (j + 1) % nent) { 2997 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2998 if (ej->function == e->function) { 2999 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 3000 return j; 3001 } 3002 } 3003 return 0; /* silence gcc, even though control never reaches here */ 3004 } 3005 3006 /* find an entry with matching function, matching index (if needed), and that 3007 * should be read next (if it's stateful) */ 3008 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3009 u32 function, u32 index) 3010 { 3011 if (e->function != function) 3012 return 0; 3013 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3014 return 0; 3015 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3016 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3017 return 0; 3018 return 1; 3019 } 3020 3021 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3022 u32 function, u32 index) 3023 { 3024 int i; 3025 struct kvm_cpuid_entry2 *best = NULL; 3026 3027 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3028 struct kvm_cpuid_entry2 *e; 3029 3030 e = &vcpu->arch.cpuid_entries[i]; 3031 if (is_matching_cpuid_entry(e, function, index)) { 3032 if (e->flags & 
KVM_CPUID_FLAG_STATEFUL_FUNC) 3033 move_to_next_stateful_cpuid_entry(vcpu, i); 3034 best = e; 3035 break; 3036 } 3037 /* 3038 * Both basic or both extended? 3039 */ 3040 if (((e->function ^ function) & 0x80000000) == 0) 3041 if (!best || e->function > best->function) 3042 best = e; 3043 } 3044 return best; 3045 } 3046 3047 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3048 { 3049 struct kvm_cpuid_entry2 *best; 3050 3051 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3052 if (best) 3053 return best->eax & 0xff; 3054 return 36; 3055 } 3056 3057 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3058 { 3059 u32 function, index; 3060 struct kvm_cpuid_entry2 *best; 3061 3062 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3063 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3064 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3065 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3066 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3067 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3068 best = kvm_find_cpuid_entry(vcpu, function, index); 3069 if (best) { 3070 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3071 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3072 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3073 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3074 } 3075 kvm_x86_ops->skip_emulated_instruction(vcpu); 3076 KVMTRACE_5D(CPUID, vcpu, function, 3077 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), 3078 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), 3079 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), 3080 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); 3081 } 3082 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3083 3084 /* 3085 * Check if userspace requested an interrupt window, and that the 3086 * interrupt window is open. 3087 * 3088 * No need to exit to userspace if we already have an interrupt queued. 
3089 */ 3090 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3091 struct kvm_run *kvm_run) 3092 { 3093 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3094 kvm_run->request_interrupt_window && 3095 kvm_arch_interrupt_allowed(vcpu)); 3096 } 3097 3098 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3099 struct kvm_run *kvm_run) 3100 { 3101 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3102 kvm_run->cr8 = kvm_get_cr8(vcpu); 3103 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3104 if (irqchip_in_kernel(vcpu->kvm)) 3105 kvm_run->ready_for_interrupt_injection = 1; 3106 else 3107 kvm_run->ready_for_interrupt_injection = 3108 kvm_arch_interrupt_allowed(vcpu) && 3109 !kvm_cpu_has_interrupt(vcpu) && 3110 !kvm_event_needs_reinjection(vcpu); 3111 } 3112 3113 static void vapic_enter(struct kvm_vcpu *vcpu) 3114 { 3115 struct kvm_lapic *apic = vcpu->arch.apic; 3116 struct page *page; 3117 3118 if (!apic || !apic->vapic_addr) 3119 return; 3120 3121 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3122 3123 vcpu->arch.apic->vapic_page = page; 3124 } 3125 3126 static void vapic_exit(struct kvm_vcpu *vcpu) 3127 { 3128 struct kvm_lapic *apic = vcpu->arch.apic; 3129 3130 if (!apic || !apic->vapic_addr) 3131 return; 3132 3133 down_read(&vcpu->kvm->slots_lock); 3134 kvm_release_page_dirty(apic->vapic_page); 3135 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3136 up_read(&vcpu->kvm->slots_lock); 3137 } 3138 3139 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3140 { 3141 int max_irr, tpr; 3142 3143 if (!kvm_x86_ops->update_cr8_intercept) 3144 return; 3145 3146 if (!vcpu->arch.apic->vapic_addr) 3147 max_irr = kvm_lapic_find_highest_irr(vcpu); 3148 else 3149 max_irr = -1; 3150 3151 if (max_irr != -1) 3152 max_irr >>= 4; 3153 3154 tpr = kvm_lapic_get_cr8(vcpu); 3155 3156 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3157 } 3158 3159 static void inject_pending_irq(struct 
kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3160 { 3161 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3162 kvm_x86_ops->set_interrupt_shadow(vcpu, 0); 3163 3164 /* try to reinject previous events if any */ 3165 if (vcpu->arch.nmi_injected) { 3166 kvm_x86_ops->set_nmi(vcpu); 3167 return; 3168 } 3169 3170 if (vcpu->arch.interrupt.pending) { 3171 kvm_x86_ops->set_irq(vcpu); 3172 return; 3173 } 3174 3175 /* try to inject new event if pending */ 3176 if (vcpu->arch.nmi_pending) { 3177 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3178 vcpu->arch.nmi_pending = false; 3179 vcpu->arch.nmi_injected = true; 3180 kvm_x86_ops->set_nmi(vcpu); 3181 } 3182 } else if (kvm_cpu_has_interrupt(vcpu)) { 3183 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3184 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3185 false); 3186 kvm_x86_ops->set_irq(vcpu); 3187 } 3188 } 3189 } 3190 3191 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3192 { 3193 int r; 3194 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3195 kvm_run->request_interrupt_window; 3196 3197 if (vcpu->requests) 3198 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3199 kvm_mmu_unload(vcpu); 3200 3201 r = kvm_mmu_reload(vcpu); 3202 if (unlikely(r)) 3203 goto out; 3204 3205 if (vcpu->requests) { 3206 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3207 __kvm_migrate_timers(vcpu); 3208 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3209 kvm_write_guest_time(vcpu); 3210 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3211 kvm_mmu_sync_roots(vcpu); 3212 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3213 kvm_x86_ops->tlb_flush(vcpu); 3214 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3215 &vcpu->requests)) { 3216 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3217 r = 0; 3218 goto out; 3219 } 3220 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3221 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3222 r = 0; 3223 goto out; 
3224 } 3225 } 3226 3227 preempt_disable(); 3228 3229 kvm_x86_ops->prepare_guest_switch(vcpu); 3230 kvm_load_guest_fpu(vcpu); 3231 3232 local_irq_disable(); 3233 3234 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3235 smp_mb__after_clear_bit(); 3236 3237 if (vcpu->requests || need_resched() || signal_pending(current)) { 3238 local_irq_enable(); 3239 preempt_enable(); 3240 r = 1; 3241 goto out; 3242 } 3243 3244 if (vcpu->arch.exception.pending) 3245 __queue_exception(vcpu); 3246 else 3247 inject_pending_irq(vcpu, kvm_run); 3248 3249 /* enable NMI/IRQ window open exits if needed */ 3250 if (vcpu->arch.nmi_pending) 3251 kvm_x86_ops->enable_nmi_window(vcpu); 3252 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3253 kvm_x86_ops->enable_irq_window(vcpu); 3254 3255 if (kvm_lapic_enabled(vcpu)) { 3256 update_cr8_intercept(vcpu); 3257 kvm_lapic_sync_to_vapic(vcpu); 3258 } 3259 3260 up_read(&vcpu->kvm->slots_lock); 3261 3262 kvm_guest_enter(); 3263 3264 get_debugreg(vcpu->arch.host_dr6, 6); 3265 get_debugreg(vcpu->arch.host_dr7, 7); 3266 if (unlikely(vcpu->arch.switch_db_regs)) { 3267 get_debugreg(vcpu->arch.host_db[0], 0); 3268 get_debugreg(vcpu->arch.host_db[1], 1); 3269 get_debugreg(vcpu->arch.host_db[2], 2); 3270 get_debugreg(vcpu->arch.host_db[3], 3); 3271 3272 set_debugreg(0, 7); 3273 set_debugreg(vcpu->arch.eff_db[0], 0); 3274 set_debugreg(vcpu->arch.eff_db[1], 1); 3275 set_debugreg(vcpu->arch.eff_db[2], 2); 3276 set_debugreg(vcpu->arch.eff_db[3], 3); 3277 } 3278 3279 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3280 kvm_x86_ops->run(vcpu, kvm_run); 3281 3282 if (unlikely(vcpu->arch.switch_db_regs)) { 3283 set_debugreg(0, 7); 3284 set_debugreg(vcpu->arch.host_db[0], 0); 3285 set_debugreg(vcpu->arch.host_db[1], 1); 3286 set_debugreg(vcpu->arch.host_db[2], 2); 3287 set_debugreg(vcpu->arch.host_db[3], 3); 3288 } 3289 set_debugreg(vcpu->arch.host_dr6, 6); 3290 set_debugreg(vcpu->arch.host_dr7, 7); 3291 3292 set_bit(KVM_REQ_KICK, &vcpu->requests); 3293 local_irq_enable(); 3294 
3295 ++vcpu->stat.exits; 3296 3297 /* 3298 * We must have an instruction between local_irq_enable() and 3299 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3300 * the interrupt shadow. The stat.exits increment will do nicely. 3301 * But we need to prevent reordering, hence this barrier(): 3302 */ 3303 barrier(); 3304 3305 kvm_guest_exit(); 3306 3307 preempt_enable(); 3308 3309 down_read(&vcpu->kvm->slots_lock); 3310 3311 /* 3312 * Profile KVM exit RIPs: 3313 */ 3314 if (unlikely(prof_on == KVM_PROFILING)) { 3315 unsigned long rip = kvm_rip_read(vcpu); 3316 profile_hit(KVM_PROFILING, (void *)rip); 3317 } 3318 3319 3320 kvm_lapic_sync_from_vapic(vcpu); 3321 3322 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3323 out: 3324 return r; 3325 } 3326 3327 3328 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3329 { 3330 int r; 3331 3332 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3333 pr_debug("vcpu %d received sipi with vector # %x\n", 3334 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3335 kvm_lapic_reset(vcpu); 3336 r = kvm_arch_vcpu_reset(vcpu); 3337 if (r) 3338 return r; 3339 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3340 } 3341 3342 down_read(&vcpu->kvm->slots_lock); 3343 vapic_enter(vcpu); 3344 3345 r = 1; 3346 while (r > 0) { 3347 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3348 r = vcpu_enter_guest(vcpu, kvm_run); 3349 else { 3350 up_read(&vcpu->kvm->slots_lock); 3351 kvm_vcpu_block(vcpu); 3352 down_read(&vcpu->kvm->slots_lock); 3353 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3354 { 3355 switch(vcpu->arch.mp_state) { 3356 case KVM_MP_STATE_HALTED: 3357 vcpu->arch.mp_state = 3358 KVM_MP_STATE_RUNNABLE; 3359 case KVM_MP_STATE_RUNNABLE: 3360 break; 3361 case KVM_MP_STATE_SIPI_RECEIVED: 3362 default: 3363 r = -EINTR; 3364 break; 3365 } 3366 } 3367 } 3368 3369 if (r <= 0) 3370 break; 3371 3372 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3373 if (kvm_cpu_has_pending_timer(vcpu)) 3374 
kvm_inject_pending_timer_irqs(vcpu); 3375 3376 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3377 r = -EINTR; 3378 kvm_run->exit_reason = KVM_EXIT_INTR; 3379 ++vcpu->stat.request_irq_exits; 3380 } 3381 if (signal_pending(current)) { 3382 r = -EINTR; 3383 kvm_run->exit_reason = KVM_EXIT_INTR; 3384 ++vcpu->stat.signal_exits; 3385 } 3386 if (need_resched()) { 3387 up_read(&vcpu->kvm->slots_lock); 3388 kvm_resched(vcpu); 3389 down_read(&vcpu->kvm->slots_lock); 3390 } 3391 } 3392 3393 up_read(&vcpu->kvm->slots_lock); 3394 post_kvm_run_save(vcpu, kvm_run); 3395 3396 vapic_exit(vcpu); 3397 3398 return r; 3399 } 3400 3401 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3402 { 3403 int r; 3404 sigset_t sigsaved; 3405 3406 vcpu_load(vcpu); 3407 3408 if (vcpu->sigset_active) 3409 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3410 3411 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3412 kvm_vcpu_block(vcpu); 3413 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3414 r = -EAGAIN; 3415 goto out; 3416 } 3417 3418 /* re-sync apic's tpr */ 3419 if (!irqchip_in_kernel(vcpu->kvm)) 3420 kvm_set_cr8(vcpu, kvm_run->cr8); 3421 3422 if (vcpu->arch.pio.cur_count) { 3423 r = complete_pio(vcpu); 3424 if (r) 3425 goto out; 3426 } 3427 #if CONFIG_HAS_IOMEM 3428 if (vcpu->mmio_needed) { 3429 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3430 vcpu->mmio_read_completed = 1; 3431 vcpu->mmio_needed = 0; 3432 3433 down_read(&vcpu->kvm->slots_lock); 3434 r = emulate_instruction(vcpu, kvm_run, 3435 vcpu->arch.mmio_fault_cr2, 0, 3436 EMULTYPE_NO_DECODE); 3437 up_read(&vcpu->kvm->slots_lock); 3438 if (r == EMULATE_DO_MMIO) { 3439 /* 3440 * Read-modify-write. Back to userspace. 
3441 */ 3442 r = 0; 3443 goto out; 3444 } 3445 } 3446 #endif 3447 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3448 kvm_register_write(vcpu, VCPU_REGS_RAX, 3449 kvm_run->hypercall.ret); 3450 3451 r = __vcpu_run(vcpu, kvm_run); 3452 3453 out: 3454 if (vcpu->sigset_active) 3455 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3456 3457 vcpu_put(vcpu); 3458 return r; 3459 } 3460 3461 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3462 { 3463 vcpu_load(vcpu); 3464 3465 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3466 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3467 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3468 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3469 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3470 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3471 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3472 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3473 #ifdef CONFIG_X86_64 3474 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3475 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3476 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3477 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3478 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3479 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3480 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3481 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3482 #endif 3483 3484 regs->rip = kvm_rip_read(vcpu); 3485 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3486 3487 /* 3488 * Don't leak debug flags in case they were set for guest debugging 3489 */ 3490 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3491 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3492 3493 vcpu_put(vcpu); 3494 3495 return 0; 3496 } 3497 3498 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3499 { 3500 vcpu_load(vcpu); 3501 3502 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3503 kvm_register_write(vcpu, 
VCPU_REGS_RBX, regs->rbx); 3504 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3505 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3506 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3507 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3508 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3509 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3510 #ifdef CONFIG_X86_64 3511 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3512 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3513 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3514 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3515 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3516 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3517 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3518 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3519 3520 #endif 3521 3522 kvm_rip_write(vcpu, regs->rip); 3523 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3524 3525 3526 vcpu->arch.exception.pending = false; 3527 3528 vcpu_put(vcpu); 3529 3530 return 0; 3531 } 3532 3533 void kvm_get_segment(struct kvm_vcpu *vcpu, 3534 struct kvm_segment *var, int seg) 3535 { 3536 kvm_x86_ops->get_segment(vcpu, var, seg); 3537 } 3538 3539 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3540 { 3541 struct kvm_segment cs; 3542 3543 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3544 *db = cs.db; 3545 *l = cs.l; 3546 } 3547 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3548 3549 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3550 struct kvm_sregs *sregs) 3551 { 3552 struct descriptor_table dt; 3553 3554 vcpu_load(vcpu); 3555 3556 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3557 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3558 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3559 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3560 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3561 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3562 3563 kvm_get_segment(vcpu, 
&sregs->tr, VCPU_SREG_TR); 3564 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3565 3566 kvm_x86_ops->get_idt(vcpu, &dt); 3567 sregs->idt.limit = dt.limit; 3568 sregs->idt.base = dt.base; 3569 kvm_x86_ops->get_gdt(vcpu, &dt); 3570 sregs->gdt.limit = dt.limit; 3571 sregs->gdt.base = dt.base; 3572 3573 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3574 sregs->cr0 = vcpu->arch.cr0; 3575 sregs->cr2 = vcpu->arch.cr2; 3576 sregs->cr3 = vcpu->arch.cr3; 3577 sregs->cr4 = vcpu->arch.cr4; 3578 sregs->cr8 = kvm_get_cr8(vcpu); 3579 sregs->efer = vcpu->arch.shadow_efer; 3580 sregs->apic_base = kvm_get_apic_base(vcpu); 3581 3582 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 3583 3584 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 3585 set_bit(vcpu->arch.interrupt.nr, 3586 (unsigned long *)sregs->interrupt_bitmap); 3587 3588 vcpu_put(vcpu); 3589 3590 return 0; 3591 } 3592 3593 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3594 struct kvm_mp_state *mp_state) 3595 { 3596 vcpu_load(vcpu); 3597 mp_state->mp_state = vcpu->arch.mp_state; 3598 vcpu_put(vcpu); 3599 return 0; 3600 } 3601 3602 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3603 struct kvm_mp_state *mp_state) 3604 { 3605 vcpu_load(vcpu); 3606 vcpu->arch.mp_state = mp_state->mp_state; 3607 vcpu_put(vcpu); 3608 return 0; 3609 } 3610 3611 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3612 struct kvm_segment *var, int seg) 3613 { 3614 kvm_x86_ops->set_segment(vcpu, var, seg); 3615 } 3616 3617 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3618 struct kvm_segment *kvm_desct) 3619 { 3620 kvm_desct->base = seg_desc->base0; 3621 kvm_desct->base |= seg_desc->base1 << 16; 3622 kvm_desct->base |= seg_desc->base2 << 24; 3623 kvm_desct->limit = seg_desc->limit0; 3624 kvm_desct->limit |= seg_desc->limit << 16; 3625 if (seg_desc->g) { 3626 kvm_desct->limit <<= 12; 3627 kvm_desct->limit |= 0xfff; 3628 } 3629 kvm_desct->selector = 
selector; 3630 kvm_desct->type = seg_desc->type; 3631 kvm_desct->present = seg_desc->p; 3632 kvm_desct->dpl = seg_desc->dpl; 3633 kvm_desct->db = seg_desc->d; 3634 kvm_desct->s = seg_desc->s; 3635 kvm_desct->l = seg_desc->l; 3636 kvm_desct->g = seg_desc->g; 3637 kvm_desct->avl = seg_desc->avl; 3638 if (!selector) 3639 kvm_desct->unusable = 1; 3640 else 3641 kvm_desct->unusable = 0; 3642 kvm_desct->padding = 0; 3643 } 3644 3645 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 3646 u16 selector, 3647 struct descriptor_table *dtable) 3648 { 3649 if (selector & 1 << 2) { 3650 struct kvm_segment kvm_seg; 3651 3652 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3653 3654 if (kvm_seg.unusable) 3655 dtable->limit = 0; 3656 else 3657 dtable->limit = kvm_seg.limit; 3658 dtable->base = kvm_seg.base; 3659 } 3660 else 3661 kvm_x86_ops->get_gdt(vcpu, dtable); 3662 } 3663 3664 /* allowed just for 8 bytes segments */ 3665 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3666 struct desc_struct *seg_desc) 3667 { 3668 gpa_t gpa; 3669 struct descriptor_table dtable; 3670 u16 index = selector >> 3; 3671 3672 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3673 3674 if (dtable.limit < index * 8 + 7) { 3675 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3676 return 1; 3677 } 3678 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3679 gpa += index * 8; 3680 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); 3681 } 3682 3683 /* allowed just for 8 bytes segments */ 3684 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3685 struct desc_struct *seg_desc) 3686 { 3687 gpa_t gpa; 3688 struct descriptor_table dtable; 3689 u16 index = selector >> 3; 3690 3691 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3692 3693 if (dtable.limit < index * 8 + 7) 3694 return 1; 3695 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3696 gpa += index * 8; 3697 return kvm_write_guest(vcpu->kvm, gpa, 
seg_desc, 8); 3698 } 3699 3700 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3701 struct desc_struct *seg_desc) 3702 { 3703 u32 base_addr; 3704 3705 base_addr = seg_desc->base0; 3706 base_addr |= (seg_desc->base1 << 16); 3707 base_addr |= (seg_desc->base2 << 24); 3708 3709 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 3710 } 3711 3712 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3713 { 3714 struct kvm_segment kvm_seg; 3715 3716 kvm_get_segment(vcpu, &kvm_seg, seg); 3717 return kvm_seg.selector; 3718 } 3719 3720 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3721 u16 selector, 3722 struct kvm_segment *kvm_seg) 3723 { 3724 struct desc_struct seg_desc; 3725 3726 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3727 return 1; 3728 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3729 return 0; 3730 } 3731 3732 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 3733 { 3734 struct kvm_segment segvar = { 3735 .base = selector << 4, 3736 .limit = 0xffff, 3737 .selector = selector, 3738 .type = 3, 3739 .present = 1, 3740 .dpl = 3, 3741 .db = 0, 3742 .s = 1, 3743 .l = 0, 3744 .g = 0, 3745 .avl = 0, 3746 .unusable = 0, 3747 }; 3748 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 3749 return 0; 3750 } 3751 3752 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3753 int type_bits, int seg) 3754 { 3755 struct kvm_segment kvm_seg; 3756 3757 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 3758 return kvm_load_realmode_segment(vcpu, selector, seg); 3759 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3760 return 1; 3761 kvm_seg.type |= type_bits; 3762 3763 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3764 seg != VCPU_SREG_LDTR) 3765 if (!kvm_seg.s) 3766 kvm_seg.unusable = 1; 3767 3768 kvm_set_segment(vcpu, &kvm_seg, seg); 3769 return 0; 3770 } 3771 3772 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 3773 struct tss_segment_32 *tss) 3774 { 
3775 tss->cr3 = vcpu->arch.cr3; 3776 tss->eip = kvm_rip_read(vcpu); 3777 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3778 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3779 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3780 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3781 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3782 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3783 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3784 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3785 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3786 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3787 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3788 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3789 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3790 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3791 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3792 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3793 } 3794 3795 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3796 struct tss_segment_32 *tss) 3797 { 3798 kvm_set_cr3(vcpu, tss->cr3); 3799 3800 kvm_rip_write(vcpu, tss->eip); 3801 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3802 3803 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 3804 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 3805 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 3806 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 3807 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 3808 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 3809 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 3810 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 3811 3812 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3813 return 1; 3814 3815 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3816 return 1; 3817 3818 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3819 return 1; 3820 3821 if 
(kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3822 return 1; 3823 3824 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3825 return 1; 3826 3827 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3828 return 1; 3829 3830 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3831 return 1; 3832 return 0; 3833 } 3834 3835 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3836 struct tss_segment_16 *tss) 3837 { 3838 tss->ip = kvm_rip_read(vcpu); 3839 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3840 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3841 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3842 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3843 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3844 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3845 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3846 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 3847 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 3848 3849 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3850 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3851 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3852 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3853 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3854 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3855 } 3856 3857 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3858 struct tss_segment_16 *tss) 3859 { 3860 kvm_rip_write(vcpu, tss->ip); 3861 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3862 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 3863 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 3864 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 3865 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 3866 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 3867 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 3868 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 3869 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 
3870 3871 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3872 return 1; 3873 3874 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3875 return 1; 3876 3877 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3878 return 1; 3879 3880 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3881 return 1; 3882 3883 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3884 return 1; 3885 return 0; 3886 } 3887 3888 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3889 u16 old_tss_sel, u32 old_tss_base, 3890 struct desc_struct *nseg_desc) 3891 { 3892 struct tss_segment_16 tss_segment_16; 3893 int ret = 0; 3894 3895 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3896 sizeof tss_segment_16)) 3897 goto out; 3898 3899 save_state_to_tss16(vcpu, &tss_segment_16); 3900 3901 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3902 sizeof tss_segment_16)) 3903 goto out; 3904 3905 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3906 &tss_segment_16, sizeof tss_segment_16)) 3907 goto out; 3908 3909 if (old_tss_sel != 0xffff) { 3910 tss_segment_16.prev_task_link = old_tss_sel; 3911 3912 if (kvm_write_guest(vcpu->kvm, 3913 get_tss_base_addr(vcpu, nseg_desc), 3914 &tss_segment_16.prev_task_link, 3915 sizeof tss_segment_16.prev_task_link)) 3916 goto out; 3917 } 3918 3919 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3920 goto out; 3921 3922 ret = 1; 3923 out: 3924 return ret; 3925 } 3926 3927 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3928 u16 old_tss_sel, u32 old_tss_base, 3929 struct desc_struct *nseg_desc) 3930 { 3931 struct tss_segment_32 tss_segment_32; 3932 int ret = 0; 3933 3934 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3935 sizeof tss_segment_32)) 3936 goto out; 3937 3938 save_state_to_tss32(vcpu, &tss_segment_32); 3939 3940 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3941 
sizeof tss_segment_32)) 3942 goto out; 3943 3944 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3945 &tss_segment_32, sizeof tss_segment_32)) 3946 goto out; 3947 3948 if (old_tss_sel != 0xffff) { 3949 tss_segment_32.prev_task_link = old_tss_sel; 3950 3951 if (kvm_write_guest(vcpu->kvm, 3952 get_tss_base_addr(vcpu, nseg_desc), 3953 &tss_segment_32.prev_task_link, 3954 sizeof tss_segment_32.prev_task_link)) 3955 goto out; 3956 } 3957 3958 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3959 goto out; 3960 3961 ret = 1; 3962 out: 3963 return ret; 3964 } 3965 3966 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3967 { 3968 struct kvm_segment tr_seg; 3969 struct desc_struct cseg_desc; 3970 struct desc_struct nseg_desc; 3971 int ret = 0; 3972 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 3973 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 3974 3975 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 3976 3977 /* FIXME: Handle errors. Failure to read either TSS or their 3978 * descriptors should generate a pagefault. 
3979 */ 3980 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3981 goto out; 3982 3983 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 3984 goto out; 3985 3986 if (reason != TASK_SWITCH_IRET) { 3987 int cpl; 3988 3989 cpl = kvm_x86_ops->get_cpl(vcpu); 3990 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3991 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3992 return 1; 3993 } 3994 } 3995 3996 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3997 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3998 return 1; 3999 } 4000 4001 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 4002 cseg_desc.type &= ~(1 << 1); //clear the B flag 4003 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 4004 } 4005 4006 if (reason == TASK_SWITCH_IRET) { 4007 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4008 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4009 } 4010 4011 /* set back link to prev task only if NT bit is set in eflags 4012 note that old_tss_sel is not used afetr this point */ 4013 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4014 old_tss_sel = 0xffff; 4015 4016 /* set back link to prev task only if NT bit is set in eflags 4017 note that old_tss_sel is not used afetr this point */ 4018 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4019 old_tss_sel = 0xffff; 4020 4021 if (nseg_desc.type & 8) 4022 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4023 old_tss_base, &nseg_desc); 4024 else 4025 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, 4026 old_tss_base, &nseg_desc); 4027 4028 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4029 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4030 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4031 } 4032 4033 if (reason != TASK_SWITCH_IRET) { 4034 nseg_desc.type |= (1 << 1); 4035 save_guest_segment_descriptor(vcpu, tss_selector, 4036 &nseg_desc); 4037 } 
4038 4039 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 4040 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 4041 tr_seg.type = 11; 4042 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 4043 out: 4044 return ret; 4045 } 4046 EXPORT_SYMBOL_GPL(kvm_task_switch); 4047 4048 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 4049 struct kvm_sregs *sregs) 4050 { 4051 int mmu_reset_needed = 0; 4052 int pending_vec, max_bits; 4053 struct descriptor_table dt; 4054 4055 vcpu_load(vcpu); 4056 4057 dt.limit = sregs->idt.limit; 4058 dt.base = sregs->idt.base; 4059 kvm_x86_ops->set_idt(vcpu, &dt); 4060 dt.limit = sregs->gdt.limit; 4061 dt.base = sregs->gdt.base; 4062 kvm_x86_ops->set_gdt(vcpu, &dt); 4063 4064 vcpu->arch.cr2 = sregs->cr2; 4065 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4066 4067 down_read(&vcpu->kvm->slots_lock); 4068 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) 4069 vcpu->arch.cr3 = sregs->cr3; 4070 else 4071 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 4072 up_read(&vcpu->kvm->slots_lock); 4073 4074 kvm_set_cr8(vcpu, sregs->cr8); 4075 4076 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 4077 kvm_x86_ops->set_efer(vcpu, sregs->efer); 4078 kvm_set_apic_base(vcpu, sregs->apic_base); 4079 4080 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4081 4082 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 4083 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 4084 vcpu->arch.cr0 = sregs->cr0; 4085 4086 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4087 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4088 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4089 load_pdptrs(vcpu, vcpu->arch.cr3); 4090 4091 if (mmu_reset_needed) 4092 kvm_mmu_reset_context(vcpu); 4093 4094 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 4095 pending_vec = find_first_bit( 4096 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 4097 if (pending_vec < max_bits) { 4098 kvm_queue_interrupt(vcpu, pending_vec, false); 4099 pr_debug("Set back pending irq %d\n", 
pending_vec); 4100 if (irqchip_in_kernel(vcpu->kvm)) 4101 kvm_pic_clear_isr_ack(vcpu->kvm); 4102 } 4103 4104 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4105 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4106 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4107 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4108 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4109 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4110 4111 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4112 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4113 4114 /* Older userspace won't unhalt the vcpu on reset. */ 4115 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 4116 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4117 !(vcpu->arch.cr0 & X86_CR0_PE)) 4118 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4119 4120 vcpu_put(vcpu); 4121 4122 return 0; 4123 } 4124 4125 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4126 struct kvm_guest_debug *dbg) 4127 { 4128 int i, r; 4129 4130 vcpu_load(vcpu); 4131 4132 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4133 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4134 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4135 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4136 vcpu->arch.switch_db_regs = 4137 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4138 } else { 4139 for (i = 0; i < KVM_NR_DB_REGS; i++) 4140 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4141 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4142 } 4143 4144 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4145 4146 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4147 kvm_queue_exception(vcpu, DB_VECTOR); 4148 else if (dbg->control & KVM_GUESTDBG_INJECT_BP) 4149 kvm_queue_exception(vcpu, BP_VECTOR); 4150 4151 vcpu_put(vcpu); 4152 4153 return r; 4154 } 4155 4156 /* 4157 * fxsave fpu state. Taken from x86_64/processor.h. 
To be killed when 4158 * we have asm/x86/processor.h 4159 */ 4160 struct fxsave { 4161 u16 cwd; 4162 u16 swd; 4163 u16 twd; 4164 u16 fop; 4165 u64 rip; 4166 u64 rdp; 4167 u32 mxcsr; 4168 u32 mxcsr_mask; 4169 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4170 #ifdef CONFIG_X86_64 4171 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4172 #else 4173 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4174 #endif 4175 }; 4176 4177 /* 4178 * Translate a guest virtual address to a guest physical address. 4179 */ 4180 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 4181 struct kvm_translation *tr) 4182 { 4183 unsigned long vaddr = tr->linear_address; 4184 gpa_t gpa; 4185 4186 vcpu_load(vcpu); 4187 down_read(&vcpu->kvm->slots_lock); 4188 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 4189 up_read(&vcpu->kvm->slots_lock); 4190 tr->physical_address = gpa; 4191 tr->valid = gpa != UNMAPPED_GVA; 4192 tr->writeable = 1; 4193 tr->usermode = 0; 4194 vcpu_put(vcpu); 4195 4196 return 0; 4197 } 4198 4199 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4200 { 4201 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4202 4203 vcpu_load(vcpu); 4204 4205 memcpy(fpu->fpr, fxsave->st_space, 128); 4206 fpu->fcw = fxsave->cwd; 4207 fpu->fsw = fxsave->swd; 4208 fpu->ftwx = fxsave->twd; 4209 fpu->last_opcode = fxsave->fop; 4210 fpu->last_ip = fxsave->rip; 4211 fpu->last_dp = fxsave->rdp; 4212 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 4213 4214 vcpu_put(vcpu); 4215 4216 return 0; 4217 } 4218 4219 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4220 { 4221 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4222 4223 vcpu_load(vcpu); 4224 4225 memcpy(fxsave->st_space, fpu->fpr, 128); 4226 fxsave->cwd = fpu->fcw; 4227 fxsave->swd = fpu->fsw; 4228 fxsave->twd = fpu->ftwx; 4229 fxsave->fop = fpu->last_opcode; 4230 fxsave->rip = 
fpu->last_ip; 4231 fxsave->rdp = fpu->last_dp; 4232 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 4233 4234 vcpu_put(vcpu); 4235 4236 return 0; 4237 } 4238 4239 void fx_init(struct kvm_vcpu *vcpu) 4240 { 4241 unsigned after_mxcsr_mask; 4242 4243 /* 4244 * Touch the fpu the first time in non atomic context as if 4245 * this is the first fpu instruction the exception handler 4246 * will fire before the instruction returns and it'll have to 4247 * allocate ram with GFP_KERNEL. 4248 */ 4249 if (!used_math()) 4250 kvm_fx_save(&vcpu->arch.host_fx_image); 4251 4252 /* Initialize guest FPU by resetting ours and saving into guest's */ 4253 preempt_disable(); 4254 kvm_fx_save(&vcpu->arch.host_fx_image); 4255 kvm_fx_finit(); 4256 kvm_fx_save(&vcpu->arch.guest_fx_image); 4257 kvm_fx_restore(&vcpu->arch.host_fx_image); 4258 preempt_enable(); 4259 4260 vcpu->arch.cr0 |= X86_CR0_ET; 4261 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 4262 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 4263 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 4264 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 4265 } 4266 EXPORT_SYMBOL_GPL(fx_init); 4267 4268 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 4269 { 4270 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 4271 return; 4272 4273 vcpu->guest_fpu_loaded = 1; 4274 kvm_fx_save(&vcpu->arch.host_fx_image); 4275 kvm_fx_restore(&vcpu->arch.guest_fx_image); 4276 } 4277 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 4278 4279 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 4280 { 4281 if (!vcpu->guest_fpu_loaded) 4282 return; 4283 4284 vcpu->guest_fpu_loaded = 0; 4285 kvm_fx_save(&vcpu->arch.guest_fx_image); 4286 kvm_fx_restore(&vcpu->arch.host_fx_image); 4287 ++vcpu->stat.fpu_reload; 4288 } 4289 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 4290 4291 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 4292 { 4293 if (vcpu->arch.time_page) { 4294 kvm_release_page_dirty(vcpu->arch.time_page); 4295 vcpu->arch.time_page = 
NULL; 4296 } 4297 4298 kvm_x86_ops->vcpu_free(vcpu); 4299 } 4300 4301 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 4302 unsigned int id) 4303 { 4304 return kvm_x86_ops->vcpu_create(kvm, id); 4305 } 4306 4307 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 4308 { 4309 int r; 4310 4311 /* We do fxsave: this must be aligned. */ 4312 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 4313 4314 vcpu->arch.mtrr_state.have_fixed = 1; 4315 vcpu_load(vcpu); 4316 r = kvm_arch_vcpu_reset(vcpu); 4317 if (r == 0) 4318 r = kvm_mmu_setup(vcpu); 4319 vcpu_put(vcpu); 4320 if (r < 0) 4321 goto free_vcpu; 4322 4323 return 0; 4324 free_vcpu: 4325 kvm_x86_ops->vcpu_free(vcpu); 4326 return r; 4327 } 4328 4329 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 4330 { 4331 vcpu_load(vcpu); 4332 kvm_mmu_unload(vcpu); 4333 vcpu_put(vcpu); 4334 4335 kvm_x86_ops->vcpu_free(vcpu); 4336 } 4337 4338 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4339 { 4340 vcpu->arch.nmi_pending = false; 4341 vcpu->arch.nmi_injected = false; 4342 4343 vcpu->arch.switch_db_regs = 0; 4344 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 4345 vcpu->arch.dr6 = DR6_FIXED_1; 4346 vcpu->arch.dr7 = DR7_FIXED_1; 4347 4348 return kvm_x86_ops->vcpu_reset(vcpu); 4349 } 4350 4351 void kvm_arch_hardware_enable(void *garbage) 4352 { 4353 kvm_x86_ops->hardware_enable(garbage); 4354 } 4355 4356 void kvm_arch_hardware_disable(void *garbage) 4357 { 4358 kvm_x86_ops->hardware_disable(garbage); 4359 } 4360 4361 int kvm_arch_hardware_setup(void) 4362 { 4363 return kvm_x86_ops->hardware_setup(); 4364 } 4365 4366 void kvm_arch_hardware_unsetup(void) 4367 { 4368 kvm_x86_ops->hardware_unsetup(); 4369 } 4370 4371 void kvm_arch_check_processor_compat(void *rtn) 4372 { 4373 kvm_x86_ops->check_processor_compatibility(rtn); 4374 } 4375 4376 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 4377 { 4378 struct page *page; 4379 struct kvm *kvm; 4380 int r; 4381 4382 BUG_ON(vcpu->kvm == NULL); 4383 kvm = vcpu->kvm; 4384 4385 
vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4386 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 4387 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4388 else 4389 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 4390 4391 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 4392 if (!page) { 4393 r = -ENOMEM; 4394 goto fail; 4395 } 4396 vcpu->arch.pio_data = page_address(page); 4397 4398 r = kvm_mmu_create(vcpu); 4399 if (r < 0) 4400 goto fail_free_pio_data; 4401 4402 if (irqchip_in_kernel(kvm)) { 4403 r = kvm_create_lapic(vcpu); 4404 if (r < 0) 4405 goto fail_mmu_destroy; 4406 } 4407 4408 return 0; 4409 4410 fail_mmu_destroy: 4411 kvm_mmu_destroy(vcpu); 4412 fail_free_pio_data: 4413 free_page((unsigned long)vcpu->arch.pio_data); 4414 fail: 4415 return r; 4416 } 4417 4418 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 4419 { 4420 kvm_free_lapic(vcpu); 4421 down_read(&vcpu->kvm->slots_lock); 4422 kvm_mmu_destroy(vcpu); 4423 up_read(&vcpu->kvm->slots_lock); 4424 free_page((unsigned long)vcpu->arch.pio_data); 4425 } 4426 4427 struct kvm *kvm_arch_create_vm(void) 4428 { 4429 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 4430 4431 if (!kvm) 4432 return ERR_PTR(-ENOMEM); 4433 4434 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4435 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4436 4437 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4438 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 4439 4440 rdtscll(kvm->arch.vm_init_tsc); 4441 4442 return kvm; 4443 } 4444 4445 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 4446 { 4447 vcpu_load(vcpu); 4448 kvm_mmu_unload(vcpu); 4449 vcpu_put(vcpu); 4450 } 4451 4452 static void kvm_free_vcpus(struct kvm *kvm) 4453 { 4454 unsigned int i; 4455 4456 /* 4457 * Unpin any mmu pages first. 
4458 */ 4459 for (i = 0; i < KVM_MAX_VCPUS; ++i) 4460 if (kvm->vcpus[i]) 4461 kvm_unload_vcpu_mmu(kvm->vcpus[i]); 4462 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 4463 if (kvm->vcpus[i]) { 4464 kvm_arch_vcpu_free(kvm->vcpus[i]); 4465 kvm->vcpus[i] = NULL; 4466 } 4467 } 4468 4469 } 4470 4471 void kvm_arch_sync_events(struct kvm *kvm) 4472 { 4473 kvm_free_all_assigned_devices(kvm); 4474 } 4475 4476 void kvm_arch_destroy_vm(struct kvm *kvm) 4477 { 4478 kvm_iommu_unmap_guest(kvm); 4479 kvm_free_pit(kvm); 4480 kfree(kvm->arch.vpic); 4481 kfree(kvm->arch.vioapic); 4482 kvm_free_vcpus(kvm); 4483 kvm_free_physmem(kvm); 4484 if (kvm->arch.apic_access_page) 4485 put_page(kvm->arch.apic_access_page); 4486 if (kvm->arch.ept_identity_pagetable) 4487 put_page(kvm->arch.ept_identity_pagetable); 4488 kfree(kvm); 4489 } 4490 4491 int kvm_arch_set_memory_region(struct kvm *kvm, 4492 struct kvm_userspace_memory_region *mem, 4493 struct kvm_memory_slot old, 4494 int user_alloc) 4495 { 4496 int npages = mem->memory_size >> PAGE_SHIFT; 4497 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 4498 4499 /*To keep backward compatibility with older userspace, 4500 *x86 needs to hanlde !user_alloc case. 
4501 */ 4502 if (!user_alloc) { 4503 if (npages && !old.rmap) { 4504 unsigned long userspace_addr; 4505 4506 down_write(¤t->mm->mmap_sem); 4507 userspace_addr = do_mmap(NULL, 0, 4508 npages * PAGE_SIZE, 4509 PROT_READ | PROT_WRITE, 4510 MAP_PRIVATE | MAP_ANONYMOUS, 4511 0); 4512 up_write(¤t->mm->mmap_sem); 4513 4514 if (IS_ERR((void *)userspace_addr)) 4515 return PTR_ERR((void *)userspace_addr); 4516 4517 /* set userspace_addr atomically for kvm_hva_to_rmapp */ 4518 spin_lock(&kvm->mmu_lock); 4519 memslot->userspace_addr = userspace_addr; 4520 spin_unlock(&kvm->mmu_lock); 4521 } else { 4522 if (!old.user_alloc && old.rmap) { 4523 int ret; 4524 4525 down_write(¤t->mm->mmap_sem); 4526 ret = do_munmap(current->mm, old.userspace_addr, 4527 old.npages * PAGE_SIZE); 4528 up_write(¤t->mm->mmap_sem); 4529 if (ret < 0) 4530 printk(KERN_WARNING 4531 "kvm_vm_ioctl_set_memory_region: " 4532 "failed to munmap memory\n"); 4533 } 4534 } 4535 } 4536 4537 spin_lock(&kvm->mmu_lock); 4538 if (!kvm->arch.n_requested_mmu_pages) { 4539 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 4540 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 4541 } 4542 4543 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4544 spin_unlock(&kvm->mmu_lock); 4545 kvm_flush_remote_tlbs(kvm); 4546 4547 return 0; 4548 } 4549 4550 void kvm_arch_flush_shadow(struct kvm *kvm) 4551 { 4552 kvm_mmu_zap_all(kvm); 4553 kvm_reload_remote_mmus(kvm); 4554 } 4555 4556 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4557 { 4558 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4559 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 4560 || vcpu->arch.nmi_pending; 4561 } 4562 4563 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4564 { 4565 int me; 4566 int cpu = vcpu->cpu; 4567 4568 if (waitqueue_active(&vcpu->wq)) { 4569 wake_up_interruptible(&vcpu->wq); 4570 ++vcpu->stat.halt_wakeup; 4571 } 4572 4573 me = get_cpu(); 4574 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 4575 if 
(!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 4576 smp_send_reschedule(cpu); 4577 put_cpu(); 4578 } 4579 4580 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 4581 { 4582 return kvm_x86_ops->interrupt_allowed(vcpu); 4583 } 4584