/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
					      u32 function, u32 index);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped",
		VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {		/* from ldt */
		u16 ldt_selector;

		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = d->base0 | ((unsigned long)d->base1 << 16) |
		((unsigned long)d->base2 << 24);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;

	if (vcpu->arch.exception.pending) {
		if (vcpu->arch.exception.nr == PF_VECTOR) {
			printk(KERN_DEBUG "kvm: inject_page_fault:"
					" double fault 0x%lx\n", addr);
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
		}
		return;
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

static void __queue_exception(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
				     vcpu->arch.exception.has_error_code,
				     vcpu->arch.exception.error_code);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
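 *
 * A brief sketch of what is read and checked below: in PAE mode CR3
 * points at a 32-byte-aligned group of four 8-byte PDPTEs, so
 * ((cr3 & (PAGE_SIZE-1)) >> 5) << 2 is that group's position within
 * its page, counted in u64 entries.  An entry is accepted as valid if
 * it is either not present, or present with none of the reserved bits
 * (rsvd_bits_mask) set.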
219 */ 220 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 221 { 222 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 223 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 224 int i; 225 int ret; 226 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 227 228 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 229 offset * sizeof(u64), sizeof(pdpte)); 230 if (ret < 0) { 231 ret = 0; 232 goto out; 233 } 234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 235 if (is_present_pte(pdpte[i]) && 236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 237 ret = 0; 238 goto out; 239 } 240 } 241 ret = 1; 242 243 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 244 out: 245 246 return ret; 247 } 248 EXPORT_SYMBOL_GPL(load_pdptrs); 249 250 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 251 { 252 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 253 bool changed = true; 254 int r; 255 256 if (is_long_mode(vcpu) || !is_pae(vcpu)) 257 return false; 258 259 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 260 if (r < 0) 261 goto out; 262 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 263 out: 264 265 return changed; 266 } 267 268 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 269 { 270 if (cr0 & CR0_RESERVED_BITS) { 271 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 272 cr0, vcpu->arch.cr0); 273 kvm_inject_gp(vcpu, 0); 274 return; 275 } 276 277 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 278 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 279 kvm_inject_gp(vcpu, 0); 280 return; 281 } 282 283 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 284 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 285 "and a clear PE flag\n"); 286 kvm_inject_gp(vcpu, 0); 287 return; 288 } 289 290 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 291 #ifdef CONFIG_X86_64 292 if ((vcpu->arch.shadow_efer & EFER_LME)) { 293 int cs_db, cs_l; 294 295 if (!is_pae(vcpu)) { 296 printk(KERN_DEBUG "set_cr0: #GP, start paging " 297 "in long mode while PAE is disabled\n"); 298 kvm_inject_gp(vcpu, 0); 299 return; 300 } 301 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 302 if (cs_l) { 303 printk(KERN_DEBUG "set_cr0: #GP, start paging " 304 "in long mode while CS.L == 1\n"); 305 kvm_inject_gp(vcpu, 0); 306 return; 307 308 } 309 } else 310 #endif 311 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 312 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 313 "reserved bits\n"); 314 kvm_inject_gp(vcpu, 0); 315 return; 316 } 317 318 } 319 320 kvm_x86_ops->set_cr0(vcpu, cr0); 321 vcpu->arch.cr0 = cr0; 322 323 kvm_mmu_reset_context(vcpu); 324 return; 325 } 326 EXPORT_SYMBOL_GPL(kvm_set_cr0); 327 328 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 329 { 330 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 331 KVMTRACE_1D(LMSW, vcpu, 332 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 333 handler); 334 } 335 EXPORT_SYMBOL_GPL(kvm_lmsw); 336 337 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 338 { 339 unsigned long old_cr4 = vcpu->arch.cr4; 340 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 341 342 if (cr4 & CR4_RESERVED_BITS) { 343 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 344 kvm_inject_gp(vcpu, 0); 345 return; 346 } 347 348 if (is_long_mode(vcpu)) { 349 if (!(cr4 & X86_CR4_PAE)) { 350 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 351 "in long mode\n"); 352 kvm_inject_gp(vcpu, 0); 353 return; 354 } 355 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 356 && ((cr4 ^ old_cr4) & pdptr_bits) 357 && 
!load_pdptrs(vcpu, vcpu->arch.cr3)) { 358 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 359 kvm_inject_gp(vcpu, 0); 360 return; 361 } 362 363 if (cr4 & X86_CR4_VMXE) { 364 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 365 kvm_inject_gp(vcpu, 0); 366 return; 367 } 368 kvm_x86_ops->set_cr4(vcpu, cr4); 369 vcpu->arch.cr4 = cr4; 370 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 371 kvm_mmu_reset_context(vcpu); 372 } 373 EXPORT_SYMBOL_GPL(kvm_set_cr4); 374 375 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 376 { 377 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 378 kvm_mmu_sync_roots(vcpu); 379 kvm_mmu_flush_tlb(vcpu); 380 return; 381 } 382 383 if (is_long_mode(vcpu)) { 384 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 385 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 386 kvm_inject_gp(vcpu, 0); 387 return; 388 } 389 } else { 390 if (is_pae(vcpu)) { 391 if (cr3 & CR3_PAE_RESERVED_BITS) { 392 printk(KERN_DEBUG 393 "set_cr3: #GP, reserved bits\n"); 394 kvm_inject_gp(vcpu, 0); 395 return; 396 } 397 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 398 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 399 "reserved bits\n"); 400 kvm_inject_gp(vcpu, 0); 401 return; 402 } 403 } 404 /* 405 * We don't check reserved bits in nonpae mode, because 406 * this isn't enforced, and VMware depends on this. 407 */ 408 } 409 410 /* 411 * Does the new cr3 value map to physical memory? (Note, we 412 * catch an invalid cr3 even in real-mode, because it would 413 * cause trouble later on when we turn on paging anyway.) 414 * 415 * A real CPU would silently accept an invalid cr3 and would 416 * attempt to use it - with largely undefined (and often hard 417 * to debug) behavior on the guest side. 418 */ 419 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 420 kvm_inject_gp(vcpu, 0); 421 else { 422 vcpu->arch.cr3 = cr3; 423 vcpu->arch.mmu.new_cr3(vcpu); 424 } 425 } 426 EXPORT_SYMBOL_GPL(kvm_set_cr3); 427 428 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 429 { 430 if (cr8 & CR8_RESERVED_BITS) { 431 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 432 kvm_inject_gp(vcpu, 0); 433 return; 434 } 435 if (irqchip_in_kernel(vcpu->kvm)) 436 kvm_lapic_set_tpr(vcpu, cr8); 437 else 438 vcpu->arch.cr8 = cr8; 439 } 440 EXPORT_SYMBOL_GPL(kvm_set_cr8); 441 442 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 443 { 444 if (irqchip_in_kernel(vcpu->kvm)) 445 return kvm_lapic_get_cr8(vcpu); 446 else 447 return vcpu->arch.cr8; 448 } 449 EXPORT_SYMBOL_GPL(kvm_get_cr8); 450 451 static inline u32 bit(int bitno) 452 { 453 return 1 << (bitno & 31); 454 } 455 456 /* 457 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 458 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 459 * 460 * This list is modified at module load time to reflect the 461 * capabilities of the host cpu. 
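 *
 * (kvm_init_msr_list(), further down in this file, probes each entry
 * with rdmsr_safe() and silently drops anything the host CPU cannot
 * read, so only host-supported MSRs are ever reported to userspace.)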
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	static int version;
	struct pvclock_wall_clock wc;
	struct timespec now, sys, boot;

	if (!wall_clock)
		return;

	version++;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_write_guest_time below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
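	 *
	 * A rough worked example (the numbers are illustrative only): if
	 * current_kernel_time() reads 1,000,000s of wall time and the
	 * monotonic clock reads 400s of uptime, the boot timestamp
	 * written below is 1,000,000 - 400 = 999,600s.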
	 */
	now = current_kernel_time();
	ktime_get_ts(&sys);
	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));

	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}

static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	uint32_t quotient, remainder;

	/* Don't try to replace with do_div(), this one calculates
	 * "(dividend << 32) / divisor" */
	__asm__ ( "divl %4"
		  : "=a" (quotient), "=d" (remainder)
		  : "0" (0), "1" (dividend), "r" (divisor) );
	return quotient;
}

/*
 * Compute the fixed-point scale the guest uses to convert TSC deltas to
 * nanoseconds: the two loops normalise ticks-per-second into the range
 * (10^9, 2 * 10^9], recording the net power of two in tsc_shift, and
 * tsc_to_system_mul then holds (10^9 << 32) / tps32.
 */
static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
	uint64_t nsecs = 1000000000LL;
	int32_t  shift = 0;
	uint64_t tps64;
	uint32_t tps32;

	tps64 = tsc_khz * 1000LL;
	while (tps64 > nsecs*2) {
		tps64 >>= 1;
		shift--;
	}

	tps32 = (uint32_t)tps64;
	while (tps32 <= (uint32_t)nsecs) {
		tps32 <<= 1;
		shift++;
	}

	hv_clock->tsc_shift = shift;
	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);

	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
		 __func__, tsc_khz, hv_clock->tsc_shift,
		 hv_clock->tsc_to_system_mul);
}

static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);

static void kvm_write_guest_time(struct kvm_vcpu *v)
{
	struct timespec ts;
	unsigned long flags;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;
	unsigned long this_tsc_khz;

	if (!vcpu->time_page)
		return;

	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
		vcpu->hv_clock_tsc_khz = this_tsc_khz;
	}
	put_cpu_var(cpu_tsc_khz);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
		    &vcpu->hv_clock.tsc_timestamp);
	ktime_get_ts(&ts);
	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	vcpu->hv_clock.system_time = ts.tv_nsec +
				     (NSEC_PER_SEC * (u64)ts.tv_sec);
	/*
	 * The interface expects us to write an even number signaling that the
	 * update is finished. Since the guest won't see the intermediate
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;

	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
	       sizeof(vcpu->hv_clock));

	kunmap_atomic(shared_kaddr, KM_USER0);

	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}

static int kvm_request_guest_time_update(struct kvm_vcpu *v)
{
	struct kvm_vcpu_arch *vcpu = &v->arch;

	if (!vcpu->time_page)
		return 0;
	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
	return 1;
}

static bool msr_mtrr_valid(unsigned msr)
{
	switch (msr) {
	case 0x200 ...
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 687 case MSR_MTRRfix64K_00000: 688 case MSR_MTRRfix16K_80000: 689 case MSR_MTRRfix16K_A0000: 690 case MSR_MTRRfix4K_C0000: 691 case MSR_MTRRfix4K_C8000: 692 case MSR_MTRRfix4K_D0000: 693 case MSR_MTRRfix4K_D8000: 694 case MSR_MTRRfix4K_E0000: 695 case MSR_MTRRfix4K_E8000: 696 case MSR_MTRRfix4K_F0000: 697 case MSR_MTRRfix4K_F8000: 698 case MSR_MTRRdefType: 699 case MSR_IA32_CR_PAT: 700 return true; 701 case 0x2f8: 702 return true; 703 } 704 return false; 705 } 706 707 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 708 { 709 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 710 711 if (!msr_mtrr_valid(msr)) 712 return 1; 713 714 if (msr == MSR_MTRRdefType) { 715 vcpu->arch.mtrr_state.def_type = data; 716 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 717 } else if (msr == MSR_MTRRfix64K_00000) 718 p[0] = data; 719 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 720 p[1 + msr - MSR_MTRRfix16K_80000] = data; 721 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 722 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 723 else if (msr == MSR_IA32_CR_PAT) 724 vcpu->arch.pat = data; 725 else { /* Variable MTRRs */ 726 int idx, is_mtrr_mask; 727 u64 *pt; 728 729 idx = (msr - 0x200) / 2; 730 is_mtrr_mask = msr - 0x200 - 2 * idx; 731 if (!is_mtrr_mask) 732 pt = 733 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 734 else 735 pt = 736 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 737 *pt = data; 738 } 739 740 kvm_mmu_reset_context(vcpu); 741 return 0; 742 } 743 744 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 745 { 746 switch (msr) { 747 case MSR_EFER: 748 set_efer(vcpu, data); 749 break; 750 case MSR_IA32_MC0_STATUS: 751 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 752 __func__, data); 753 break; 754 case MSR_IA32_MCG_STATUS: 755 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 756 __func__, data); 757 break; 758 case MSR_IA32_MCG_CTL: 759 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 760 __func__, data); 761 break; 762 case MSR_IA32_DEBUGCTLMSR: 763 if (!data) { 764 /* We support the non-activated case already */ 765 break; 766 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 767 /* Values other than LBR and BTF are vendor-specific, 768 thus reserved and should throw a #GP */ 769 return 1; 770 } 771 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 772 __func__, data); 773 break; 774 case MSR_IA32_UCODE_REV: 775 case MSR_IA32_UCODE_WRITE: 776 case MSR_VM_HSAVE_PA: 777 break; 778 case 0x200 ... 0x2ff: 779 return set_msr_mtrr(vcpu, msr, data); 780 case MSR_IA32_APICBASE: 781 kvm_set_apic_base(vcpu, data); 782 break; 783 case MSR_IA32_MISC_ENABLE: 784 vcpu->arch.ia32_misc_enable_msr = data; 785 break; 786 case MSR_KVM_WALL_CLOCK: 787 vcpu->kvm->arch.wall_clock = data; 788 kvm_write_wall_clock(vcpu->kvm, data); 789 break; 790 case MSR_KVM_SYSTEM_TIME: { 791 if (vcpu->arch.time_page) { 792 kvm_release_page_dirty(vcpu->arch.time_page); 793 vcpu->arch.time_page = NULL; 794 } 795 796 vcpu->arch.time = data; 797 798 /* we verify if the enable bit is set... 
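		 * Roughly: bit 0 of the MSR value enables the per-vcpu time
		 * page, bits 1..PAGE_SHIFT-1 give the offset of the pvclock
		 * structure inside that page (masked off below), and the
		 * upper bits select the guest page frame passed to
		 * gfn_to_page().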
*/ 799 if (!(data & 1)) 800 break; 801 802 /* ...but clean it before doing the actual write */ 803 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 804 805 vcpu->arch.time_page = 806 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 807 808 if (is_error_page(vcpu->arch.time_page)) { 809 kvm_release_page_clean(vcpu->arch.time_page); 810 vcpu->arch.time_page = NULL; 811 } 812 813 kvm_request_guest_time_update(vcpu); 814 break; 815 } 816 default: 817 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 818 return 1; 819 } 820 return 0; 821 } 822 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 823 824 825 /* 826 * Reads an msr value (of 'msr_index') into 'pdata'. 827 * Returns 0 on success, non-0 otherwise. 828 * Assumes vcpu_load() was already called. 829 */ 830 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 831 { 832 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 833 } 834 835 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 836 { 837 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 838 839 if (!msr_mtrr_valid(msr)) 840 return 1; 841 842 if (msr == MSR_MTRRdefType) 843 *pdata = vcpu->arch.mtrr_state.def_type + 844 (vcpu->arch.mtrr_state.enabled << 10); 845 else if (msr == MSR_MTRRfix64K_00000) 846 *pdata = p[0]; 847 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 848 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 849 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 850 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 851 else if (msr == MSR_IA32_CR_PAT) 852 *pdata = vcpu->arch.pat; 853 else { /* Variable MTRRs */ 854 int idx, is_mtrr_mask; 855 u64 *pt; 856 857 idx = (msr - 0x200) / 2; 858 is_mtrr_mask = msr - 0x200 - 2 * idx; 859 if (!is_mtrr_mask) 860 pt = 861 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 862 else 863 pt = 864 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 865 *pdata = *pt; 866 } 867 868 return 0; 869 } 870 871 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 872 { 873 u64 data; 874 875 switch (msr) { 876 case 0xc0010010: /* SYSCFG */ 877 case 0xc0010015: /* HWCR */ 878 case MSR_IA32_PLATFORM_ID: 879 case MSR_IA32_P5_MC_ADDR: 880 case MSR_IA32_P5_MC_TYPE: 881 case MSR_IA32_MC0_CTL: 882 case MSR_IA32_MCG_STATUS: 883 case MSR_IA32_MCG_CAP: 884 case MSR_IA32_MCG_CTL: 885 case MSR_IA32_MC0_MISC: 886 case MSR_IA32_MC0_MISC+4: 887 case MSR_IA32_MC0_MISC+8: 888 case MSR_IA32_MC0_MISC+12: 889 case MSR_IA32_MC0_MISC+16: 890 case MSR_IA32_MC0_MISC+20: 891 case MSR_IA32_UCODE_REV: 892 case MSR_IA32_EBL_CR_POWERON: 893 case MSR_IA32_DEBUGCTLMSR: 894 case MSR_IA32_LASTBRANCHFROMIP: 895 case MSR_IA32_LASTBRANCHTOIP: 896 case MSR_IA32_LASTINTFROMIP: 897 case MSR_IA32_LASTINTTOIP: 898 case MSR_VM_HSAVE_PA: 899 case MSR_P6_EVNTSEL0: 900 case MSR_P6_EVNTSEL1: 901 data = 0; 902 break; 903 case MSR_MTRRcap: 904 data = 0x500 | KVM_NR_VAR_MTRR; 905 break; 906 case 0x200 ... 
0x2ff: 907 return get_msr_mtrr(vcpu, msr, pdata); 908 case 0xcd: /* fsb frequency */ 909 data = 3; 910 break; 911 case MSR_IA32_APICBASE: 912 data = kvm_get_apic_base(vcpu); 913 break; 914 case MSR_IA32_MISC_ENABLE: 915 data = vcpu->arch.ia32_misc_enable_msr; 916 break; 917 case MSR_IA32_PERF_STATUS: 918 /* TSC increment by tick */ 919 data = 1000ULL; 920 /* CPU multiplier */ 921 data |= (((uint64_t)4ULL) << 40); 922 break; 923 case MSR_EFER: 924 data = vcpu->arch.shadow_efer; 925 break; 926 case MSR_KVM_WALL_CLOCK: 927 data = vcpu->kvm->arch.wall_clock; 928 break; 929 case MSR_KVM_SYSTEM_TIME: 930 data = vcpu->arch.time; 931 break; 932 default: 933 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 934 return 1; 935 } 936 *pdata = data; 937 return 0; 938 } 939 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 940 941 /* 942 * Read or write a bunch of msrs. All parameters are kernel addresses. 943 * 944 * @return number of msrs set successfully. 945 */ 946 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 947 struct kvm_msr_entry *entries, 948 int (*do_msr)(struct kvm_vcpu *vcpu, 949 unsigned index, u64 *data)) 950 { 951 int i; 952 953 vcpu_load(vcpu); 954 955 down_read(&vcpu->kvm->slots_lock); 956 for (i = 0; i < msrs->nmsrs; ++i) 957 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 958 break; 959 up_read(&vcpu->kvm->slots_lock); 960 961 vcpu_put(vcpu); 962 963 return i; 964 } 965 966 /* 967 * Read or write a bunch of msrs. Parameters are user addresses. 968 * 969 * @return number of msrs set successfully. 970 */ 971 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 972 int (*do_msr)(struct kvm_vcpu *vcpu, 973 unsigned index, u64 *data), 974 int writeback) 975 { 976 struct kvm_msrs msrs; 977 struct kvm_msr_entry *entries; 978 int r, n; 979 unsigned size; 980 981 r = -EFAULT; 982 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 983 goto out; 984 985 r = -E2BIG; 986 if (msrs.nmsrs >= MAX_IO_MSRS) 987 goto out; 988 989 r = -ENOMEM; 990 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 991 entries = vmalloc(size); 992 if (!entries) 993 goto out; 994 995 r = -EFAULT; 996 if (copy_from_user(entries, user_msrs->entries, size)) 997 goto out_free; 998 999 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1000 if (r < 0) 1001 goto out_free; 1002 1003 r = -EFAULT; 1004 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1005 goto out_free; 1006 1007 r = n; 1008 1009 out_free: 1010 vfree(entries); 1011 out: 1012 return r; 1013 } 1014 1015 int kvm_dev_ioctl_check_extension(long ext) 1016 { 1017 int r; 1018 1019 switch (ext) { 1020 case KVM_CAP_IRQCHIP: 1021 case KVM_CAP_HLT: 1022 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1023 case KVM_CAP_SET_TSS_ADDR: 1024 case KVM_CAP_EXT_CPUID: 1025 case KVM_CAP_CLOCKSOURCE: 1026 case KVM_CAP_PIT: 1027 case KVM_CAP_NOP_IO_DELAY: 1028 case KVM_CAP_MP_STATE: 1029 case KVM_CAP_SYNC_MMU: 1030 case KVM_CAP_REINJECT_CONTROL: 1031 case KVM_CAP_IRQ_INJECT_STATUS: 1032 case KVM_CAP_ASSIGN_DEV_IRQ: 1033 r = 1; 1034 break; 1035 case KVM_CAP_COALESCED_MMIO: 1036 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1037 break; 1038 case KVM_CAP_VAPIC: 1039 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1040 break; 1041 case KVM_CAP_NR_VCPUS: 1042 r = KVM_MAX_VCPUS; 1043 break; 1044 case KVM_CAP_NR_MEMSLOTS: 1045 r = KVM_MEMORY_SLOTS; 1046 break; 1047 case KVM_CAP_PV_MMU: 1048 r = !tdp_enabled; 1049 break; 1050 case KVM_CAP_IOMMU: 1051 r = iommu_found(); 1052 break; 1053 default: 1054 r = 0; 1055 break; 1056 } 1057 return r; 1058 1059 } 1060 1061 long 
kvm_arch_dev_ioctl(struct file *filp, 1062 unsigned int ioctl, unsigned long arg) 1063 { 1064 void __user *argp = (void __user *)arg; 1065 long r; 1066 1067 switch (ioctl) { 1068 case KVM_GET_MSR_INDEX_LIST: { 1069 struct kvm_msr_list __user *user_msr_list = argp; 1070 struct kvm_msr_list msr_list; 1071 unsigned n; 1072 1073 r = -EFAULT; 1074 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1075 goto out; 1076 n = msr_list.nmsrs; 1077 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1078 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1079 goto out; 1080 r = -E2BIG; 1081 if (n < num_msrs_to_save) 1082 goto out; 1083 r = -EFAULT; 1084 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1085 num_msrs_to_save * sizeof(u32))) 1086 goto out; 1087 if (copy_to_user(user_msr_list->indices 1088 + num_msrs_to_save * sizeof(u32), 1089 &emulated_msrs, 1090 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1091 goto out; 1092 r = 0; 1093 break; 1094 } 1095 case KVM_GET_SUPPORTED_CPUID: { 1096 struct kvm_cpuid2 __user *cpuid_arg = argp; 1097 struct kvm_cpuid2 cpuid; 1098 1099 r = -EFAULT; 1100 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1101 goto out; 1102 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1103 cpuid_arg->entries); 1104 if (r) 1105 goto out; 1106 1107 r = -EFAULT; 1108 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1109 goto out; 1110 r = 0; 1111 break; 1112 } 1113 default: 1114 r = -EINVAL; 1115 } 1116 out: 1117 return r; 1118 } 1119 1120 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1121 { 1122 kvm_x86_ops->vcpu_load(vcpu, cpu); 1123 kvm_request_guest_time_update(vcpu); 1124 } 1125 1126 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1127 { 1128 kvm_x86_ops->vcpu_put(vcpu); 1129 kvm_put_guest_fpu(vcpu); 1130 } 1131 1132 static int is_efer_nx(void) 1133 { 1134 unsigned long long efer = 0; 1135 1136 rdmsrl_safe(MSR_EFER, &efer); 1137 return efer & EFER_NX; 1138 } 1139 1140 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1141 { 1142 int i; 1143 struct kvm_cpuid_entry2 *e, *entry; 1144 1145 entry = NULL; 1146 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1147 e = &vcpu->arch.cpuid_entries[i]; 1148 if (e->function == 0x80000001) { 1149 entry = e; 1150 break; 1151 } 1152 } 1153 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1154 entry->edx &= ~(1 << 20); 1155 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1156 } 1157 } 1158 1159 /* when an old userspace process fills a new kernel module */ 1160 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1161 struct kvm_cpuid *cpuid, 1162 struct kvm_cpuid_entry __user *entries) 1163 { 1164 int r, i; 1165 struct kvm_cpuid_entry *cpuid_entries; 1166 1167 r = -E2BIG; 1168 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1169 goto out; 1170 r = -ENOMEM; 1171 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1172 if (!cpuid_entries) 1173 goto out; 1174 r = -EFAULT; 1175 if (copy_from_user(cpuid_entries, entries, 1176 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1177 goto out_free; 1178 for (i = 0; i < cpuid->nent; i++) { 1179 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1180 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1181 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1182 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1183 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1184 vcpu->arch.cpuid_entries[i].index = 0; 1185 vcpu->arch.cpuid_entries[i].flags = 0; 1186 
vcpu->arch.cpuid_entries[i].padding[0] = 0; 1187 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1188 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1189 } 1190 vcpu->arch.cpuid_nent = cpuid->nent; 1191 cpuid_fix_nx_cap(vcpu); 1192 r = 0; 1193 1194 out_free: 1195 vfree(cpuid_entries); 1196 out: 1197 return r; 1198 } 1199 1200 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1201 struct kvm_cpuid2 *cpuid, 1202 struct kvm_cpuid_entry2 __user *entries) 1203 { 1204 int r; 1205 1206 r = -E2BIG; 1207 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1208 goto out; 1209 r = -EFAULT; 1210 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1211 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1212 goto out; 1213 vcpu->arch.cpuid_nent = cpuid->nent; 1214 return 0; 1215 1216 out: 1217 return r; 1218 } 1219 1220 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1221 struct kvm_cpuid2 *cpuid, 1222 struct kvm_cpuid_entry2 __user *entries) 1223 { 1224 int r; 1225 1226 r = -E2BIG; 1227 if (cpuid->nent < vcpu->arch.cpuid_nent) 1228 goto out; 1229 r = -EFAULT; 1230 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1231 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1232 goto out; 1233 return 0; 1234 1235 out: 1236 cpuid->nent = vcpu->arch.cpuid_nent; 1237 return r; 1238 } 1239 1240 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1241 u32 index) 1242 { 1243 entry->function = function; 1244 entry->index = index; 1245 cpuid_count(entry->function, entry->index, 1246 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1247 entry->flags = 0; 1248 } 1249 1250 #define F(x) bit(X86_FEATURE_##x) 1251 1252 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1253 u32 index, int *nent, int maxnent) 1254 { 1255 unsigned f_nx = is_efer_nx() ? 
F(NX) : 0;
#ifdef CONFIG_X86_64
	unsigned f_lm = F(LM);
#else
	unsigned f_lm = 0;
#endif

	/* cpuid 1.edx */
	const u32 kvm_supported_word0_x86_features =
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
		0 /* Reserved, DS, ACPI */ | F(MMX) |
		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
		0 /* HTT, TM, Reserved, PBE */;
	/* cpuid 0x80000001.edx */
	const u32 kvm_supported_word1_x86_features =
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* Reserved */ |
		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
		F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
	/* cpuid 1.ecx */
	const u32 kvm_supported_word4_x86_features =
		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
		0 /* DS-CPL, VMX, SMX, EST */ |
		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
		0 /* Reserved, DCA */ | F(XMM4_1) |
		F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
		0 /* Reserved, XSAVE, OSXSAVE */;
	/* cpuid 0x80000001.ecx */
	const u32 kvm_supported_word6_x86_features =
		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
		0 /* SKINIT */ | 0 /* WDT */;

	/* all calls to cpuid_count() should be made on the same cpu */
	get_cpu();
	do_cpuid_1_ent(entry, function, index);
	++*nent;

	switch (function) {
	case 0:
		entry->eax = min(entry->eax, (u32)0xb);
		break;
	case 1:
		entry->edx &= kvm_supported_word0_x86_features;
		entry->ecx &= kvm_supported_word4_x86_features;
		break;
	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
	 * may return different values. This forces us to get_cpu() before
	 * issuing the first command, and also to emulate this annoying behavior
	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
	case 2: {
		int t, times = entry->eax & 0xff;

		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
		for (t = 1; t < times && *nent < maxnent; ++t) {
			do_cpuid_1_ent(&entry[t], function, 0);
			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
			++*nent;
		}
		break;
	}
	/* function 4 and 0xb have additional index.
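	 * For leaf 4 the loop below keeps generating sub-leaves until the
	 * cache-type field (EAX[4:0]) reads zero; for leaf 0xb it stops once
	 * the level-type field (ECX[15:8]) reads zero.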
*/ 1328 case 4: { 1329 int i, cache_type; 1330 1331 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1332 /* read more entries until cache_type is zero */ 1333 for (i = 1; *nent < maxnent; ++i) { 1334 cache_type = entry[i - 1].eax & 0x1f; 1335 if (!cache_type) 1336 break; 1337 do_cpuid_1_ent(&entry[i], function, i); 1338 entry[i].flags |= 1339 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1340 ++*nent; 1341 } 1342 break; 1343 } 1344 case 0xb: { 1345 int i, level_type; 1346 1347 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1348 /* read more entries until level_type is zero */ 1349 for (i = 1; *nent < maxnent; ++i) { 1350 level_type = entry[i - 1].ecx & 0xff00; 1351 if (!level_type) 1352 break; 1353 do_cpuid_1_ent(&entry[i], function, i); 1354 entry[i].flags |= 1355 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1356 ++*nent; 1357 } 1358 break; 1359 } 1360 case 0x80000000: 1361 entry->eax = min(entry->eax, 0x8000001a); 1362 break; 1363 case 0x80000001: 1364 entry->edx &= kvm_supported_word1_x86_features; 1365 entry->ecx &= kvm_supported_word6_x86_features; 1366 break; 1367 } 1368 put_cpu(); 1369 } 1370 1371 #undef F 1372 1373 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1374 struct kvm_cpuid_entry2 __user *entries) 1375 { 1376 struct kvm_cpuid_entry2 *cpuid_entries; 1377 int limit, nent = 0, r = -E2BIG; 1378 u32 func; 1379 1380 if (cpuid->nent < 1) 1381 goto out; 1382 r = -ENOMEM; 1383 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1384 if (!cpuid_entries) 1385 goto out; 1386 1387 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1388 limit = cpuid_entries[0].eax; 1389 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1390 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1391 &nent, cpuid->nent); 1392 r = -E2BIG; 1393 if (nent >= cpuid->nent) 1394 goto out_free; 1395 1396 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1397 limit = cpuid_entries[nent - 1].eax; 1398 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1399 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1400 &nent, cpuid->nent); 1401 r = -EFAULT; 1402 if (copy_to_user(entries, cpuid_entries, 1403 nent * sizeof(struct kvm_cpuid_entry2))) 1404 goto out_free; 1405 cpuid->nent = nent; 1406 r = 0; 1407 1408 out_free: 1409 vfree(cpuid_entries); 1410 out: 1411 return r; 1412 } 1413 1414 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1415 struct kvm_lapic_state *s) 1416 { 1417 vcpu_load(vcpu); 1418 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1419 vcpu_put(vcpu); 1420 1421 return 0; 1422 } 1423 1424 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1425 struct kvm_lapic_state *s) 1426 { 1427 vcpu_load(vcpu); 1428 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1429 kvm_apic_post_state_restore(vcpu); 1430 vcpu_put(vcpu); 1431 1432 return 0; 1433 } 1434 1435 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1436 struct kvm_interrupt *irq) 1437 { 1438 if (irq->irq < 0 || irq->irq >= 256) 1439 return -EINVAL; 1440 if (irqchip_in_kernel(vcpu->kvm)) 1441 return -ENXIO; 1442 vcpu_load(vcpu); 1443 1444 kvm_queue_interrupt(vcpu, irq->irq, false); 1445 1446 vcpu_put(vcpu); 1447 1448 return 0; 1449 } 1450 1451 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1452 { 1453 vcpu_load(vcpu); 1454 kvm_inject_nmi(vcpu); 1455 vcpu_put(vcpu); 1456 1457 return 0; 1458 } 1459 1460 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1461 struct kvm_tpr_access_ctl *tac) 1462 { 1463 if (tac->flags) 1464 return -EINVAL; 1465 
vcpu->arch.tpr_access_reporting = !!tac->enabled; 1466 return 0; 1467 } 1468 1469 long kvm_arch_vcpu_ioctl(struct file *filp, 1470 unsigned int ioctl, unsigned long arg) 1471 { 1472 struct kvm_vcpu *vcpu = filp->private_data; 1473 void __user *argp = (void __user *)arg; 1474 int r; 1475 struct kvm_lapic_state *lapic = NULL; 1476 1477 switch (ioctl) { 1478 case KVM_GET_LAPIC: { 1479 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1480 1481 r = -ENOMEM; 1482 if (!lapic) 1483 goto out; 1484 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1485 if (r) 1486 goto out; 1487 r = -EFAULT; 1488 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1489 goto out; 1490 r = 0; 1491 break; 1492 } 1493 case KVM_SET_LAPIC: { 1494 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1495 r = -ENOMEM; 1496 if (!lapic) 1497 goto out; 1498 r = -EFAULT; 1499 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1500 goto out; 1501 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1502 if (r) 1503 goto out; 1504 r = 0; 1505 break; 1506 } 1507 case KVM_INTERRUPT: { 1508 struct kvm_interrupt irq; 1509 1510 r = -EFAULT; 1511 if (copy_from_user(&irq, argp, sizeof irq)) 1512 goto out; 1513 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1514 if (r) 1515 goto out; 1516 r = 0; 1517 break; 1518 } 1519 case KVM_NMI: { 1520 r = kvm_vcpu_ioctl_nmi(vcpu); 1521 if (r) 1522 goto out; 1523 r = 0; 1524 break; 1525 } 1526 case KVM_SET_CPUID: { 1527 struct kvm_cpuid __user *cpuid_arg = argp; 1528 struct kvm_cpuid cpuid; 1529 1530 r = -EFAULT; 1531 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1532 goto out; 1533 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1534 if (r) 1535 goto out; 1536 break; 1537 } 1538 case KVM_SET_CPUID2: { 1539 struct kvm_cpuid2 __user *cpuid_arg = argp; 1540 struct kvm_cpuid2 cpuid; 1541 1542 r = -EFAULT; 1543 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1544 goto out; 1545 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1546 cpuid_arg->entries); 1547 if (r) 1548 goto out; 1549 break; 1550 } 1551 case KVM_GET_CPUID2: { 1552 struct kvm_cpuid2 __user *cpuid_arg = argp; 1553 struct kvm_cpuid2 cpuid; 1554 1555 r = -EFAULT; 1556 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1557 goto out; 1558 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1559 cpuid_arg->entries); 1560 if (r) 1561 goto out; 1562 r = -EFAULT; 1563 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1564 goto out; 1565 r = 0; 1566 break; 1567 } 1568 case KVM_GET_MSRS: 1569 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1570 break; 1571 case KVM_SET_MSRS: 1572 r = msr_io(vcpu, argp, do_set_msr, 0); 1573 break; 1574 case KVM_TPR_ACCESS_REPORTING: { 1575 struct kvm_tpr_access_ctl tac; 1576 1577 r = -EFAULT; 1578 if (copy_from_user(&tac, argp, sizeof tac)) 1579 goto out; 1580 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1581 if (r) 1582 goto out; 1583 r = -EFAULT; 1584 if (copy_to_user(argp, &tac, sizeof tac)) 1585 goto out; 1586 r = 0; 1587 break; 1588 }; 1589 case KVM_SET_VAPIC_ADDR: { 1590 struct kvm_vapic_addr va; 1591 1592 r = -EINVAL; 1593 if (!irqchip_in_kernel(vcpu->kvm)) 1594 goto out; 1595 r = -EFAULT; 1596 if (copy_from_user(&va, argp, sizeof va)) 1597 goto out; 1598 r = 0; 1599 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1600 break; 1601 } 1602 default: 1603 r = -EINVAL; 1604 } 1605 out: 1606 kfree(lapic); 1607 return r; 1608 } 1609 1610 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1611 { 1612 int ret; 1613 1614 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 
1615 return -1; 1616 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1617 return ret; 1618 } 1619 1620 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1621 u32 kvm_nr_mmu_pages) 1622 { 1623 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1624 return -EINVAL; 1625 1626 down_write(&kvm->slots_lock); 1627 spin_lock(&kvm->mmu_lock); 1628 1629 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1630 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1631 1632 spin_unlock(&kvm->mmu_lock); 1633 up_write(&kvm->slots_lock); 1634 return 0; 1635 } 1636 1637 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1638 { 1639 return kvm->arch.n_alloc_mmu_pages; 1640 } 1641 1642 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1643 { 1644 int i; 1645 struct kvm_mem_alias *alias; 1646 1647 for (i = 0; i < kvm->arch.naliases; ++i) { 1648 alias = &kvm->arch.aliases[i]; 1649 if (gfn >= alias->base_gfn 1650 && gfn < alias->base_gfn + alias->npages) 1651 return alias->target_gfn + gfn - alias->base_gfn; 1652 } 1653 return gfn; 1654 } 1655 1656 /* 1657 * Set a new alias region. Aliases map a portion of physical memory into 1658 * another portion. This is useful for memory windows, for example the PC 1659 * VGA region. 1660 */ 1661 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1662 struct kvm_memory_alias *alias) 1663 { 1664 int r, n; 1665 struct kvm_mem_alias *p; 1666 1667 r = -EINVAL; 1668 /* General sanity checks */ 1669 if (alias->memory_size & (PAGE_SIZE - 1)) 1670 goto out; 1671 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1672 goto out; 1673 if (alias->slot >= KVM_ALIAS_SLOTS) 1674 goto out; 1675 if (alias->guest_phys_addr + alias->memory_size 1676 < alias->guest_phys_addr) 1677 goto out; 1678 if (alias->target_phys_addr + alias->memory_size 1679 < alias->target_phys_addr) 1680 goto out; 1681 1682 down_write(&kvm->slots_lock); 1683 spin_lock(&kvm->mmu_lock); 1684 1685 p = &kvm->arch.aliases[alias->slot]; 1686 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1687 p->npages = alias->memory_size >> PAGE_SHIFT; 1688 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1689 1690 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1691 if (kvm->arch.aliases[n - 1].npages) 1692 break; 1693 kvm->arch.naliases = n; 1694 1695 spin_unlock(&kvm->mmu_lock); 1696 kvm_mmu_zap_all(kvm); 1697 1698 up_write(&kvm->slots_lock); 1699 1700 return 0; 1701 1702 out: 1703 return r; 1704 } 1705 1706 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1707 { 1708 int r; 1709 1710 r = 0; 1711 switch (chip->chip_id) { 1712 case KVM_IRQCHIP_PIC_MASTER: 1713 memcpy(&chip->chip.pic, 1714 &pic_irqchip(kvm)->pics[0], 1715 sizeof(struct kvm_pic_state)); 1716 break; 1717 case KVM_IRQCHIP_PIC_SLAVE: 1718 memcpy(&chip->chip.pic, 1719 &pic_irqchip(kvm)->pics[1], 1720 sizeof(struct kvm_pic_state)); 1721 break; 1722 case KVM_IRQCHIP_IOAPIC: 1723 memcpy(&chip->chip.ioapic, 1724 ioapic_irqchip(kvm), 1725 sizeof(struct kvm_ioapic_state)); 1726 break; 1727 default: 1728 r = -EINVAL; 1729 break; 1730 } 1731 return r; 1732 } 1733 1734 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1735 { 1736 int r; 1737 1738 r = 0; 1739 switch (chip->chip_id) { 1740 case KVM_IRQCHIP_PIC_MASTER: 1741 memcpy(&pic_irqchip(kvm)->pics[0], 1742 &chip->chip.pic, 1743 sizeof(struct kvm_pic_state)); 1744 break; 1745 case KVM_IRQCHIP_PIC_SLAVE: 1746 memcpy(&pic_irqchip(kvm)->pics[1], 1747 &chip->chip.pic, 1748 sizeof(struct kvm_pic_state)); 1749 break; 1750 case KVM_IRQCHIP_IOAPIC: 1751 
memcpy(ioapic_irqchip(kvm), 1752 &chip->chip.ioapic, 1753 sizeof(struct kvm_ioapic_state)); 1754 break; 1755 default: 1756 r = -EINVAL; 1757 break; 1758 } 1759 kvm_pic_update_irq(pic_irqchip(kvm)); 1760 return r; 1761 } 1762 1763 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1764 { 1765 int r = 0; 1766 1767 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1768 return r; 1769 } 1770 1771 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1772 { 1773 int r = 0; 1774 1775 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1776 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1777 return r; 1778 } 1779 1780 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 1781 struct kvm_reinject_control *control) 1782 { 1783 if (!kvm->arch.vpit) 1784 return -ENXIO; 1785 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 1786 return 0; 1787 } 1788 1789 /* 1790 * Get (and clear) the dirty memory log for a memory slot. 1791 */ 1792 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1793 struct kvm_dirty_log *log) 1794 { 1795 int r; 1796 int n; 1797 struct kvm_memory_slot *memslot; 1798 int is_dirty = 0; 1799 1800 down_write(&kvm->slots_lock); 1801 1802 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1803 if (r) 1804 goto out; 1805 1806 /* If nothing is dirty, don't bother messing with page tables. */ 1807 if (is_dirty) { 1808 spin_lock(&kvm->mmu_lock); 1809 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1810 spin_unlock(&kvm->mmu_lock); 1811 kvm_flush_remote_tlbs(kvm); 1812 memslot = &kvm->memslots[log->slot]; 1813 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1814 memset(memslot->dirty_bitmap, 0, n); 1815 } 1816 r = 0; 1817 out: 1818 up_write(&kvm->slots_lock); 1819 return r; 1820 } 1821 1822 long kvm_arch_vm_ioctl(struct file *filp, 1823 unsigned int ioctl, unsigned long arg) 1824 { 1825 struct kvm *kvm = filp->private_data; 1826 void __user *argp = (void __user *)arg; 1827 int r = -EINVAL; 1828 /* 1829 * This union makes it completely explicit to gcc-3.x 1830 * that these two variables' stack usage should be 1831 * combined, not added together. 
1832 */ 1833 union { 1834 struct kvm_pit_state ps; 1835 struct kvm_memory_alias alias; 1836 } u; 1837 1838 switch (ioctl) { 1839 case KVM_SET_TSS_ADDR: 1840 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1841 if (r < 0) 1842 goto out; 1843 break; 1844 case KVM_SET_MEMORY_REGION: { 1845 struct kvm_memory_region kvm_mem; 1846 struct kvm_userspace_memory_region kvm_userspace_mem; 1847 1848 r = -EFAULT; 1849 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1850 goto out; 1851 kvm_userspace_mem.slot = kvm_mem.slot; 1852 kvm_userspace_mem.flags = kvm_mem.flags; 1853 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1854 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1855 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1856 if (r) 1857 goto out; 1858 break; 1859 } 1860 case KVM_SET_NR_MMU_PAGES: 1861 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1862 if (r) 1863 goto out; 1864 break; 1865 case KVM_GET_NR_MMU_PAGES: 1866 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1867 break; 1868 case KVM_SET_MEMORY_ALIAS: 1869 r = -EFAULT; 1870 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 1871 goto out; 1872 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 1873 if (r) 1874 goto out; 1875 break; 1876 case KVM_CREATE_IRQCHIP: 1877 r = -ENOMEM; 1878 kvm->arch.vpic = kvm_create_pic(kvm); 1879 if (kvm->arch.vpic) { 1880 r = kvm_ioapic_init(kvm); 1881 if (r) { 1882 kfree(kvm->arch.vpic); 1883 kvm->arch.vpic = NULL; 1884 goto out; 1885 } 1886 } else 1887 goto out; 1888 r = kvm_setup_default_irq_routing(kvm); 1889 if (r) { 1890 kfree(kvm->arch.vpic); 1891 kfree(kvm->arch.vioapic); 1892 goto out; 1893 } 1894 break; 1895 case KVM_CREATE_PIT: 1896 mutex_lock(&kvm->lock); 1897 r = -EEXIST; 1898 if (kvm->arch.vpit) 1899 goto create_pit_unlock; 1900 r = -ENOMEM; 1901 kvm->arch.vpit = kvm_create_pit(kvm); 1902 if (kvm->arch.vpit) 1903 r = 0; 1904 create_pit_unlock: 1905 mutex_unlock(&kvm->lock); 1906 break; 1907 case KVM_IRQ_LINE_STATUS: 1908 case KVM_IRQ_LINE: { 1909 struct kvm_irq_level irq_event; 1910 1911 r = -EFAULT; 1912 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1913 goto out; 1914 if (irqchip_in_kernel(kvm)) { 1915 __s32 status; 1916 mutex_lock(&kvm->lock); 1917 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1918 irq_event.irq, irq_event.level); 1919 mutex_unlock(&kvm->lock); 1920 if (ioctl == KVM_IRQ_LINE_STATUS) { 1921 irq_event.status = status; 1922 if (copy_to_user(argp, &irq_event, 1923 sizeof irq_event)) 1924 goto out; 1925 } 1926 r = 0; 1927 } 1928 break; 1929 } 1930 case KVM_GET_IRQCHIP: { 1931 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1932 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1933 1934 r = -ENOMEM; 1935 if (!chip) 1936 goto out; 1937 r = -EFAULT; 1938 if (copy_from_user(chip, argp, sizeof *chip)) 1939 goto get_irqchip_out; 1940 r = -ENXIO; 1941 if (!irqchip_in_kernel(kvm)) 1942 goto get_irqchip_out; 1943 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 1944 if (r) 1945 goto get_irqchip_out; 1946 r = -EFAULT; 1947 if (copy_to_user(argp, chip, sizeof *chip)) 1948 goto get_irqchip_out; 1949 r = 0; 1950 get_irqchip_out: 1951 kfree(chip); 1952 if (r) 1953 goto out; 1954 break; 1955 } 1956 case KVM_SET_IRQCHIP: { 1957 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1958 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1959 1960 r = -ENOMEM; 1961 if (!chip) 1962 goto out; 1963 r = -EFAULT; 1964 if (copy_from_user(chip, argp, sizeof *chip)) 1965 goto set_irqchip_out; 1966 r = -ENXIO; 1967 if (!irqchip_in_kernel(kvm)) 
1968 goto set_irqchip_out; 1969 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 1970 if (r) 1971 goto set_irqchip_out; 1972 r = 0; 1973 set_irqchip_out: 1974 kfree(chip); 1975 if (r) 1976 goto out; 1977 break; 1978 } 1979 case KVM_GET_PIT: { 1980 r = -EFAULT; 1981 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 1982 goto out; 1983 r = -ENXIO; 1984 if (!kvm->arch.vpit) 1985 goto out; 1986 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 1987 if (r) 1988 goto out; 1989 r = -EFAULT; 1990 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 1991 goto out; 1992 r = 0; 1993 break; 1994 } 1995 case KVM_SET_PIT: { 1996 r = -EFAULT; 1997 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 1998 goto out; 1999 r = -ENXIO; 2000 if (!kvm->arch.vpit) 2001 goto out; 2002 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 2003 if (r) 2004 goto out; 2005 r = 0; 2006 break; 2007 } 2008 case KVM_REINJECT_CONTROL: { 2009 struct kvm_reinject_control control; 2010 r = -EFAULT; 2011 if (copy_from_user(&control, argp, sizeof(control))) 2012 goto out; 2013 r = kvm_vm_ioctl_reinject(kvm, &control); 2014 if (r) 2015 goto out; 2016 r = 0; 2017 break; 2018 } 2019 default: 2020 ; 2021 } 2022 out: 2023 return r; 2024 } 2025 2026 static void kvm_init_msr_list(void) 2027 { 2028 u32 dummy[2]; 2029 unsigned i, j; 2030 2031 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2032 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2033 continue; 2034 if (j < i) 2035 msrs_to_save[j] = msrs_to_save[i]; 2036 j++; 2037 } 2038 num_msrs_to_save = j; 2039 } 2040 2041 /* 2042 * Only apic need an MMIO device hook, so shortcut now.. 2043 */ 2044 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 2045 gpa_t addr, int len, 2046 int is_write) 2047 { 2048 struct kvm_io_device *dev; 2049 2050 if (vcpu->arch.apic) { 2051 dev = &vcpu->arch.apic->dev; 2052 if (dev->in_range(dev, addr, len, is_write)) 2053 return dev; 2054 } 2055 return NULL; 2056 } 2057 2058 2059 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 2060 gpa_t addr, int len, 2061 int is_write) 2062 { 2063 struct kvm_io_device *dev; 2064 2065 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 2066 if (dev == NULL) 2067 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, 2068 is_write); 2069 return dev; 2070 } 2071 2072 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2073 struct kvm_vcpu *vcpu) 2074 { 2075 void *data = val; 2076 int r = X86EMUL_CONTINUE; 2077 2078 while (bytes) { 2079 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2080 unsigned offset = addr & (PAGE_SIZE-1); 2081 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 2082 int ret; 2083 2084 if (gpa == UNMAPPED_GVA) { 2085 r = X86EMUL_PROPAGATE_FAULT; 2086 goto out; 2087 } 2088 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 2089 if (ret < 0) { 2090 r = X86EMUL_UNHANDLEABLE; 2091 goto out; 2092 } 2093 2094 bytes -= toread; 2095 data += toread; 2096 addr += toread; 2097 } 2098 out: 2099 return r; 2100 } 2101 2102 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 2103 struct kvm_vcpu *vcpu) 2104 { 2105 void *data = val; 2106 int r = X86EMUL_CONTINUE; 2107 2108 while (bytes) { 2109 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2110 unsigned offset = addr & (PAGE_SIZE-1); 2111 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 2112 int ret; 2113 2114 if (gpa == UNMAPPED_GVA) { 2115 r = X86EMUL_PROPAGATE_FAULT; 2116 goto out; 2117 } 2118 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 
2119 if (ret < 0) { 2120 r = X86EMUL_UNHANDLEABLE; 2121 goto out; 2122 } 2123 2124 bytes -= towrite; 2125 data += towrite; 2126 addr += towrite; 2127 } 2128 out: 2129 return r; 2130 } 2131 2132 2133 static int emulator_read_emulated(unsigned long addr, 2134 void *val, 2135 unsigned int bytes, 2136 struct kvm_vcpu *vcpu) 2137 { 2138 struct kvm_io_device *mmio_dev; 2139 gpa_t gpa; 2140 2141 if (vcpu->mmio_read_completed) { 2142 memcpy(val, vcpu->mmio_data, bytes); 2143 vcpu->mmio_read_completed = 0; 2144 return X86EMUL_CONTINUE; 2145 } 2146 2147 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2148 2149 /* For APIC access vmexit */ 2150 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2151 goto mmio; 2152 2153 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 2154 == X86EMUL_CONTINUE) 2155 return X86EMUL_CONTINUE; 2156 if (gpa == UNMAPPED_GVA) 2157 return X86EMUL_PROPAGATE_FAULT; 2158 2159 mmio: 2160 /* 2161 * Is this MMIO handled locally? 2162 */ 2163 mutex_lock(&vcpu->kvm->lock); 2164 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2165 if (mmio_dev) { 2166 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 2167 mutex_unlock(&vcpu->kvm->lock); 2168 return X86EMUL_CONTINUE; 2169 } 2170 mutex_unlock(&vcpu->kvm->lock); 2171 2172 vcpu->mmio_needed = 1; 2173 vcpu->mmio_phys_addr = gpa; 2174 vcpu->mmio_size = bytes; 2175 vcpu->mmio_is_write = 0; 2176 2177 return X86EMUL_UNHANDLEABLE; 2178 } 2179 2180 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2181 const void *val, int bytes) 2182 { 2183 int ret; 2184 2185 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2186 if (ret < 0) 2187 return 0; 2188 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2189 return 1; 2190 } 2191 2192 static int emulator_write_emulated_onepage(unsigned long addr, 2193 const void *val, 2194 unsigned int bytes, 2195 struct kvm_vcpu *vcpu) 2196 { 2197 struct kvm_io_device *mmio_dev; 2198 gpa_t gpa; 2199 2200 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2201 2202 if (gpa == UNMAPPED_GVA) { 2203 kvm_inject_page_fault(vcpu, addr, 2); 2204 return X86EMUL_PROPAGATE_FAULT; 2205 } 2206 2207 /* For APIC access vmexit */ 2208 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2209 goto mmio; 2210 2211 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2212 return X86EMUL_CONTINUE; 2213 2214 mmio: 2215 /* 2216 * Is this MMIO handled locally? 2217 */ 2218 mutex_lock(&vcpu->kvm->lock); 2219 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); 2220 if (mmio_dev) { 2221 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2222 mutex_unlock(&vcpu->kvm->lock); 2223 return X86EMUL_CONTINUE; 2224 } 2225 mutex_unlock(&vcpu->kvm->lock); 2226 2227 vcpu->mmio_needed = 1; 2228 vcpu->mmio_phys_addr = gpa; 2229 vcpu->mmio_size = bytes; 2230 vcpu->mmio_is_write = 1; 2231 memcpy(vcpu->mmio_data, val, bytes); 2232 2233 return X86EMUL_CONTINUE; 2234 } 2235 2236 int emulator_write_emulated(unsigned long addr, 2237 const void *val, 2238 unsigned int bytes, 2239 struct kvm_vcpu *vcpu) 2240 { 2241 /* Crossing a page boundary? 
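If it does, the write is split at the boundary: now = -addr & ~PAGE_MASK is the number of bytes left in the first page (for example, addr == 0x1ffe with bytes == 4 and 4K pages gives now == 2), so the first onepage call covers those bytes and the second call covers the remainder.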
*/ 2242 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2243 int rc, now; 2244 2245 now = -addr & ~PAGE_MASK; 2246 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2247 if (rc != X86EMUL_CONTINUE) 2248 return rc; 2249 addr += now; 2250 val += now; 2251 bytes -= now; 2252 } 2253 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2254 } 2255 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2256 2257 static int emulator_cmpxchg_emulated(unsigned long addr, 2258 const void *old, 2259 const void *new, 2260 unsigned int bytes, 2261 struct kvm_vcpu *vcpu) 2262 { 2263 static int reported; 2264 2265 if (!reported) { 2266 reported = 1; 2267 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 2268 } 2269 #ifndef CONFIG_X86_64 2270 /* guests cmpxchg8b have to be emulated atomically */ 2271 if (bytes == 8) { 2272 gpa_t gpa; 2273 struct page *page; 2274 char *kaddr; 2275 u64 val; 2276 2277 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2278 2279 if (gpa == UNMAPPED_GVA || 2280 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2281 goto emul_write; 2282 2283 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2284 goto emul_write; 2285 2286 val = *(u64 *)new; 2287 2288 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2289 2290 kaddr = kmap_atomic(page, KM_USER0); 2291 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2292 kunmap_atomic(kaddr, KM_USER0); 2293 kvm_release_page_dirty(page); 2294 } 2295 emul_write: 2296 #endif 2297 2298 return emulator_write_emulated(addr, new, bytes, vcpu); 2299 } 2300 2301 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2302 { 2303 return kvm_x86_ops->get_segment_base(vcpu, seg); 2304 } 2305 2306 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2307 { 2308 kvm_mmu_invlpg(vcpu, address); 2309 return X86EMUL_CONTINUE; 2310 } 2311 2312 int emulate_clts(struct kvm_vcpu *vcpu) 2313 { 2314 KVMTRACE_0D(CLTS, vcpu, handler); 2315 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2316 return X86EMUL_CONTINUE; 2317 } 2318 2319 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2320 { 2321 struct kvm_vcpu *vcpu = ctxt->vcpu; 2322 2323 switch (dr) { 2324 case 0 ... 3: 2325 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2326 return X86EMUL_CONTINUE; 2327 default: 2328 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2329 return X86EMUL_UNHANDLEABLE; 2330 } 2331 } 2332 2333 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2334 { 2335 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; 2336 int exception; 2337 2338 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 2339 if (exception) { 2340 /* FIXME: better handling */ 2341 return X86EMUL_UNHANDLEABLE; 2342 } 2343 return X86EMUL_CONTINUE; 2344 } 2345 2346 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2347 { 2348 u8 opcodes[4]; 2349 unsigned long rip = kvm_rip_read(vcpu); 2350 unsigned long rip_linear; 2351 2352 if (!printk_ratelimit()) 2353 return; 2354 2355 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2356 2357 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 2358 2359 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2360 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2361 } 2362 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2363 2364 static struct x86_emulate_ops emulate_ops = { 2365 .read_std = kvm_read_guest_virt, 2366 .read_emulated = emulator_read_emulated, 2367 .write_emulated = emulator_write_emulated, 2368 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2369 }; 2370 2371 static void cache_all_regs(struct kvm_vcpu *vcpu) 2372 { 2373 kvm_register_read(vcpu, VCPU_REGS_RAX); 2374 kvm_register_read(vcpu, VCPU_REGS_RSP); 2375 kvm_register_read(vcpu, VCPU_REGS_RIP); 2376 vcpu->arch.regs_dirty = ~0; 2377 } 2378 2379 int emulate_instruction(struct kvm_vcpu *vcpu, 2380 struct kvm_run *run, 2381 unsigned long cr2, 2382 u16 error_code, 2383 int emulation_type) 2384 { 2385 int r, shadow_mask; 2386 struct decode_cache *c; 2387 2388 kvm_clear_exception_queue(vcpu); 2389 vcpu->arch.mmio_fault_cr2 = cr2; 2390 /* 2391 * TODO: fix x86_emulate.c to use guest_read/write_register 2392 * instead of direct ->regs accesses, can save a hundred cycles 2393 * on Intel for instructions that don't read/change RSP, 2394 * for example. 2395 */ 2396 cache_all_regs(vcpu); 2397 2398 vcpu->mmio_is_write = 0; 2399 vcpu->arch.pio.string = 0; 2400 2401 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 2402 int cs_db, cs_l; 2403 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 2404 2405 vcpu->arch.emulate_ctxt.vcpu = vcpu; 2406 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 2407 vcpu->arch.emulate_ctxt.mode = 2408 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 2409 ? X86EMUL_MODE_REAL : cs_l 2410 ? X86EMUL_MODE_PROT64 : cs_db 2411 ?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2412 2413 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2414 2415 /* Reject instructions other than VMCALL/VMMCALL when 2416 * trying to emulate an invalid opcode */ 2417 c = &vcpu->arch.emulate_ctxt.decode; 2418 if ((emulation_type & EMULTYPE_TRAP_UD) && 2419 (!(c->twobyte && c->b == 0x01 && 2420 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2421 c->modrm_mod == 3 && c->modrm_rm == 1))) 2422 return EMULATE_FAIL; 2423 2424 ++vcpu->stat.insn_emulation; 2425 if (r) { 2426 ++vcpu->stat.insn_emulation_fail; 2427 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2428 return EMULATE_DONE; 2429 return EMULATE_FAIL; 2430 } 2431 } 2432 2433 if (emulation_type & EMULTYPE_SKIP) { 2434 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 2435 return EMULATE_DONE; 2436 } 2437 2438 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2439 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 2440 2441 if (r == 0) 2442 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 2443 2444 if (vcpu->arch.pio.string) 2445 return EMULATE_DO_MMIO; 2446 2447 if ((r || vcpu->mmio_is_write) && run) { 2448 run->exit_reason = KVM_EXIT_MMIO; 2449 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2450 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2451 run->mmio.len = vcpu->mmio_size; 2452 run->mmio.is_write = vcpu->mmio_is_write; 2453 } 2454 2455 if (r) { 2456 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2457 return EMULATE_DONE; 2458 if (!vcpu->mmio_needed) { 2459 kvm_report_emulation_failure(vcpu, "mmio"); 2460 return EMULATE_FAIL; 2461 } 2462 return EMULATE_DO_MMIO; 2463 } 2464 2465 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2466 2467 if (vcpu->mmio_is_write) { 2468 vcpu->mmio_needed = 0; 2469 return EMULATE_DO_MMIO; 2470 } 2471 2472 return EMULATE_DONE; 2473 } 2474 EXPORT_SYMBOL_GPL(emulate_instruction); 2475 2476 static int pio_copy_data(struct kvm_vcpu *vcpu) 2477 { 2478 void *p = vcpu->arch.pio_data; 2479 gva_t q = vcpu->arch.pio.guest_gva; 2480 unsigned bytes; 2481 int ret; 2482 2483 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2484 if (vcpu->arch.pio.in) 2485 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 2486 else 2487 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 2488 return ret; 2489 } 2490 2491 int complete_pio(struct kvm_vcpu *vcpu) 2492 { 2493 struct kvm_pio_request *io = &vcpu->arch.pio; 2494 long delta; 2495 int r; 2496 unsigned long val; 2497 2498 if (!io->string) { 2499 if (io->in) { 2500 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2501 memcpy(&val, vcpu->arch.pio_data, io->size); 2502 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2503 } 2504 } else { 2505 if (io->in) { 2506 r = pio_copy_data(vcpu); 2507 if (r) 2508 return r; 2509 } 2510 2511 delta = 1; 2512 if (io->rep) { 2513 delta *= io->cur_count; 2514 /* 2515 * The size of the register should really depend on 2516 * current address size.
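 * (As written, RCX is read and written back at full register width, and the number of iterations completed in this round, io->cur_count, is what gets subtracted from it.)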
2517 */ 2518 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2519 val -= delta; 2520 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2521 } 2522 if (io->down) 2523 delta = -delta; 2524 delta *= io->size; 2525 if (io->in) { 2526 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2527 val += delta; 2528 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2529 } else { 2530 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2531 val += delta; 2532 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2533 } 2534 } 2535 2536 io->count -= io->cur_count; 2537 io->cur_count = 0; 2538 2539 return 0; 2540 } 2541 2542 static void kernel_pio(struct kvm_io_device *pio_dev, 2543 struct kvm_vcpu *vcpu, 2544 void *pd) 2545 { 2546 /* TODO: String I/O for in kernel device */ 2547 2548 mutex_lock(&vcpu->kvm->lock); 2549 if (vcpu->arch.pio.in) 2550 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2551 vcpu->arch.pio.size, 2552 pd); 2553 else 2554 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2555 vcpu->arch.pio.size, 2556 pd); 2557 mutex_unlock(&vcpu->kvm->lock); 2558 } 2559 2560 static void pio_string_write(struct kvm_io_device *pio_dev, 2561 struct kvm_vcpu *vcpu) 2562 { 2563 struct kvm_pio_request *io = &vcpu->arch.pio; 2564 void *pd = vcpu->arch.pio_data; 2565 int i; 2566 2567 mutex_lock(&vcpu->kvm->lock); 2568 for (i = 0; i < io->cur_count; i++) { 2569 kvm_iodevice_write(pio_dev, io->port, 2570 io->size, 2571 pd); 2572 pd += io->size; 2573 } 2574 mutex_unlock(&vcpu->kvm->lock); 2575 } 2576 2577 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2578 gpa_t addr, int len, 2579 int is_write) 2580 { 2581 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); 2582 } 2583 2584 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2585 int size, unsigned port) 2586 { 2587 struct kvm_io_device *pio_dev; 2588 unsigned long val; 2589 2590 vcpu->run->exit_reason = KVM_EXIT_IO; 2591 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2592 vcpu->run->io.size = vcpu->arch.pio.size = size; 2593 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2594 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2595 vcpu->run->io.port = vcpu->arch.pio.port = port; 2596 vcpu->arch.pio.in = in; 2597 vcpu->arch.pio.string = 0; 2598 vcpu->arch.pio.down = 0; 2599 vcpu->arch.pio.rep = 0; 2600 2601 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2602 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2603 handler); 2604 else 2605 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2606 handler); 2607 2608 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2609 memcpy(vcpu->arch.pio_data, &val, 4); 2610 2611 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2612 if (pio_dev) { 2613 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2614 complete_pio(vcpu); 2615 return 1; 2616 } 2617 return 0; 2618 } 2619 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2620 2621 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2622 int size, unsigned long count, int down, 2623 gva_t address, int rep, unsigned port) 2624 { 2625 unsigned now, in_page; 2626 int ret = 0; 2627 struct kvm_io_device *pio_dev; 2628 2629 vcpu->run->exit_reason = KVM_EXIT_IO; 2630 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2631 vcpu->run->io.size = vcpu->arch.pio.size = size; 2632 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2633 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2634 vcpu->run->io.port = vcpu->arch.pio.port = port; 2635 vcpu->arch.pio.in = in; 2636 vcpu->arch.pio.string = 1; 2637 vcpu->arch.pio.down = down; 2638 vcpu->arch.pio.rep = rep; 2639 2640 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2641 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2642 handler); 2643 else 2644 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2645 handler); 2646 2647 if (!count) { 2648 kvm_x86_ops->skip_emulated_instruction(vcpu); 2649 return 1; 2650 } 2651 2652 if (!down) 2653 in_page = PAGE_SIZE - offset_in_page(address); 2654 else 2655 in_page = offset_in_page(address) + size; 2656 now = min(count, (unsigned long)in_page / size); 2657 if (!now) 2658 now = 1; 2659 if (down) { 2660 /* 2661 * String I/O in reverse. Yuck. Kill the guest, fix later. 2662 */ 2663 pr_unimpl(vcpu, "guest string pio down\n"); 2664 kvm_inject_gp(vcpu, 0); 2665 return 1; 2666 } 2667 vcpu->run->io.count = now; 2668 vcpu->arch.pio.cur_count = now; 2669 2670 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2671 kvm_x86_ops->skip_emulated_instruction(vcpu); 2672 2673 vcpu->arch.pio.guest_gva = address; 2674 2675 pio_dev = vcpu_find_pio_dev(vcpu, port, 2676 vcpu->arch.pio.cur_count, 2677 !vcpu->arch.pio.in); 2678 if (!vcpu->arch.pio.in) { 2679 /* string PIO write */ 2680 ret = pio_copy_data(vcpu); 2681 if (ret == X86EMUL_PROPAGATE_FAULT) { 2682 kvm_inject_gp(vcpu, 0); 2683 return 1; 2684 } 2685 if (ret == 0 && pio_dev) { 2686 pio_string_write(pio_dev, vcpu); 2687 complete_pio(vcpu); 2688 if (vcpu->arch.pio.count == 0) 2689 ret = 1; 2690 } 2691 } else if (pio_dev) 2692 pr_unimpl(vcpu, "no string pio read support yet, " 2693 "port %x size %d count %ld\n", 2694 port, size, count); 2695 2696 return ret; 2697 } 2698 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2699 2700 static void bounce_off(void *info) 2701 { 2702 /* nothing */ 2703 } 2704 2705 static unsigned int ref_freq; 2706 static unsigned long tsc_khz_ref; 2707 2708 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 2709 void *data) 2710 { 2711 struct cpufreq_freqs *freq = data; 2712 struct kvm *kvm; 2713 struct kvm_vcpu *vcpu; 2714 int i, send_ipi = 0; 2715 2716 if (!ref_freq) 2717 ref_freq = freq->old; 2718 2719 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 2720 return 0; 2721 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 2722 return 0; 2723 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 2724 2725 spin_lock(&kvm_lock); 2726 list_for_each_entry(kvm, &vm_list, vm_list) { 2727 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2728 vcpu = kvm->vcpus[i]; 2729 if (!vcpu) 2730 continue; 2731 if (vcpu->cpu != freq->cpu) 2732 continue; 2733 if (!kvm_request_guest_time_update(vcpu)) 2734 continue; 2735 if (vcpu->cpu != smp_processor_id()) 2736 send_ipi++; 2737 } 2738 } 2739 spin_unlock(&kvm_lock); 2740 2741 if (freq->old < freq->new && send_ipi) { 2742 /* 2743 * We upscale the frequency. Must make sure the guest 2744 * doesn't see old kvmclock values while running with 2745 * the new frequency, otherwise we risk the guest seeing 2746 * time go backwards. 2747 * 2748 * In case we update the frequency for another cpu 2749 * (which might be in guest context) send an interrupt 2750 * to kick the cpu out of guest context.
Next time 2751 * guest context is entered kvmclock will be updated, 2752 * so the guest will not see stale values. 2753 */ 2754 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 2755 } 2756 return 0; 2757 } 2758 2759 static struct notifier_block kvmclock_cpufreq_notifier_block = { 2760 .notifier_call = kvmclock_cpufreq_notifier 2761 }; 2762 2763 int kvm_arch_init(void *opaque) 2764 { 2765 int r, cpu; 2766 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2767 2768 if (kvm_x86_ops) { 2769 printk(KERN_ERR "kvm: already loaded the other module\n"); 2770 r = -EEXIST; 2771 goto out; 2772 } 2773 2774 if (!ops->cpu_has_kvm_support()) { 2775 printk(KERN_ERR "kvm: no hardware support\n"); 2776 r = -EOPNOTSUPP; 2777 goto out; 2778 } 2779 if (ops->disabled_by_bios()) { 2780 printk(KERN_ERR "kvm: disabled by bios\n"); 2781 r = -EOPNOTSUPP; 2782 goto out; 2783 } 2784 2785 r = kvm_mmu_module_init(); 2786 if (r) 2787 goto out; 2788 2789 kvm_init_msr_list(); 2790 2791 kvm_x86_ops = ops; 2792 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2793 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2794 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2795 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2796 2797 for_each_possible_cpu(cpu) 2798 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2799 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 2800 tsc_khz_ref = tsc_khz; 2801 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 2802 CPUFREQ_TRANSITION_NOTIFIER); 2803 } 2804 2805 return 0; 2806 2807 out: 2808 return r; 2809 } 2810 2811 void kvm_arch_exit(void) 2812 { 2813 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 2814 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 2815 CPUFREQ_TRANSITION_NOTIFIER); 2816 kvm_x86_ops = NULL; 2817 kvm_mmu_module_exit(); 2818 } 2819 2820 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2821 { 2822 ++vcpu->stat.halt_exits; 2823 KVMTRACE_0D(HLT, vcpu, handler); 2824 if (irqchip_in_kernel(vcpu->kvm)) { 2825 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2826 return 1; 2827 } else { 2828 vcpu->run->exit_reason = KVM_EXIT_HLT; 2829 return 0; 2830 } 2831 } 2832 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2833 2834 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2835 unsigned long a1) 2836 { 2837 if (is_long_mode(vcpu)) 2838 return a0; 2839 else 2840 return a0 | ((gpa_t)a1 << 32); 2841 } 2842 2843 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2844 { 2845 unsigned long nr, a0, a1, a2, a3, ret; 2846 int r = 1; 2847 2848 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 2849 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 2850 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 2851 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 2852 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 2853 2854 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2855 2856 if (!is_long_mode(vcpu)) { 2857 nr &= 0xFFFFFFFF; 2858 a0 &= 0xFFFFFFFF; 2859 a1 &= 0xFFFFFFFF; 2860 a2 &= 0xFFFFFFFF; 2861 a3 &= 0xFFFFFFFF; 2862 } 2863 2864 switch (nr) { 2865 case KVM_HC_VAPIC_POLL_IRQ: 2866 ret = 0; 2867 break; 2868 case KVM_HC_MMU_OP: 2869 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2870 break; 2871 default: 2872 ret = -KVM_ENOSYS; 2873 break; 2874 } 2875 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 2876 ++vcpu->stat.hypercalls; 2877 return r; 2878 } 2879 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2880 2881 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2882 { 2883 char instruction[3]; 2884 int ret = 0; 2885 unsigned long rip = kvm_rip_read(vcpu); 2886 2887 2888 /* 2889 * Blow out the MMU to ensure that no other VCPU has an active 
mapping 2890 * to ensure that the updated hypercall appears atomically across all 2891 * VCPUs. 2892 */ 2893 kvm_mmu_zap_all(vcpu->kvm); 2894 2895 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2896 if (emulator_write_emulated(rip, instruction, 3, vcpu) 2897 != X86EMUL_CONTINUE) 2898 ret = -EFAULT; 2899 2900 return ret; 2901 } 2902 2903 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2904 { 2905 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2906 } 2907 2908 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2909 { 2910 struct descriptor_table dt = { limit, base }; 2911 2912 kvm_x86_ops->set_gdt(vcpu, &dt); 2913 } 2914 2915 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2916 { 2917 struct descriptor_table dt = { limit, base }; 2918 2919 kvm_x86_ops->set_idt(vcpu, &dt); 2920 } 2921 2922 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2923 unsigned long *rflags) 2924 { 2925 kvm_lmsw(vcpu, msw); 2926 *rflags = kvm_x86_ops->get_rflags(vcpu); 2927 } 2928 2929 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2930 { 2931 unsigned long value; 2932 2933 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2934 switch (cr) { 2935 case 0: 2936 value = vcpu->arch.cr0; 2937 break; 2938 case 2: 2939 value = vcpu->arch.cr2; 2940 break; 2941 case 3: 2942 value = vcpu->arch.cr3; 2943 break; 2944 case 4: 2945 value = vcpu->arch.cr4; 2946 break; 2947 case 8: 2948 value = kvm_get_cr8(vcpu); 2949 break; 2950 default: 2951 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2952 return 0; 2953 } 2954 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, 2955 (u32)((u64)value >> 32), handler); 2956 2957 return value; 2958 } 2959 2960 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2961 unsigned long *rflags) 2962 { 2963 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, 2964 (u32)((u64)val >> 32), handler); 2965 2966 switch (cr) { 2967 case 0: 2968 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2969 *rflags = kvm_x86_ops->get_rflags(vcpu); 2970 break; 2971 case 2: 2972 vcpu->arch.cr2 = val; 2973 break; 2974 case 3: 2975 kvm_set_cr3(vcpu, val); 2976 break; 2977 case 4: 2978 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2979 break; 2980 case 8: 2981 kvm_set_cr8(vcpu, val & 0xfUL); 2982 break; 2983 default: 2984 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2985 } 2986 } 2987 2988 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 2989 { 2990 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 2991 int j, nent = vcpu->arch.cpuid_nent; 2992 2993 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2994 /* when no next entry is found, the current entry[i] is reselected */ 2995 for (j = i + 1; ; j = (j + 1) % nent) { 2996 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2997 if (ej->function == e->function) { 2998 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2999 return j; 3000 } 3001 } 3002 return 0; /* silence gcc, even though control never reaches here */ 3003 } 3004 3005 /* find an entry with matching function, matching index (if needed), and that 3006 * should be read next (if it's stateful) */ 3007 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 3008 u32 function, u32 index) 3009 { 3010 if (e->function != function) 3011 return 0; 3012 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 3013 return 0; 3014 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 3015 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 3016 return 0; 3017 return 1; 3018 } 
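/* Find the guest CPUID entry for (function, index): an exact match is preferred, advancing the read pointer for stateful functions; otherwise fall back to the highest-numbered entry in the same range (basic vs extended, i.e. the same bit 31 of the function number), which roughly mirrors how real CPUID handles out-of-range leaves. */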
3019 3020 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 3021 u32 function, u32 index) 3022 { 3023 int i; 3024 struct kvm_cpuid_entry2 *best = NULL; 3025 3026 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 3027 struct kvm_cpuid_entry2 *e; 3028 3029 e = &vcpu->arch.cpuid_entries[i]; 3030 if (is_matching_cpuid_entry(e, function, index)) { 3031 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 3032 move_to_next_stateful_cpuid_entry(vcpu, i); 3033 best = e; 3034 break; 3035 } 3036 /* 3037 * Both basic or both extended? 3038 */ 3039 if (((e->function ^ function) & 0x80000000) == 0) 3040 if (!best || e->function > best->function) 3041 best = e; 3042 } 3043 return best; 3044 } 3045 3046 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 3047 { 3048 struct kvm_cpuid_entry2 *best; 3049 3050 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 3051 if (best) 3052 return best->eax & 0xff; 3053 return 36; 3054 } 3055 3056 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3057 { 3058 u32 function, index; 3059 struct kvm_cpuid_entry2 *best; 3060 3061 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 3062 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3063 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 3064 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 3065 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 3066 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 3067 best = kvm_find_cpuid_entry(vcpu, function, index); 3068 if (best) { 3069 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3070 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3071 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 3072 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3073 } 3074 kvm_x86_ops->skip_emulated_instruction(vcpu); 3075 KVMTRACE_5D(CPUID, vcpu, function, 3076 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), 3077 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), 3078 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), 3079 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); 3080 } 3081 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3082 3083 /* 3084 * Check if userspace requested an interrupt window, and that the 3085 * interrupt window is open. 3086 * 3087 * No need to exit to userspace if we already have an interrupt queued. 
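 * The request is honoured only when the irqchip is emulated in userspace, no injectable interrupt is already pending, userspace asked to be notified of open interrupt windows, and the vcpu can currently accept an interrupt.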
3088 */ 3089 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3090 struct kvm_run *kvm_run) 3091 { 3092 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3093 kvm_run->request_interrupt_window && 3094 kvm_arch_interrupt_allowed(vcpu)); 3095 } 3096 3097 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3098 struct kvm_run *kvm_run) 3099 { 3100 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3101 kvm_run->cr8 = kvm_get_cr8(vcpu); 3102 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3103 if (irqchip_in_kernel(vcpu->kvm)) 3104 kvm_run->ready_for_interrupt_injection = 1; 3105 else 3106 kvm_run->ready_for_interrupt_injection = 3107 kvm_arch_interrupt_allowed(vcpu) && 3108 !kvm_cpu_has_interrupt(vcpu) && 3109 !kvm_event_needs_reinjection(vcpu); 3110 } 3111 3112 static void vapic_enter(struct kvm_vcpu *vcpu) 3113 { 3114 struct kvm_lapic *apic = vcpu->arch.apic; 3115 struct page *page; 3116 3117 if (!apic || !apic->vapic_addr) 3118 return; 3119 3120 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3121 3122 vcpu->arch.apic->vapic_page = page; 3123 } 3124 3125 static void vapic_exit(struct kvm_vcpu *vcpu) 3126 { 3127 struct kvm_lapic *apic = vcpu->arch.apic; 3128 3129 if (!apic || !apic->vapic_addr) 3130 return; 3131 3132 down_read(&vcpu->kvm->slots_lock); 3133 kvm_release_page_dirty(apic->vapic_page); 3134 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 3135 up_read(&vcpu->kvm->slots_lock); 3136 } 3137 3138 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 3139 { 3140 int max_irr, tpr; 3141 3142 if (!kvm_x86_ops->update_cr8_intercept) 3143 return; 3144 3145 if (!vcpu->arch.apic->vapic_addr) 3146 max_irr = kvm_lapic_find_highest_irr(vcpu); 3147 else 3148 max_irr = -1; 3149 3150 if (max_irr != -1) 3151 max_irr >>= 4; 3152 3153 tpr = kvm_lapic_get_cr8(vcpu); 3154 3155 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3156 } 3157 3158 static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3159 { 3160 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3161 kvm_x86_ops->set_interrupt_shadow(vcpu, 0); 3162 3163 /* try to reinject previous events if any */ 3164 if (vcpu->arch.nmi_injected) { 3165 kvm_x86_ops->set_nmi(vcpu); 3166 return; 3167 } 3168 3169 if (vcpu->arch.interrupt.pending) { 3170 kvm_x86_ops->set_irq(vcpu); 3171 return; 3172 } 3173 3174 /* try to inject new event if pending */ 3175 if (vcpu->arch.nmi_pending) { 3176 if (kvm_x86_ops->nmi_allowed(vcpu)) { 3177 vcpu->arch.nmi_pending = false; 3178 vcpu->arch.nmi_injected = true; 3179 kvm_x86_ops->set_nmi(vcpu); 3180 } 3181 } else if (kvm_cpu_has_interrupt(vcpu)) { 3182 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 3183 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 3184 false); 3185 kvm_x86_ops->set_irq(vcpu); 3186 } 3187 } 3188 } 3189 3190 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3191 { 3192 int r; 3193 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3194 kvm_run->request_interrupt_window; 3195 3196 if (vcpu->requests) 3197 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3198 kvm_mmu_unload(vcpu); 3199 3200 r = kvm_mmu_reload(vcpu); 3201 if (unlikely(r)) 3202 goto out; 3203 3204 if (vcpu->requests) { 3205 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3206 __kvm_migrate_timers(vcpu); 3207 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 3208 kvm_write_guest_time(vcpu); 3209 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 
3210 kvm_mmu_sync_roots(vcpu); 3211 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3212 kvm_x86_ops->tlb_flush(vcpu); 3213 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3214 &vcpu->requests)) { 3215 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3216 r = 0; 3217 goto out; 3218 } 3219 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3220 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3221 r = 0; 3222 goto out; 3223 } 3224 } 3225 3226 preempt_disable(); 3227 3228 kvm_x86_ops->prepare_guest_switch(vcpu); 3229 kvm_load_guest_fpu(vcpu); 3230 3231 local_irq_disable(); 3232 3233 clear_bit(KVM_REQ_KICK, &vcpu->requests); 3234 smp_mb__after_clear_bit(); 3235 3236 if (vcpu->requests || need_resched() || signal_pending(current)) { 3237 local_irq_enable(); 3238 preempt_enable(); 3239 r = 1; 3240 goto out; 3241 } 3242 3243 if (vcpu->arch.exception.pending) 3244 __queue_exception(vcpu); 3245 else 3246 inject_pending_irq(vcpu, kvm_run); 3247 3248 /* enable NMI/IRQ window open exits if needed */ 3249 if (vcpu->arch.nmi_pending) 3250 kvm_x86_ops->enable_nmi_window(vcpu); 3251 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 3252 kvm_x86_ops->enable_irq_window(vcpu); 3253 3254 if (kvm_lapic_enabled(vcpu)) { 3255 update_cr8_intercept(vcpu); 3256 kvm_lapic_sync_to_vapic(vcpu); 3257 } 3258 3259 up_read(&vcpu->kvm->slots_lock); 3260 3261 kvm_guest_enter(); 3262 3263 get_debugreg(vcpu->arch.host_dr6, 6); 3264 get_debugreg(vcpu->arch.host_dr7, 7); 3265 if (unlikely(vcpu->arch.switch_db_regs)) { 3266 get_debugreg(vcpu->arch.host_db[0], 0); 3267 get_debugreg(vcpu->arch.host_db[1], 1); 3268 get_debugreg(vcpu->arch.host_db[2], 2); 3269 get_debugreg(vcpu->arch.host_db[3], 3); 3270 3271 set_debugreg(0, 7); 3272 set_debugreg(vcpu->arch.eff_db[0], 0); 3273 set_debugreg(vcpu->arch.eff_db[1], 1); 3274 set_debugreg(vcpu->arch.eff_db[2], 2); 3275 set_debugreg(vcpu->arch.eff_db[3], 3); 3276 } 3277 3278 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3279 kvm_x86_ops->run(vcpu, kvm_run); 3280 3281 if (unlikely(vcpu->arch.switch_db_regs)) { 3282 set_debugreg(0, 7); 3283 set_debugreg(vcpu->arch.host_db[0], 0); 3284 set_debugreg(vcpu->arch.host_db[1], 1); 3285 set_debugreg(vcpu->arch.host_db[2], 2); 3286 set_debugreg(vcpu->arch.host_db[3], 3); 3287 } 3288 set_debugreg(vcpu->arch.host_dr6, 6); 3289 set_debugreg(vcpu->arch.host_dr7, 7); 3290 3291 set_bit(KVM_REQ_KICK, &vcpu->requests); 3292 local_irq_enable(); 3293 3294 ++vcpu->stat.exits; 3295 3296 /* 3297 * We must have an instruction between local_irq_enable() and 3298 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3299 * the interrupt shadow. The stat.exits increment will do nicely. 
3300 * But we need to prevent reordering, hence this barrier(): 3301 */ 3302 barrier(); 3303 3304 kvm_guest_exit(); 3305 3306 preempt_enable(); 3307 3308 down_read(&vcpu->kvm->slots_lock); 3309 3310 /* 3311 * Profile KVM exit RIPs: 3312 */ 3313 if (unlikely(prof_on == KVM_PROFILING)) { 3314 unsigned long rip = kvm_rip_read(vcpu); 3315 profile_hit(KVM_PROFILING, (void *)rip); 3316 } 3317 3318 3319 kvm_lapic_sync_from_vapic(vcpu); 3320 3321 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3322 out: 3323 return r; 3324 } 3325 3326 3327 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3328 { 3329 int r; 3330 3331 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3332 pr_debug("vcpu %d received sipi with vector # %x\n", 3333 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3334 kvm_lapic_reset(vcpu); 3335 r = kvm_arch_vcpu_reset(vcpu); 3336 if (r) 3337 return r; 3338 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3339 } 3340 3341 down_read(&vcpu->kvm->slots_lock); 3342 vapic_enter(vcpu); 3343 3344 r = 1; 3345 while (r > 0) { 3346 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3347 r = vcpu_enter_guest(vcpu, kvm_run); 3348 else { 3349 up_read(&vcpu->kvm->slots_lock); 3350 kvm_vcpu_block(vcpu); 3351 down_read(&vcpu->kvm->slots_lock); 3352 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3353 { 3354 switch(vcpu->arch.mp_state) { 3355 case KVM_MP_STATE_HALTED: 3356 vcpu->arch.mp_state = 3357 KVM_MP_STATE_RUNNABLE; 3358 case KVM_MP_STATE_RUNNABLE: 3359 break; 3360 case KVM_MP_STATE_SIPI_RECEIVED: 3361 default: 3362 r = -EINTR; 3363 break; 3364 } 3365 } 3366 } 3367 3368 if (r <= 0) 3369 break; 3370 3371 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 3372 if (kvm_cpu_has_pending_timer(vcpu)) 3373 kvm_inject_pending_timer_irqs(vcpu); 3374 3375 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3376 r = -EINTR; 3377 kvm_run->exit_reason = KVM_EXIT_INTR; 3378 ++vcpu->stat.request_irq_exits; 3379 } 3380 if (signal_pending(current)) { 3381 r = -EINTR; 3382 kvm_run->exit_reason = KVM_EXIT_INTR; 3383 ++vcpu->stat.signal_exits; 3384 } 3385 if (need_resched()) { 3386 up_read(&vcpu->kvm->slots_lock); 3387 kvm_resched(vcpu); 3388 down_read(&vcpu->kvm->slots_lock); 3389 } 3390 } 3391 3392 up_read(&vcpu->kvm->slots_lock); 3393 post_kvm_run_save(vcpu, kvm_run); 3394 3395 vapic_exit(vcpu); 3396 3397 return r; 3398 } 3399 3400 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3401 { 3402 int r; 3403 sigset_t sigsaved; 3404 3405 vcpu_load(vcpu); 3406 3407 if (vcpu->sigset_active) 3408 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3409 3410 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3411 kvm_vcpu_block(vcpu); 3412 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3413 r = -EAGAIN; 3414 goto out; 3415 } 3416 3417 /* re-sync apic's tpr */ 3418 if (!irqchip_in_kernel(vcpu->kvm)) 3419 kvm_set_cr8(vcpu, kvm_run->cr8); 3420 3421 if (vcpu->arch.pio.cur_count) { 3422 r = complete_pio(vcpu); 3423 if (r) 3424 goto out; 3425 } 3426 #if CONFIG_HAS_IOMEM 3427 if (vcpu->mmio_needed) { 3428 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3429 vcpu->mmio_read_completed = 1; 3430 vcpu->mmio_needed = 0; 3431 3432 down_read(&vcpu->kvm->slots_lock); 3433 r = emulate_instruction(vcpu, kvm_run, 3434 vcpu->arch.mmio_fault_cr2, 0, 3435 EMULTYPE_NO_DECODE); 3436 up_read(&vcpu->kvm->slots_lock); 3437 if (r == EMULATE_DO_MMIO) { 3438 /* 3439 * Read-modify-write. Back to userspace. 
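 * The emulated instruction consumed the MMIO read result but raised another MMIO access (typically the write half of the same read-modify-write instruction), so hand that request back to userspace before the instruction can complete.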
3440 */ 3441 r = 0; 3442 goto out; 3443 } 3444 } 3445 #endif 3446 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3447 kvm_register_write(vcpu, VCPU_REGS_RAX, 3448 kvm_run->hypercall.ret); 3449 3450 r = __vcpu_run(vcpu, kvm_run); 3451 3452 out: 3453 if (vcpu->sigset_active) 3454 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3455 3456 vcpu_put(vcpu); 3457 return r; 3458 } 3459 3460 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3461 { 3462 vcpu_load(vcpu); 3463 3464 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3465 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3466 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3467 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3468 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3469 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3470 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3471 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3472 #ifdef CONFIG_X86_64 3473 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3474 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3475 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3476 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3477 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3478 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3479 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3480 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3481 #endif 3482 3483 regs->rip = kvm_rip_read(vcpu); 3484 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3485 3486 /* 3487 * Don't leak debug flags in case they were set for guest debugging 3488 */ 3489 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3490 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3491 3492 vcpu_put(vcpu); 3493 3494 return 0; 3495 } 3496 3497 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3498 { 3499 vcpu_load(vcpu); 3500 3501 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3502 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3503 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3504 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3505 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3506 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3507 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3508 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3509 #ifdef CONFIG_X86_64 3510 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3511 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3512 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3513 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3514 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3515 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3516 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3517 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3518 3519 #endif 3520 3521 kvm_rip_write(vcpu, regs->rip); 3522 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3523 3524 3525 vcpu->arch.exception.pending = false; 3526 3527 vcpu_put(vcpu); 3528 3529 return 0; 3530 } 3531 3532 void kvm_get_segment(struct kvm_vcpu *vcpu, 3533 struct kvm_segment *var, int seg) 3534 { 3535 kvm_x86_ops->get_segment(vcpu, var, seg); 3536 } 3537 3538 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3539 { 3540 struct kvm_segment cs; 3541 3542 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3543 *db = cs.db; 3544 *l = cs.l; 3545 } 3546 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3547 3548 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3549 struct kvm_sregs 
*sregs) 3550 { 3551 struct descriptor_table dt; 3552 3553 vcpu_load(vcpu); 3554 3555 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3556 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3557 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3558 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3559 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3560 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3561 3562 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3563 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3564 3565 kvm_x86_ops->get_idt(vcpu, &dt); 3566 sregs->idt.limit = dt.limit; 3567 sregs->idt.base = dt.base; 3568 kvm_x86_ops->get_gdt(vcpu, &dt); 3569 sregs->gdt.limit = dt.limit; 3570 sregs->gdt.base = dt.base; 3571 3572 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3573 sregs->cr0 = vcpu->arch.cr0; 3574 sregs->cr2 = vcpu->arch.cr2; 3575 sregs->cr3 = vcpu->arch.cr3; 3576 sregs->cr4 = vcpu->arch.cr4; 3577 sregs->cr8 = kvm_get_cr8(vcpu); 3578 sregs->efer = vcpu->arch.shadow_efer; 3579 sregs->apic_base = kvm_get_apic_base(vcpu); 3580 3581 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 3582 3583 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 3584 set_bit(vcpu->arch.interrupt.nr, 3585 (unsigned long *)sregs->interrupt_bitmap); 3586 3587 vcpu_put(vcpu); 3588 3589 return 0; 3590 } 3591 3592 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3593 struct kvm_mp_state *mp_state) 3594 { 3595 vcpu_load(vcpu); 3596 mp_state->mp_state = vcpu->arch.mp_state; 3597 vcpu_put(vcpu); 3598 return 0; 3599 } 3600 3601 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3602 struct kvm_mp_state *mp_state) 3603 { 3604 vcpu_load(vcpu); 3605 vcpu->arch.mp_state = mp_state->mp_state; 3606 vcpu_put(vcpu); 3607 return 0; 3608 } 3609 3610 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3611 struct kvm_segment *var, int seg) 3612 { 3613 kvm_x86_ops->set_segment(vcpu, var, seg); 3614 } 3615 3616 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3617 struct kvm_segment *kvm_desct) 3618 { 3619 kvm_desct->base = seg_desc->base0; 3620 kvm_desct->base |= seg_desc->base1 << 16; 3621 kvm_desct->base |= seg_desc->base2 << 24; 3622 kvm_desct->limit = seg_desc->limit0; 3623 kvm_desct->limit |= seg_desc->limit << 16; 3624 if (seg_desc->g) { 3625 kvm_desct->limit <<= 12; 3626 kvm_desct->limit |= 0xfff; 3627 } 3628 kvm_desct->selector = selector; 3629 kvm_desct->type = seg_desc->type; 3630 kvm_desct->present = seg_desc->p; 3631 kvm_desct->dpl = seg_desc->dpl; 3632 kvm_desct->db = seg_desc->d; 3633 kvm_desct->s = seg_desc->s; 3634 kvm_desct->l = seg_desc->l; 3635 kvm_desct->g = seg_desc->g; 3636 kvm_desct->avl = seg_desc->avl; 3637 if (!selector) 3638 kvm_desct->unusable = 1; 3639 else 3640 kvm_desct->unusable = 0; 3641 kvm_desct->padding = 0; 3642 } 3643 3644 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 3645 u16 selector, 3646 struct descriptor_table *dtable) 3647 { 3648 if (selector & 1 << 2) { 3649 struct kvm_segment kvm_seg; 3650 3651 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3652 3653 if (kvm_seg.unusable) 3654 dtable->limit = 0; 3655 else 3656 dtable->limit = kvm_seg.limit; 3657 dtable->base = kvm_seg.base; 3658 } 3659 else 3660 kvm_x86_ops->get_gdt(vcpu, dtable); 3661 } 3662 3663 /* allowed just for 8 bytes segments */ 3664 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3665 struct desc_struct *seg_desc) 3666 { 3667 gpa_t gpa; 3668 struct descriptor_table dtable; 3669 u16 index = 
selector >> 3; 3670 3671 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3672 3673 if (dtable.limit < index * 8 + 7) { 3674 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3675 return 1; 3676 } 3677 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3678 gpa += index * 8; 3679 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); 3680 } 3681 3682 /* allowed just for 8 bytes segments */ 3683 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3684 struct desc_struct *seg_desc) 3685 { 3686 gpa_t gpa; 3687 struct descriptor_table dtable; 3688 u16 index = selector >> 3; 3689 3690 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3691 3692 if (dtable.limit < index * 8 + 7) 3693 return 1; 3694 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3695 gpa += index * 8; 3696 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); 3697 } 3698 3699 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3700 struct desc_struct *seg_desc) 3701 { 3702 u32 base_addr; 3703 3704 base_addr = seg_desc->base0; 3705 base_addr |= (seg_desc->base1 << 16); 3706 base_addr |= (seg_desc->base2 << 24); 3707 3708 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 3709 } 3710 3711 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3712 { 3713 struct kvm_segment kvm_seg; 3714 3715 kvm_get_segment(vcpu, &kvm_seg, seg); 3716 return kvm_seg.selector; 3717 } 3718 3719 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3720 u16 selector, 3721 struct kvm_segment *kvm_seg) 3722 { 3723 struct desc_struct seg_desc; 3724 3725 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3726 return 1; 3727 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3728 return 0; 3729 } 3730 3731 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 3732 { 3733 struct kvm_segment segvar = { 3734 .base = selector << 4, 3735 .limit = 0xffff, 3736 .selector = selector, 3737 .type = 3, 3738 .present = 1, 3739 .dpl = 3, 3740 .db = 0, 3741 .s = 1, 3742 .l = 0, 3743 .g = 0, 3744 .avl = 0, 3745 .unusable = 0, 3746 }; 3747 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 3748 return 0; 3749 } 3750 3751 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3752 int type_bits, int seg) 3753 { 3754 struct kvm_segment kvm_seg; 3755 3756 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 3757 return kvm_load_realmode_segment(vcpu, selector, seg); 3758 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3759 return 1; 3760 kvm_seg.type |= type_bits; 3761 3762 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3763 seg != VCPU_SREG_LDTR) 3764 if (!kvm_seg.s) 3765 kvm_seg.unusable = 1; 3766 3767 kvm_set_segment(vcpu, &kvm_seg, seg); 3768 return 0; 3769 } 3770 3771 static void save_state_to_tss32(struct kvm_vcpu *vcpu, 3772 struct tss_segment_32 *tss) 3773 { 3774 tss->cr3 = vcpu->arch.cr3; 3775 tss->eip = kvm_rip_read(vcpu); 3776 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3777 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3778 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3779 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3780 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3781 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3782 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3783 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3784 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3785 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3786 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3787 tss->ss 
= get_segment_selector(vcpu, VCPU_SREG_SS); 3788 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3789 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3790 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3791 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3792 } 3793 3794 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3795 struct tss_segment_32 *tss) 3796 { 3797 kvm_set_cr3(vcpu, tss->cr3); 3798 3799 kvm_rip_write(vcpu, tss->eip); 3800 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3801 3802 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 3803 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 3804 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 3805 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 3806 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 3807 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 3808 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 3809 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 3810 3811 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3812 return 1; 3813 3814 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3815 return 1; 3816 3817 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3818 return 1; 3819 3820 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3821 return 1; 3822 3823 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3824 return 1; 3825 3826 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3827 return 1; 3828 3829 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3830 return 1; 3831 return 0; 3832 } 3833 3834 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3835 struct tss_segment_16 *tss) 3836 { 3837 tss->ip = kvm_rip_read(vcpu); 3838 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3839 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3840 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3841 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3842 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3843 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3844 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3845 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 3846 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 3847 3848 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3849 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3850 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3851 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3852 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3853 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3854 } 3855 3856 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3857 struct tss_segment_16 *tss) 3858 { 3859 kvm_rip_write(vcpu, tss->ip); 3860 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3861 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 3862 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 3863 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 3864 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 3865 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 3866 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 3867 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 3868 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 3869 3870 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3871 return 1; 3872 3873 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3874 return 1; 3875 3876 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3877 return 1; 3878 3879 
if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3880 return 1; 3881 3882 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3883 return 1; 3884 return 0; 3885 } 3886 3887 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3888 u16 old_tss_sel, u32 old_tss_base, 3889 struct desc_struct *nseg_desc) 3890 { 3891 struct tss_segment_16 tss_segment_16; 3892 int ret = 0; 3893 3894 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3895 sizeof tss_segment_16)) 3896 goto out; 3897 3898 save_state_to_tss16(vcpu, &tss_segment_16); 3899 3900 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3901 sizeof tss_segment_16)) 3902 goto out; 3903 3904 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3905 &tss_segment_16, sizeof tss_segment_16)) 3906 goto out; 3907 3908 if (old_tss_sel != 0xffff) { 3909 tss_segment_16.prev_task_link = old_tss_sel; 3910 3911 if (kvm_write_guest(vcpu->kvm, 3912 get_tss_base_addr(vcpu, nseg_desc), 3913 &tss_segment_16.prev_task_link, 3914 sizeof tss_segment_16.prev_task_link)) 3915 goto out; 3916 } 3917 3918 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3919 goto out; 3920 3921 ret = 1; 3922 out: 3923 return ret; 3924 } 3925 3926 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3927 u16 old_tss_sel, u32 old_tss_base, 3928 struct desc_struct *nseg_desc) 3929 { 3930 struct tss_segment_32 tss_segment_32; 3931 int ret = 0; 3932 3933 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3934 sizeof tss_segment_32)) 3935 goto out; 3936 3937 save_state_to_tss32(vcpu, &tss_segment_32); 3938 3939 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3940 sizeof tss_segment_32)) 3941 goto out; 3942 3943 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3944 &tss_segment_32, sizeof tss_segment_32)) 3945 goto out; 3946 3947 if (old_tss_sel != 0xffff) { 3948 tss_segment_32.prev_task_link = old_tss_sel; 3949 3950 if (kvm_write_guest(vcpu->kvm, 3951 get_tss_base_addr(vcpu, nseg_desc), 3952 &tss_segment_32.prev_task_link, 3953 sizeof tss_segment_32.prev_task_link)) 3954 goto out; 3955 } 3956 3957 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3958 goto out; 3959 3960 ret = 1; 3961 out: 3962 return ret; 3963 } 3964 3965 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3966 { 3967 struct kvm_segment tr_seg; 3968 struct desc_struct cseg_desc; 3969 struct desc_struct nseg_desc; 3970 int ret = 0; 3971 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 3972 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 3973 3974 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 3975 3976 /* FIXME: Handle errors. Failure to read either TSS or their 3977 * descriptors should generate a pagefault. 
3978 */ 3979 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3980 goto out; 3981 3982 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 3983 goto out; 3984 3985 if (reason != TASK_SWITCH_IRET) { 3986 int cpl; 3987 3988 cpl = kvm_x86_ops->get_cpl(vcpu); 3989 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3990 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3991 return 1; 3992 } 3993 } 3994 3995 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3996 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3997 return 1; 3998 } 3999 4000 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 4001 cseg_desc.type &= ~(1 << 1); /* clear the B flag */ 4002 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 4003 } 4004 4005 if (reason == TASK_SWITCH_IRET) { 4006 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4007 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4008 } 4009 4010 /* set back link to prev task only if NT bit is set in eflags 4011 note that old_tss_sel is not used after this point */ 4012 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4013 old_tss_sel = 0xffff; 4014 4020 if (nseg_desc.type & 8) 4021 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4022 old_tss_base, &nseg_desc); 4023 else 4024 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, 4025 old_tss_base, &nseg_desc); 4026 4027 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4028 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4029 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4030 } 4031 4032 if (reason != TASK_SWITCH_IRET) { 4033 nseg_desc.type |= (1 << 1); 4034 save_guest_segment_descriptor(vcpu, tss_selector, 4035 &nseg_desc); 4036 } 4037 4038 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 4039 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 4040 tr_seg.type = 11; 4041 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 4042 out: 4043 return ret; 4044 } 4045 EXPORT_SYMBOL_GPL(kvm_task_switch); 4046 4047 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 4048 struct kvm_sregs *sregs) 4049 { 4050 int mmu_reset_needed = 0; 4051 int pending_vec, max_bits; 4052 struct descriptor_table dt; 4053 4054 vcpu_load(vcpu); 4055 4056 dt.limit = sregs->idt.limit; 4057 dt.base = sregs->idt.base; 4058 kvm_x86_ops->set_idt(vcpu, &dt); 4059 dt.limit = sregs->gdt.limit; 4060 dt.base = sregs->gdt.base; 4061 kvm_x86_ops->set_gdt(vcpu, &dt); 4062 4063 vcpu->arch.cr2 = sregs->cr2; 4064 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4065 4066 down_read(&vcpu->kvm->slots_lock); 4067 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) 4068 vcpu->arch.cr3 = sregs->cr3; 4069 else 4070 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 4071 up_read(&vcpu->kvm->slots_lock); 4072 4073 kvm_set_cr8(vcpu, sregs->cr8); 4074 4075 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 4076 kvm_x86_ops->set_efer(vcpu, sregs->efer); 4077 kvm_set_apic_base(vcpu, sregs->apic_base); 4078 4079 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4080 4081 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 4082 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 4083 vcpu->arch.cr0 = sregs->cr0; 4084 4085 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4086 kvm_x86_ops->set_cr4(vcpu,
sregs->cr4); 4087 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4088 load_pdptrs(vcpu, vcpu->arch.cr3); 4089 4090 if (mmu_reset_needed) 4091 kvm_mmu_reset_context(vcpu); 4092 4093 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 4094 pending_vec = find_first_bit( 4095 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 4096 if (pending_vec < max_bits) { 4097 kvm_queue_interrupt(vcpu, pending_vec, false); 4098 pr_debug("Set back pending irq %d\n", pending_vec); 4099 if (irqchip_in_kernel(vcpu->kvm)) 4100 kvm_pic_clear_isr_ack(vcpu->kvm); 4101 } 4102 4103 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4104 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 4105 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 4106 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 4107 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 4108 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 4109 4110 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4111 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4112 4113 /* Older userspace won't unhalt the vcpu on reset. */ 4114 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 4115 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4116 !(vcpu->arch.cr0 & X86_CR0_PE)) 4117 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4118 4119 vcpu_put(vcpu); 4120 4121 return 0; 4122 } 4123 4124 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4125 struct kvm_guest_debug *dbg) 4126 { 4127 int i, r; 4128 4129 vcpu_load(vcpu); 4130 4131 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4132 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4133 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4134 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4135 vcpu->arch.switch_db_regs = 4136 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 4137 } else { 4138 for (i = 0; i < KVM_NR_DB_REGS; i++) 4139 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 4140 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4141 } 4142 4143 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4144 4145 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4146 kvm_queue_exception(vcpu, DB_VECTOR); 4147 else if (dbg->control & KVM_GUESTDBG_INJECT_BP) 4148 kvm_queue_exception(vcpu, BP_VECTOR); 4149 4150 vcpu_put(vcpu); 4151 4152 return r; 4153 } 4154 4155 /* 4156 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 4157 * we have asm/x86/processor.h 4158 */ 4159 struct fxsave { 4160 u16 cwd; 4161 u16 swd; 4162 u16 twd; 4163 u16 fop; 4164 u64 rip; 4165 u64 rdp; 4166 u32 mxcsr; 4167 u32 mxcsr_mask; 4168 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 4169 #ifdef CONFIG_X86_64 4170 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 4171 #else 4172 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 4173 #endif 4174 }; 4175 4176 /* 4177 * Translate a guest virtual address to a guest physical address. 
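 * The lookup goes through the vcpu's current mmu context
 * (vcpu->arch.mmu.gva_to_gpa), so it follows whatever paging mode the
 * guest is currently using.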
4178 */ 4179 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 4180 struct kvm_translation *tr) 4181 { 4182 unsigned long vaddr = tr->linear_address; 4183 gpa_t gpa; 4184 4185 vcpu_load(vcpu); 4186 down_read(&vcpu->kvm->slots_lock); 4187 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 4188 up_read(&vcpu->kvm->slots_lock); 4189 tr->physical_address = gpa; 4190 tr->valid = gpa != UNMAPPED_GVA; 4191 tr->writeable = 1; 4192 tr->usermode = 0; 4193 vcpu_put(vcpu); 4194 4195 return 0; 4196 } 4197 4198 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4199 { 4200 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4201 4202 vcpu_load(vcpu); 4203 4204 memcpy(fpu->fpr, fxsave->st_space, 128); 4205 fpu->fcw = fxsave->cwd; 4206 fpu->fsw = fxsave->swd; 4207 fpu->ftwx = fxsave->twd; 4208 fpu->last_opcode = fxsave->fop; 4209 fpu->last_ip = fxsave->rip; 4210 fpu->last_dp = fxsave->rdp; 4211 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 4212 4213 vcpu_put(vcpu); 4214 4215 return 0; 4216 } 4217 4218 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 4219 { 4220 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 4221 4222 vcpu_load(vcpu); 4223 4224 memcpy(fxsave->st_space, fpu->fpr, 128); 4225 fxsave->cwd = fpu->fcw; 4226 fxsave->swd = fpu->fsw; 4227 fxsave->twd = fpu->ftwx; 4228 fxsave->fop = fpu->last_opcode; 4229 fxsave->rip = fpu->last_ip; 4230 fxsave->rdp = fpu->last_dp; 4231 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 4232 4233 vcpu_put(vcpu); 4234 4235 return 0; 4236 } 4237 4238 void fx_init(struct kvm_vcpu *vcpu) 4239 { 4240 unsigned after_mxcsr_mask; 4241 4242 /* 4243 * Touch the fpu the first time in non atomic context as if 4244 * this is the first fpu instruction the exception handler 4245 * will fire before the instruction returns and it'll have to 4246 * allocate ram with GFP_KERNEL. 
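 * (used_math() reports whether the current task has already used the
 * FPU; if it has not, touching the FPU here triggers that first-use
 * setup in a context where sleeping allocations are allowed.)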
4247 */ 4248 if (!used_math()) 4249 kvm_fx_save(&vcpu->arch.host_fx_image); 4250 4251 /* Initialize guest FPU by resetting ours and saving into guest's */ 4252 preempt_disable(); 4253 kvm_fx_save(&vcpu->arch.host_fx_image); 4254 kvm_fx_finit(); 4255 kvm_fx_save(&vcpu->arch.guest_fx_image); 4256 kvm_fx_restore(&vcpu->arch.host_fx_image); 4257 preempt_enable(); 4258 4259 vcpu->arch.cr0 |= X86_CR0_ET; 4260 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 4261 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 4262 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 4263 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 4264 } 4265 EXPORT_SYMBOL_GPL(fx_init); 4266 4267 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 4268 { 4269 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 4270 return; 4271 4272 vcpu->guest_fpu_loaded = 1; 4273 kvm_fx_save(&vcpu->arch.host_fx_image); 4274 kvm_fx_restore(&vcpu->arch.guest_fx_image); 4275 } 4276 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 4277 4278 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 4279 { 4280 if (!vcpu->guest_fpu_loaded) 4281 return; 4282 4283 vcpu->guest_fpu_loaded = 0; 4284 kvm_fx_save(&vcpu->arch.guest_fx_image); 4285 kvm_fx_restore(&vcpu->arch.host_fx_image); 4286 ++vcpu->stat.fpu_reload; 4287 } 4288 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 4289 4290 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 4291 { 4292 if (vcpu->arch.time_page) { 4293 kvm_release_page_dirty(vcpu->arch.time_page); 4294 vcpu->arch.time_page = NULL; 4295 } 4296 4297 kvm_x86_ops->vcpu_free(vcpu); 4298 } 4299 4300 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 4301 unsigned int id) 4302 { 4303 return kvm_x86_ops->vcpu_create(kvm, id); 4304 } 4305 4306 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 4307 { 4308 int r; 4309 4310 /* We do fxsave: this must be aligned. 
*/ 4311 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 4312 4313 vcpu->arch.mtrr_state.have_fixed = 1; 4314 vcpu_load(vcpu); 4315 r = kvm_arch_vcpu_reset(vcpu); 4316 if (r == 0) 4317 r = kvm_mmu_setup(vcpu); 4318 vcpu_put(vcpu); 4319 if (r < 0) 4320 goto free_vcpu; 4321 4322 return 0; 4323 free_vcpu: 4324 kvm_x86_ops->vcpu_free(vcpu); 4325 return r; 4326 } 4327 4328 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 4329 { 4330 vcpu_load(vcpu); 4331 kvm_mmu_unload(vcpu); 4332 vcpu_put(vcpu); 4333 4334 kvm_x86_ops->vcpu_free(vcpu); 4335 } 4336 4337 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4338 { 4339 vcpu->arch.nmi_pending = false; 4340 vcpu->arch.nmi_injected = false; 4341 4342 vcpu->arch.switch_db_regs = 0; 4343 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 4344 vcpu->arch.dr6 = DR6_FIXED_1; 4345 vcpu->arch.dr7 = DR7_FIXED_1; 4346 4347 return kvm_x86_ops->vcpu_reset(vcpu); 4348 } 4349 4350 void kvm_arch_hardware_enable(void *garbage) 4351 { 4352 kvm_x86_ops->hardware_enable(garbage); 4353 } 4354 4355 void kvm_arch_hardware_disable(void *garbage) 4356 { 4357 kvm_x86_ops->hardware_disable(garbage); 4358 } 4359 4360 int kvm_arch_hardware_setup(void) 4361 { 4362 return kvm_x86_ops->hardware_setup(); 4363 } 4364 4365 void kvm_arch_hardware_unsetup(void) 4366 { 4367 kvm_x86_ops->hardware_unsetup(); 4368 } 4369 4370 void kvm_arch_check_processor_compat(void *rtn) 4371 { 4372 kvm_x86_ops->check_processor_compatibility(rtn); 4373 } 4374 4375 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 4376 { 4377 struct page *page; 4378 struct kvm *kvm; 4379 int r; 4380 4381 BUG_ON(vcpu->kvm == NULL); 4382 kvm = vcpu->kvm; 4383 4384 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4385 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 4386 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4387 else 4388 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 4389 4390 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 4391 if (!page) { 4392 r = -ENOMEM; 4393 goto fail; 4394 } 4395 vcpu->arch.pio_data = page_address(page); 4396 4397 r = kvm_mmu_create(vcpu); 4398 if (r < 0) 4399 goto fail_free_pio_data; 4400 4401 if (irqchip_in_kernel(kvm)) { 4402 r = kvm_create_lapic(vcpu); 4403 if (r < 0) 4404 goto fail_mmu_destroy; 4405 } 4406 4407 return 0; 4408 4409 fail_mmu_destroy: 4410 kvm_mmu_destroy(vcpu); 4411 fail_free_pio_data: 4412 free_page((unsigned long)vcpu->arch.pio_data); 4413 fail: 4414 return r; 4415 } 4416 4417 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 4418 { 4419 kvm_free_lapic(vcpu); 4420 down_read(&vcpu->kvm->slots_lock); 4421 kvm_mmu_destroy(vcpu); 4422 up_read(&vcpu->kvm->slots_lock); 4423 free_page((unsigned long)vcpu->arch.pio_data); 4424 } 4425 4426 struct kvm *kvm_arch_create_vm(void) 4427 { 4428 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 4429 4430 if (!kvm) 4431 return ERR_PTR(-ENOMEM); 4432 4433 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4434 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4435 4436 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4437 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 4438 4439 rdtscll(kvm->arch.vm_init_tsc); 4440 4441 return kvm; 4442 } 4443 4444 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 4445 { 4446 vcpu_load(vcpu); 4447 kvm_mmu_unload(vcpu); 4448 vcpu_put(vcpu); 4449 } 4450 4451 static void kvm_free_vcpus(struct kvm *kvm) 4452 { 4453 unsigned int i; 4454 4455 /* 4456 * Unpin any mmu pages first. 
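 * Every vcpu's mmu is unloaded before any vcpu is freed, so no shadow
 * page is still referenced as a root when the pages are torn down.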
4457 */ 4458 for (i = 0; i < KVM_MAX_VCPUS; ++i) 4459 if (kvm->vcpus[i]) 4460 kvm_unload_vcpu_mmu(kvm->vcpus[i]); 4461 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 4462 if (kvm->vcpus[i]) { 4463 kvm_arch_vcpu_free(kvm->vcpus[i]); 4464 kvm->vcpus[i] = NULL; 4465 } 4466 } 4467 4468 } 4469 4470 void kvm_arch_sync_events(struct kvm *kvm) 4471 { 4472 kvm_free_all_assigned_devices(kvm); 4473 } 4474 4475 void kvm_arch_destroy_vm(struct kvm *kvm) 4476 { 4477 kvm_iommu_unmap_guest(kvm); 4478 kvm_free_pit(kvm); 4479 kfree(kvm->arch.vpic); 4480 kfree(kvm->arch.vioapic); 4481 kvm_free_vcpus(kvm); 4482 kvm_free_physmem(kvm); 4483 if (kvm->arch.apic_access_page) 4484 put_page(kvm->arch.apic_access_page); 4485 if (kvm->arch.ept_identity_pagetable) 4486 put_page(kvm->arch.ept_identity_pagetable); 4487 kfree(kvm); 4488 } 4489 4490 int kvm_arch_set_memory_region(struct kvm *kvm, 4491 struct kvm_userspace_memory_region *mem, 4492 struct kvm_memory_slot old, 4493 int user_alloc) 4494 { 4495 int npages = mem->memory_size >> PAGE_SHIFT; 4496 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 4497 4498 /*To keep backward compatibility with older userspace, 4499 *x86 needs to hanlde !user_alloc case. 4500 */ 4501 if (!user_alloc) { 4502 if (npages && !old.rmap) { 4503 unsigned long userspace_addr; 4504 4505 down_write(¤t->mm->mmap_sem); 4506 userspace_addr = do_mmap(NULL, 0, 4507 npages * PAGE_SIZE, 4508 PROT_READ | PROT_WRITE, 4509 MAP_PRIVATE | MAP_ANONYMOUS, 4510 0); 4511 up_write(¤t->mm->mmap_sem); 4512 4513 if (IS_ERR((void *)userspace_addr)) 4514 return PTR_ERR((void *)userspace_addr); 4515 4516 /* set userspace_addr atomically for kvm_hva_to_rmapp */ 4517 spin_lock(&kvm->mmu_lock); 4518 memslot->userspace_addr = userspace_addr; 4519 spin_unlock(&kvm->mmu_lock); 4520 } else { 4521 if (!old.user_alloc && old.rmap) { 4522 int ret; 4523 4524 down_write(¤t->mm->mmap_sem); 4525 ret = do_munmap(current->mm, old.userspace_addr, 4526 old.npages * PAGE_SIZE); 4527 up_write(¤t->mm->mmap_sem); 4528 if (ret < 0) 4529 printk(KERN_WARNING 4530 "kvm_vm_ioctl_set_memory_region: " 4531 "failed to munmap memory\n"); 4532 } 4533 } 4534 } 4535 4536 spin_lock(&kvm->mmu_lock); 4537 if (!kvm->arch.n_requested_mmu_pages) { 4538 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 4539 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 4540 } 4541 4542 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4543 spin_unlock(&kvm->mmu_lock); 4544 kvm_flush_remote_tlbs(kvm); 4545 4546 return 0; 4547 } 4548 4549 void kvm_arch_flush_shadow(struct kvm *kvm) 4550 { 4551 kvm_mmu_zap_all(kvm); 4552 kvm_reload_remote_mmus(kvm); 4553 } 4554 4555 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4556 { 4557 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4558 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 4559 || vcpu->arch.nmi_pending; 4560 } 4561 4562 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4563 { 4564 int me; 4565 int cpu = vcpu->cpu; 4566 4567 if (waitqueue_active(&vcpu->wq)) { 4568 wake_up_interruptible(&vcpu->wq); 4569 ++vcpu->stat.halt_wakeup; 4570 } 4571 4572 me = get_cpu(); 4573 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 4574 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 4575 smp_send_reschedule(cpu); 4576 put_cpu(); 4577 } 4578 4579 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 4580 { 4581 return kvm_x86_ops->interrupt_allowed(vcpu); 4582 } 4583