1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * KVM paravirt_ops implementation 4 * 5 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright IBM Corporation, 2007 7 * Authors: Anthony Liguori <aliguori@us.ibm.com> 8 */ 9 10 #include <linux/context_tracking.h> 11 #include <linux/init.h> 12 #include <linux/kernel.h> 13 #include <linux/kvm_para.h> 14 #include <linux/cpu.h> 15 #include <linux/mm.h> 16 #include <linux/highmem.h> 17 #include <linux/hardirq.h> 18 #include <linux/notifier.h> 19 #include <linux/reboot.h> 20 #include <linux/hash.h> 21 #include <linux/sched.h> 22 #include <linux/slab.h> 23 #include <linux/kprobes.h> 24 #include <linux/nmi.h> 25 #include <linux/swait.h> 26 #include <asm/timer.h> 27 #include <asm/cpu.h> 28 #include <asm/traps.h> 29 #include <asm/desc.h> 30 #include <asm/tlbflush.h> 31 #include <asm/apic.h> 32 #include <asm/apicdef.h> 33 #include <asm/hypervisor.h> 34 #include <asm/tlb.h> 35 #include <asm/cpuidle_haltpoll.h> 36 37 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); 38 39 static int kvmapf = 1; 40 41 static int __init parse_no_kvmapf(char *arg) 42 { 43 kvmapf = 0; 44 return 0; 45 } 46 47 early_param("no-kvmapf", parse_no_kvmapf); 48 49 static int steal_acc = 1; 50 static int __init parse_no_stealacc(char *arg) 51 { 52 steal_acc = 0; 53 return 0; 54 } 55 56 early_param("no-steal-acc", parse_no_stealacc); 57 58 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 59 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; 60 static int has_steal_clock = 0; 61 62 /* 63 * No need for any "IO delay" on KVM 64 */ 65 static void kvm_io_delay(void) 66 { 67 } 68 69 #define KVM_TASK_SLEEP_HASHBITS 8 70 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) 71 72 struct kvm_task_sleep_node { 73 struct hlist_node link; 74 struct swait_queue_head wq; 75 u32 token; 76 int cpu; 77 }; 78 79 static struct kvm_task_sleep_head { 80 raw_spinlock_t lock; 81 struct hlist_head list; 82 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; 83 84 static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, 85 u32 token) 86 { 87 struct hlist_node *p; 88 89 hlist_for_each(p, &b->list) { 90 struct kvm_task_sleep_node *n = 91 hlist_entry(p, typeof(*n), link); 92 if (n->token == token) 93 return n; 94 } 95 96 return NULL; 97 } 98 99 static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) 100 { 101 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); 102 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 103 struct kvm_task_sleep_node *e; 104 105 raw_spin_lock(&b->lock); 106 e = _find_apf_task(b, token); 107 if (e) { 108 /* dummy entry exist -> wake up was delivered ahead of PF */ 109 hlist_del(&e->link); 110 raw_spin_unlock(&b->lock); 111 kfree(e); 112 return false; 113 } 114 115 n->token = token; 116 n->cpu = smp_processor_id(); 117 init_swait_queue_head(&n->wq); 118 hlist_add_head(&n->link, &b->list); 119 raw_spin_unlock(&b->lock); 120 return true; 121 } 122 123 /* 124 * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled 125 * @token: Token to identify the sleep node entry 126 * 127 * Invoked from the async pagefault handling code or from the VM exit page 128 * fault handler. In both cases RCU is watching. 129 */ 130 void kvm_async_pf_task_wait_schedule(u32 token) 131 { 132 struct kvm_task_sleep_node n; 133 DECLARE_SWAITQUEUE(wait); 134 135 lockdep_assert_irqs_disabled(); 136 137 if (!kvm_async_pf_queue_task(token, &n)) 138 return; 139 140 for (;;) { 141 prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); 142 if (hlist_unhashed(&n.link)) 143 break; 144 145 local_irq_enable(); 146 schedule(); 147 local_irq_disable(); 148 } 149 finish_swait(&n.wq, &wait); 150 } 151 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); 152 153 static void apf_task_wake_one(struct kvm_task_sleep_node *n) 154 { 155 hlist_del_init(&n->link); 156 if (swq_has_sleeper(&n->wq)) 157 swake_up_one(&n->wq); 158 } 159 160 static void apf_task_wake_all(void) 161 { 162 int i; 163 164 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { 165 struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; 166 struct kvm_task_sleep_node *n; 167 struct hlist_node *p, *next; 168 169 raw_spin_lock(&b->lock); 170 hlist_for_each_safe(p, next, &b->list) { 171 n = hlist_entry(p, typeof(*n), link); 172 if (n->cpu == smp_processor_id()) 173 apf_task_wake_one(n); 174 } 175 raw_spin_unlock(&b->lock); 176 } 177 } 178 179 void kvm_async_pf_task_wake(u32 token) 180 { 181 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); 182 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 183 struct kvm_task_sleep_node *n; 184 185 if (token == ~0) { 186 apf_task_wake_all(); 187 return; 188 } 189 190 again: 191 raw_spin_lock(&b->lock); 192 n = _find_apf_task(b, token); 193 if (!n) { 194 /* 195 * async PF was not yet handled. 196 * Add dummy entry for the token. 197 */ 198 n = kzalloc(sizeof(*n), GFP_ATOMIC); 199 if (!n) { 200 /* 201 * Allocation failed! Busy wait while other cpu 202 * handles async PF. 203 */ 204 raw_spin_unlock(&b->lock); 205 cpu_relax(); 206 goto again; 207 } 208 n->token = token; 209 n->cpu = smp_processor_id(); 210 init_swait_queue_head(&n->wq); 211 hlist_add_head(&n->link, &b->list); 212 } else { 213 apf_task_wake_one(n); 214 } 215 raw_spin_unlock(&b->lock); 216 return; 217 } 218 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); 219 220 noinstr u32 kvm_read_and_reset_apf_flags(void) 221 { 222 u32 flags = 0; 223 224 if (__this_cpu_read(apf_reason.enabled)) { 225 flags = __this_cpu_read(apf_reason.flags); 226 __this_cpu_write(apf_reason.flags, 0); 227 } 228 229 return flags; 230 } 231 EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); 232 233 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) 234 { 235 u32 reason = kvm_read_and_reset_apf_flags(); 236 bool rcu_exit; 237 238 switch (reason) { 239 case KVM_PV_REASON_PAGE_NOT_PRESENT: 240 case KVM_PV_REASON_PAGE_READY: 241 break; 242 default: 243 return false; 244 } 245 246 rcu_exit = idtentry_enter_cond_rcu(regs); 247 instrumentation_begin(); 248 249 /* 250 * If the host managed to inject an async #PF into an interrupt 251 * disabled region, then die hard as this is not going to end well 252 * and the host side is seriously broken. 253 */ 254 if (unlikely(!(regs->flags & X86_EFLAGS_IF))) 255 panic("Host injected async #PF in interrupt disabled region\n"); 256 257 if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { 258 if (unlikely(!(user_mode(regs)))) 259 panic("Host injected async #PF in kernel mode\n"); 260 /* Page is swapped out by the host. */ 261 kvm_async_pf_task_wait_schedule(token); 262 } else { 263 kvm_async_pf_task_wake(token); 264 } 265 266 instrumentation_end(); 267 idtentry_exit_cond_rcu(regs, rcu_exit); 268 return true; 269 } 270 271 static void __init paravirt_ops_setup(void) 272 { 273 pv_info.name = "KVM"; 274 275 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 276 pv_ops.cpu.io_delay = kvm_io_delay; 277 278 #ifdef CONFIG_X86_IO_APIC 279 no_timer_check = 1; 280 #endif 281 } 282 283 static void kvm_register_steal_time(void) 284 { 285 int cpu = smp_processor_id(); 286 struct kvm_steal_time *st = &per_cpu(steal_time, cpu); 287 288 if (!has_steal_clock) 289 return; 290 291 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); 292 pr_info("kvm-stealtime: cpu %d, msr %llx\n", 293 cpu, (unsigned long long) slow_virt_to_phys(st)); 294 } 295 296 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 297 298 static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) 299 { 300 /** 301 * This relies on __test_and_clear_bit to modify the memory 302 * in a way that is atomic with respect to the local CPU. 303 * The hypervisor only accesses this memory from the local CPU so 304 * there's no need for lock or memory barriers. 305 * An optimization barrier is implied in apic write. 306 */ 307 if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) 308 return; 309 apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); 310 } 311 312 static void kvm_guest_cpu_init(void) 313 { 314 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 315 u64 pa; 316 317 WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); 318 319 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); 320 pa |= KVM_ASYNC_PF_ENABLED; 321 322 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) 323 pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; 324 325 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); 326 __this_cpu_write(apf_reason.enabled, 1); 327 pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); 328 } 329 330 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { 331 unsigned long pa; 332 333 /* Size alignment is implied but just to make it explicit. */ 334 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 335 __this_cpu_write(kvm_apic_eoi, 0); 336 pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) 337 | KVM_MSR_ENABLED; 338 wrmsrl(MSR_KVM_PV_EOI_EN, pa); 339 } 340 341 if (has_steal_clock) 342 kvm_register_steal_time(); 343 } 344 345 static void kvm_pv_disable_apf(void) 346 { 347 if (!__this_cpu_read(apf_reason.enabled)) 348 return; 349 350 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); 351 __this_cpu_write(apf_reason.enabled, 0); 352 353 pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); 354 } 355 356 static void kvm_pv_guest_cpu_reboot(void *unused) 357 { 358 /* 359 * We disable PV EOI before we load a new kernel by kexec, 360 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. 361 * New kernel can re-enable when it boots. 362 */ 363 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 364 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 365 kvm_pv_disable_apf(); 366 kvm_disable_steal_time(); 367 } 368 369 static int kvm_pv_reboot_notify(struct notifier_block *nb, 370 unsigned long code, void *unused) 371 { 372 if (code == SYS_RESTART) 373 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); 374 return NOTIFY_DONE; 375 } 376 377 static struct notifier_block kvm_pv_reboot_nb = { 378 .notifier_call = kvm_pv_reboot_notify, 379 }; 380 381 static u64 kvm_steal_clock(int cpu) 382 { 383 u64 steal; 384 struct kvm_steal_time *src; 385 int version; 386 387 src = &per_cpu(steal_time, cpu); 388 do { 389 version = src->version; 390 virt_rmb(); 391 steal = src->steal; 392 virt_rmb(); 393 } while ((version & 1) || (version != src->version)); 394 395 return steal; 396 } 397 398 void kvm_disable_steal_time(void) 399 { 400 if (!has_steal_clock) 401 return; 402 403 wrmsr(MSR_KVM_STEAL_TIME, 0, 0); 404 } 405 406 static inline void __set_percpu_decrypted(void *ptr, unsigned long size) 407 { 408 early_set_memory_decrypted((unsigned long) ptr, size); 409 } 410 411 /* 412 * Iterate through all possible CPUs and map the memory region pointed 413 * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. 414 * 415 * Note: we iterate through all possible CPUs to ensure that CPUs 416 * hotplugged will have their per-cpu variable already mapped as 417 * decrypted. 418 */ 419 static void __init sev_map_percpu_data(void) 420 { 421 int cpu; 422 423 if (!sev_active()) 424 return; 425 426 for_each_possible_cpu(cpu) { 427 __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); 428 __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); 429 __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); 430 } 431 } 432 433 static bool pv_tlb_flush_supported(void) 434 { 435 return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && 436 !kvm_para_has_hint(KVM_HINTS_REALTIME) && 437 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); 438 } 439 440 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); 441 442 #ifdef CONFIG_SMP 443 444 static bool pv_ipi_supported(void) 445 { 446 return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); 447 } 448 449 static bool pv_sched_yield_supported(void) 450 { 451 return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && 452 !kvm_para_has_hint(KVM_HINTS_REALTIME) && 453 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); 454 } 455 456 #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) 457 458 static void __send_ipi_mask(const struct cpumask *mask, int vector) 459 { 460 unsigned long flags; 461 int cpu, apic_id, icr; 462 int min = 0, max = 0; 463 #ifdef CONFIG_X86_64 464 __uint128_t ipi_bitmap = 0; 465 #else 466 u64 ipi_bitmap = 0; 467 #endif 468 long ret; 469 470 if (cpumask_empty(mask)) 471 return; 472 473 local_irq_save(flags); 474 475 switch (vector) { 476 default: 477 icr = APIC_DM_FIXED | vector; 478 break; 479 case NMI_VECTOR: 480 icr = APIC_DM_NMI; 481 break; 482 } 483 484 for_each_cpu(cpu, mask) { 485 apic_id = per_cpu(x86_cpu_to_apicid, cpu); 486 if (!ipi_bitmap) { 487 min = max = apic_id; 488 } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { 489 ipi_bitmap <<= min - apic_id; 490 min = apic_id; 491 } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { 492 max = apic_id < max ? max : apic_id; 493 } else { 494 ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, 495 (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); 496 WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); 497 min = max = apic_id; 498 ipi_bitmap = 0; 499 } 500 __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); 501 } 502 503 if (ipi_bitmap) { 504 ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, 505 (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); 506 WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); 507 } 508 509 local_irq_restore(flags); 510 } 511 512 static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) 513 { 514 __send_ipi_mask(mask, vector); 515 } 516 517 static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) 518 { 519 unsigned int this_cpu = smp_processor_id(); 520 struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); 521 const struct cpumask *local_mask; 522 523 cpumask_copy(new_mask, mask); 524 cpumask_clear_cpu(this_cpu, new_mask); 525 local_mask = new_mask; 526 __send_ipi_mask(local_mask, vector); 527 } 528 529 /* 530 * Set the IPI entry points 531 */ 532 static void kvm_setup_pv_ipi(void) 533 { 534 apic->send_IPI_mask = kvm_send_ipi_mask; 535 apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; 536 pr_info("KVM setup pv IPIs\n"); 537 } 538 539 static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) 540 { 541 int cpu; 542 543 native_send_call_func_ipi(mask); 544 545 /* Make sure other vCPUs get a chance to run if they need to. */ 546 for_each_cpu(cpu, mask) { 547 if (vcpu_is_preempted(cpu)) { 548 kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); 549 break; 550 } 551 } 552 } 553 554 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) 555 { 556 native_smp_prepare_cpus(max_cpus); 557 if (kvm_para_has_hint(KVM_HINTS_REALTIME)) 558 static_branch_disable(&virt_spin_lock_key); 559 } 560 561 static void __init kvm_smp_prepare_boot_cpu(void) 562 { 563 /* 564 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() 565 * shares the guest physical address with the hypervisor. 566 */ 567 sev_map_percpu_data(); 568 569 kvm_guest_cpu_init(); 570 native_smp_prepare_boot_cpu(); 571 kvm_spinlock_init(); 572 } 573 574 static void kvm_guest_cpu_offline(void) 575 { 576 kvm_disable_steal_time(); 577 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 578 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 579 kvm_pv_disable_apf(); 580 apf_task_wake_all(); 581 } 582 583 static int kvm_cpu_online(unsigned int cpu) 584 { 585 local_irq_disable(); 586 kvm_guest_cpu_init(); 587 local_irq_enable(); 588 return 0; 589 } 590 591 static int kvm_cpu_down_prepare(unsigned int cpu) 592 { 593 local_irq_disable(); 594 kvm_guest_cpu_offline(); 595 local_irq_enable(); 596 return 0; 597 } 598 #endif 599 600 static void kvm_flush_tlb_others(const struct cpumask *cpumask, 601 const struct flush_tlb_info *info) 602 { 603 u8 state; 604 int cpu; 605 struct kvm_steal_time *src; 606 struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); 607 608 cpumask_copy(flushmask, cpumask); 609 /* 610 * We have to call flush only on online vCPUs. And 611 * queue flush_on_enter for pre-empted vCPUs 612 */ 613 for_each_cpu(cpu, flushmask) { 614 src = &per_cpu(steal_time, cpu); 615 state = READ_ONCE(src->preempted); 616 if ((state & KVM_VCPU_PREEMPTED)) { 617 if (try_cmpxchg(&src->preempted, &state, 618 state | KVM_VCPU_FLUSH_TLB)) 619 __cpumask_clear_cpu(cpu, flushmask); 620 } 621 } 622 623 native_flush_tlb_others(flushmask, info); 624 } 625 626 static void __init kvm_guest_init(void) 627 { 628 int i; 629 630 paravirt_ops_setup(); 631 register_reboot_notifier(&kvm_pv_reboot_nb); 632 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) 633 raw_spin_lock_init(&async_pf_sleepers[i].lock); 634 635 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { 636 has_steal_clock = 1; 637 pv_ops.time.steal_clock = kvm_steal_clock; 638 } 639 640 if (pv_tlb_flush_supported()) { 641 pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; 642 pv_ops.mmu.tlb_remove_table = tlb_remove_table; 643 pr_info("KVM setup pv remote TLB flush\n"); 644 } 645 646 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 647 apic_set_eoi_write(kvm_guest_apic_eoi_write); 648 649 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) 650 static_branch_enable(&kvm_async_pf_enabled); 651 652 #ifdef CONFIG_SMP 653 smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; 654 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 655 if (pv_sched_yield_supported()) { 656 smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; 657 pr_info("KVM setup pv sched yield\n"); 658 } 659 if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", 660 kvm_cpu_online, kvm_cpu_down_prepare) < 0) 661 pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); 662 #else 663 sev_map_percpu_data(); 664 kvm_guest_cpu_init(); 665 #endif 666 667 /* 668 * Hard lockup detection is enabled by default. Disable it, as guests 669 * can get false positives too easily, for example if the host is 670 * overcommitted. 671 */ 672 hardlockup_detector_disable(); 673 } 674 675 static noinline uint32_t __kvm_cpuid_base(void) 676 { 677 if (boot_cpu_data.cpuid_level < 0) 678 return 0; /* So we don't blow up on old processors */ 679 680 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) 681 return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); 682 683 return 0; 684 } 685 686 static inline uint32_t kvm_cpuid_base(void) 687 { 688 static int kvm_cpuid_base = -1; 689 690 if (kvm_cpuid_base == -1) 691 kvm_cpuid_base = __kvm_cpuid_base(); 692 693 return kvm_cpuid_base; 694 } 695 696 bool kvm_para_available(void) 697 { 698 return kvm_cpuid_base() != 0; 699 } 700 EXPORT_SYMBOL_GPL(kvm_para_available); 701 702 unsigned int kvm_arch_para_features(void) 703 { 704 return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); 705 } 706 707 unsigned int kvm_arch_para_hints(void) 708 { 709 return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); 710 } 711 EXPORT_SYMBOL_GPL(kvm_arch_para_hints); 712 713 static uint32_t __init kvm_detect(void) 714 { 715 return kvm_cpuid_base(); 716 } 717 718 static void __init kvm_apic_init(void) 719 { 720 #if defined(CONFIG_SMP) 721 if (pv_ipi_supported()) 722 kvm_setup_pv_ipi(); 723 #endif 724 } 725 726 static void __init kvm_init_platform(void) 727 { 728 kvmclock_init(); 729 x86_platform.apic_post_init = kvm_apic_init; 730 } 731 732 const __initconst struct hypervisor_x86 x86_hyper_kvm = { 733 .name = "KVM", 734 .detect = kvm_detect, 735 .type = X86_HYPER_KVM, 736 .init.guest_late_init = kvm_guest_init, 737 .init.x2apic_available = kvm_para_available, 738 .init.init_platform = kvm_init_platform, 739 }; 740 741 static __init int activate_jump_labels(void) 742 { 743 if (has_steal_clock) { 744 static_key_slow_inc(¶virt_steal_enabled); 745 if (steal_acc) 746 static_key_slow_inc(¶virt_steal_rq_enabled); 747 } 748 749 return 0; 750 } 751 arch_initcall(activate_jump_labels); 752 753 static __init int kvm_alloc_cpumask(void) 754 { 755 int cpu; 756 bool alloc = false; 757 758 if (!kvm_para_available() || nopv) 759 return 0; 760 761 if (pv_tlb_flush_supported()) 762 alloc = true; 763 764 #if defined(CONFIG_SMP) 765 if (pv_ipi_supported()) 766 alloc = true; 767 #endif 768 769 if (alloc) 770 for_each_possible_cpu(cpu) { 771 zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), 772 GFP_KERNEL, cpu_to_node(cpu)); 773 } 774 775 return 0; 776 } 777 arch_initcall(kvm_alloc_cpumask); 778 779 #ifdef CONFIG_PARAVIRT_SPINLOCKS 780 781 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ 782 static void kvm_kick_cpu(int cpu) 783 { 784 int apicid; 785 unsigned long flags = 0; 786 787 apicid = per_cpu(x86_cpu_to_apicid, cpu); 788 kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); 789 } 790 791 #include <asm/qspinlock.h> 792 793 static void kvm_wait(u8 *ptr, u8 val) 794 { 795 unsigned long flags; 796 797 if (in_nmi()) 798 return; 799 800 local_irq_save(flags); 801 802 if (READ_ONCE(*ptr) != val) 803 goto out; 804 805 /* 806 * halt until it's our turn and kicked. Note that we do safe halt 807 * for irq enabled case to avoid hang when lock info is overwritten 808 * in irq spinlock slowpath and no spurious interrupt occur to save us. 809 */ 810 if (arch_irqs_disabled_flags(flags)) 811 halt(); 812 else 813 safe_halt(); 814 815 out: 816 local_irq_restore(flags); 817 } 818 819 #ifdef CONFIG_X86_32 820 __visible bool __kvm_vcpu_is_preempted(long cpu) 821 { 822 struct kvm_steal_time *src = &per_cpu(steal_time, cpu); 823 824 return !!(src->preempted & KVM_VCPU_PREEMPTED); 825 } 826 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); 827 828 #else 829 830 #include <asm/asm-offsets.h> 831 832 extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); 833 834 /* 835 * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and 836 * restoring to/from the stack. 837 */ 838 asm( 839 ".pushsection .text;" 840 ".global __raw_callee_save___kvm_vcpu_is_preempted;" 841 ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" 842 "__raw_callee_save___kvm_vcpu_is_preempted:" 843 "movq __per_cpu_offset(,%rdi,8), %rax;" 844 "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" 845 "setne %al;" 846 "ret;" 847 ".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" 848 ".popsection"); 849 850 #endif 851 852 /* 853 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. 854 */ 855 void __init kvm_spinlock_init(void) 856 { 857 /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ 858 if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) 859 return; 860 861 if (kvm_para_has_hint(KVM_HINTS_REALTIME)) 862 return; 863 864 /* Don't use the pvqspinlock code if there is only 1 vCPU. */ 865 if (num_possible_cpus() == 1) 866 return; 867 868 __pv_init_lock_hash(); 869 pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; 870 pv_ops.lock.queued_spin_unlock = 871 PV_CALLEE_SAVE(__pv_queued_spin_unlock); 872 pv_ops.lock.wait = kvm_wait; 873 pv_ops.lock.kick = kvm_kick_cpu; 874 875 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { 876 pv_ops.lock.vcpu_is_preempted = 877 PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); 878 } 879 } 880 881 #endif /* CONFIG_PARAVIRT_SPINLOCKS */ 882 883 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL 884 885 static void kvm_disable_host_haltpoll(void *i) 886 { 887 wrmsrl(MSR_KVM_POLL_CONTROL, 0); 888 } 889 890 static void kvm_enable_host_haltpoll(void *i) 891 { 892 wrmsrl(MSR_KVM_POLL_CONTROL, 1); 893 } 894 895 void arch_haltpoll_enable(unsigned int cpu) 896 { 897 if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { 898 pr_err_once("kvm: host does not support poll control\n"); 899 pr_err_once("kvm: host upgrade recommended\n"); 900 return; 901 } 902 903 /* Enable guest halt poll disables host halt poll */ 904 smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); 905 } 906 EXPORT_SYMBOL_GPL(arch_haltpoll_enable); 907 908 void arch_haltpoll_disable(unsigned int cpu) 909 { 910 if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) 911 return; 912 913 /* Enable guest halt poll disables host halt poll */ 914 smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); 915 } 916 EXPORT_SYMBOL_GPL(arch_haltpoll_disable); 917 #endif 918