xref: /openbmc/linux/arch/x86/kernel/kvm.c (revision 8365a898)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * KVM paravirt_ops implementation
4  *
5  * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6  * Copyright IBM Corporation, 2007
7  *   Authors: Anthony Liguori <aliguori@us.ibm.com>
8  */
9 
10 #include <linux/context_tracking.h>
11 #include <linux/init.h>
12 #include <linux/kernel.h>
13 #include <linux/kvm_para.h>
14 #include <linux/cpu.h>
15 #include <linux/mm.h>
16 #include <linux/highmem.h>
17 #include <linux/hardirq.h>
18 #include <linux/notifier.h>
19 #include <linux/reboot.h>
20 #include <linux/hash.h>
21 #include <linux/sched.h>
22 #include <linux/slab.h>
23 #include <linux/kprobes.h>
24 #include <linux/nmi.h>
25 #include <linux/swait.h>
26 #include <asm/timer.h>
27 #include <asm/cpu.h>
28 #include <asm/traps.h>
29 #include <asm/desc.h>
30 #include <asm/tlbflush.h>
31 #include <asm/apic.h>
32 #include <asm/apicdef.h>
33 #include <asm/hypervisor.h>
34 #include <asm/tlb.h>
35 #include <asm/cpuidle_haltpoll.h>
36 
37 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
38 
39 static int kvmapf = 1;
40 
41 static int __init parse_no_kvmapf(char *arg)
42 {
43         kvmapf = 0;
44         return 0;
45 }
46 
47 early_param("no-kvmapf", parse_no_kvmapf);
48 
49 static int steal_acc = 1;
50 static int __init parse_no_stealacc(char *arg)
51 {
52         steal_acc = 0;
53         return 0;
54 }
55 
56 early_param("no-steal-acc", parse_no_stealacc);
57 
58 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
59 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
60 static int has_steal_clock = 0;
61 
62 /*
63  * No need for any "IO delay" on KVM
64  */
65 static void kvm_io_delay(void)
66 {
67 }
68 
69 #define KVM_TASK_SLEEP_HASHBITS 8
70 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
71 
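/*
 * Tasks waiting for an async #PF to be resolved are parked on a small hash
 * table of swait queues, keyed by the token the host passed in.  Each bucket
 * is protected by a raw spinlock so it can be taken from the page fault path
 * with interrupts disabled.
 */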
72 struct kvm_task_sleep_node {
73 	struct hlist_node link;
74 	struct swait_queue_head wq;
75 	u32 token;
76 	int cpu;
77 };
78 
79 static struct kvm_task_sleep_head {
80 	raw_spinlock_t lock;
81 	struct hlist_head list;
82 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
83 
84 static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
85 						  u32 token)
86 {
87 	struct hlist_node *p;
88 
89 	hlist_for_each(p, &b->list) {
90 		struct kvm_task_sleep_node *n =
91 			hlist_entry(p, typeof(*n), link);
92 		if (n->token == token)
93 			return n;
94 	}
95 
96 	return NULL;
97 }
98 
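/*
 * Queue the current task on the sleeper hash for @token.  Returns false if
 * the "page ready" wakeup already arrived (a dummy entry is found); in that
 * case the dummy entry is freed and no sleep is required.
 */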
99 static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
100 {
101 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
102 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
103 	struct kvm_task_sleep_node *e;
104 
105 	raw_spin_lock(&b->lock);
106 	e = _find_apf_task(b, token);
107 	if (e) {
108 		/* A dummy entry exists: the wakeup was delivered ahead of the #PF. */
109 		hlist_del(&e->link);
110 		raw_spin_unlock(&b->lock);
111 		kfree(e);
112 		return false;
113 	}
114 
115 	n->token = token;
116 	n->cpu = smp_processor_id();
117 	init_swait_queue_head(&n->wq);
118 	hlist_add_head(&n->link, &b->list);
119 	raw_spin_unlock(&b->lock);
120 	return true;
121 }
122 
123 /*
124  * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
125  * @token:	Token to identify the sleep node entry
126  *
127  * Invoked from the async pagefault handling code or from the VM exit page
128  * fault handler. In both cases RCU is watching.
129  */
130 void kvm_async_pf_task_wait_schedule(u32 token)
131 {
132 	struct kvm_task_sleep_node n;
133 	DECLARE_SWAITQUEUE(wait);
134 
135 	lockdep_assert_irqs_disabled();
136 
137 	if (!kvm_async_pf_queue_task(token, &n))
138 		return;
139 
140 	for (;;) {
141 		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
142 		if (hlist_unhashed(&n.link))
143 			break;
144 
145 		local_irq_enable();
146 		schedule();
147 		local_irq_disable();
148 	}
149 	finish_swait(&n.wq, &wait);
150 }
151 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
152 
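/*
 * Remove a sleeper from its bucket and wake it if it is already sleeping.
 * The caller must hold the bucket lock.
 */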
153 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
154 {
155 	hlist_del_init(&n->link);
156 	if (swq_has_sleeper(&n->wq))
157 		swake_up_one(&n->wq);
158 }
159 
160 static void apf_task_wake_all(void)
161 {
162 	int i;
163 
164 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
165 		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
166 		struct kvm_task_sleep_node *n;
167 		struct hlist_node *p, *next;
168 
169 		raw_spin_lock(&b->lock);
170 		hlist_for_each_safe(p, next, &b->list) {
171 			n = hlist_entry(p, typeof(*n), link);
172 			if (n->cpu == smp_processor_id())
173 				apf_task_wake_one(n);
174 		}
175 		raw_spin_unlock(&b->lock);
176 	}
177 }
178 
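/*
 * Handle a "page ready" notification from the host.  A token of ~0 means
 * "wake all sleepers on this CPU".  If the corresponding task has not gone
 * to sleep yet, insert a dummy entry so that kvm_async_pf_queue_task() can
 * detect the already-delivered wakeup and skip the sleep entirely.
 */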
179 void kvm_async_pf_task_wake(u32 token)
180 {
181 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
182 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
183 	struct kvm_task_sleep_node *n;
184 
185 	if (token == ~0) {
186 		apf_task_wake_all();
187 		return;
188 	}
189 
190 again:
191 	raw_spin_lock(&b->lock);
192 	n = _find_apf_task(b, token);
193 	if (!n) {
194 		/*
195 		 * The async #PF has not been handled yet;
196 		 * add a dummy entry for the token.
197 		 */
198 		n = kzalloc(sizeof(*n), GFP_ATOMIC);
199 		if (!n) {
200 			/*
201 			 * Allocation failed! Busy-wait while another CPU
202 			 * handles the async #PF.
203 			 */
204 			raw_spin_unlock(&b->lock);
205 			cpu_relax();
206 			goto again;
207 		}
208 		n->token = token;
209 		n->cpu = smp_processor_id();
210 		init_swait_queue_head(&n->wq);
211 		hlist_add_head(&n->link, &b->list);
212 	} else {
213 		apf_task_wake_one(n);
214 	}
215 	raw_spin_unlock(&b->lock);
216 	return;
217 }
218 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
219 
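/*
 * Read and clear the per-CPU async #PF flags that the host wrote into the
 * shared apf_reason area.  Returns 0 if async #PF is not enabled on this CPU.
 */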
220 noinstr u32 kvm_read_and_reset_apf_flags(void)
221 {
222 	u32 flags = 0;
223 
224 	if (__this_cpu_read(apf_reason.enabled)) {
225 		flags = __this_cpu_read(apf_reason.flags);
226 		__this_cpu_write(apf_reason.flags, 0);
227 	}
228 
229 	return flags;
230 }
231 EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
232 
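/*
 * Entry point from the page fault handler.  Returns true if the fault was an
 * async #PF notification from the host and has been handled here, false if
 * it is a regular page fault that must go through the normal fault path.
 */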
233 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
234 {
235 	u32 reason = kvm_read_and_reset_apf_flags();
236 	bool rcu_exit;
237 
238 	switch (reason) {
239 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
240 	case KVM_PV_REASON_PAGE_READY:
241 		break;
242 	default:
243 		return false;
244 	}
245 
246 	rcu_exit = idtentry_enter_cond_rcu(regs);
247 	instrumentation_begin();
248 
249 	/*
250 	 * If the host managed to inject an async #PF into an interrupt
251 	 * disabled region, then die hard as this is not going to end well
252 	 * and the host side is seriously broken.
253 	 */
254 	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
255 		panic("Host injected async #PF in interrupt disabled region\n");
256 
257 	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
258 		if (unlikely(!(user_mode(regs))))
259 			panic("Host injected async #PF in kernel mode\n");
260 		/* Page is swapped out by the host. */
261 		kvm_async_pf_task_wait_schedule(token);
262 	} else {
263 		kvm_async_pf_task_wake(token);
264 	}
265 
266 	instrumentation_end();
267 	idtentry_exit_cond_rcu(regs, rcu_exit);
268 	return true;
269 }
270 
271 static void __init paravirt_ops_setup(void)
272 {
273 	pv_info.name = "KVM";
274 
275 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
276 		pv_ops.cpu.io_delay = kvm_io_delay;
277 
278 #ifdef CONFIG_X86_IO_APIC
279 	no_timer_check = 1;
280 #endif
281 }
282 
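/*
 * Tell the host where this CPU's steal time structure lives by writing its
 * physical address (plus the enable bit) to MSR_KVM_STEAL_TIME.
 */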
283 static void kvm_register_steal_time(void)
284 {
285 	int cpu = smp_processor_id();
286 	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
287 
288 	if (!has_steal_clock)
289 		return;
290 
291 	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
292 	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
293 		cpu, (unsigned long long) slow_virt_to_phys(st));
294 }
295 
296 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
297 
298 static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
299 {
300 	/*
301 	 * This relies on __test_and_clear_bit() to modify the memory
302 	 * in a way that is atomic with respect to the local CPU.
303 	 * The hypervisor only accesses this memory from the local CPU, so
304 	 * there is no need for locks or memory barriers.
305 	 * An optimization barrier is implied by the APIC register write.
306 	 */
307 	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
308 		return;
309 	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
310 }
311 
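/*
 * Per-CPU guest initialization: register the async #PF shared area, the
 * PV EOI flag and the steal time structure with the host via their MSRs.
 */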
312 static void kvm_guest_cpu_init(void)
313 {
314 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
315 		u64 pa;
316 
317 		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
318 
319 		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
320 		pa |= KVM_ASYNC_PF_ENABLED;
321 
322 		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
323 			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
324 
325 		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
326 		__this_cpu_write(apf_reason.enabled, 1);
327 		pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
328 	}
329 
330 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
331 		unsigned long pa;
332 
333 		/* Size alignment is implied, but make it explicit anyway. */
334 		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
335 		__this_cpu_write(kvm_apic_eoi, 0);
336 		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
337 			| KVM_MSR_ENABLED;
338 		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
339 	}
340 
341 	if (has_steal_clock)
342 		kvm_register_steal_time();
343 }
344 
345 static void kvm_pv_disable_apf(void)
346 {
347 	if (!__this_cpu_read(apf_reason.enabled))
348 		return;
349 
350 	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
351 	__this_cpu_write(apf_reason.enabled, 0);
352 
353 	pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
354 }
355 
356 static void kvm_pv_guest_cpu_reboot(void *unused)
357 {
358 	/*
359 	 * Disable PV EOI before loading a new kernel via kexec, since
360 	 * MSR_KVM_PV_EOI_EN stores a pointer into the old kernel's memory.
361 	 * The new kernel can re-enable it when it boots.
362 	 */
363 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
364 		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
365 	kvm_pv_disable_apf();
366 	kvm_disable_steal_time();
367 }
368 
369 static int kvm_pv_reboot_notify(struct notifier_block *nb,
370 				unsigned long code, void *unused)
371 {
372 	if (code == SYS_RESTART)
373 		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
374 	return NOTIFY_DONE;
375 }
376 
377 static struct notifier_block kvm_pv_reboot_nb = {
378 	.notifier_call = kvm_pv_reboot_notify,
379 };
380 
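/*
 * Read the steal time published by the host using a seqcount-like protocol:
 * an odd version means an update is in progress, and a changed version means
 * the value was updated while being read; retry in either case.
 */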
381 static u64 kvm_steal_clock(int cpu)
382 {
383 	u64 steal;
384 	struct kvm_steal_time *src;
385 	int version;
386 
387 	src = &per_cpu(steal_time, cpu);
388 	do {
389 		version = src->version;
390 		virt_rmb();
391 		steal = src->steal;
392 		virt_rmb();
393 	} while ((version & 1) || (version != src->version));
394 
395 	return steal;
396 }
397 
398 void kvm_disable_steal_time(void)
399 {
400 	if (!has_steal_clock)
401 		return;
402 
403 	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
404 }
405 
406 static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
407 {
408 	early_set_memory_decrypted((unsigned long) ptr, size);
409 }
410 
411 /*
412  * Iterate through all possible CPUs and map the memory regions pointed
413  * to by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
414  *
415  * Note: we iterate through all possible CPUs to ensure that CPUs
416  * hotplugged later will already have their per-CPU variables mapped as
417  * decrypted.
418  */
419 static void __init sev_map_percpu_data(void)
420 {
421 	int cpu;
422 
423 	if (!sev_active())
424 		return;
425 
426 	for_each_possible_cpu(cpu) {
427 		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
428 		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
429 		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
430 	}
431 }
432 
433 static bool pv_tlb_flush_supported(void)
434 {
435 	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
436 		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
437 		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
438 }
439 
440 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
441 
442 #ifdef CONFIG_SMP
443 
444 static bool pv_ipi_supported(void)
445 {
446 	return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
447 }
448 
449 static bool pv_sched_yield_supported(void)
450 {
451 	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
452 		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
453 		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
454 }
455 
456 #define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)
457 
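/*
 * Send an IPI to every CPU in @mask with as few hypercalls as possible.
 * Destination APIC IDs are collected into a bitmap whose bit 0 corresponds
 * to the lowest APIC ID seen so far (min); whenever the next APIC ID does
 * not fit into the KVM_IPI_CLUSTER_SIZE wide window, the accumulated bitmap
 * is flushed with a KVM_HC_SEND_IPI hypercall and a new window is started.
 * For example, on a 64-bit kernel APIC IDs 3, 5 and 200 take two hypercalls,
 * because 200 does not fit in the 128-bit window that starts at 3.
 */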
458 static void __send_ipi_mask(const struct cpumask *mask, int vector)
459 {
460 	unsigned long flags;
461 	int cpu, apic_id, icr;
462 	int min = 0, max = 0;
463 #ifdef CONFIG_X86_64
464 	__uint128_t ipi_bitmap = 0;
465 #else
466 	u64 ipi_bitmap = 0;
467 #endif
468 	long ret;
469 
470 	if (cpumask_empty(mask))
471 		return;
472 
473 	local_irq_save(flags);
474 
475 	switch (vector) {
476 	default:
477 		icr = APIC_DM_FIXED | vector;
478 		break;
479 	case NMI_VECTOR:
480 		icr = APIC_DM_NMI;
481 		break;
482 	}
483 
484 	for_each_cpu(cpu, mask) {
485 		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
486 		if (!ipi_bitmap) {
487 			min = max = apic_id;
488 		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
489 			ipi_bitmap <<= min - apic_id;
490 			min = apic_id;
491 		} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
492 			max = apic_id < max ? max : apic_id;
493 		} else {
494 			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
495 				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
496 			WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
497 			min = max = apic_id;
498 			ipi_bitmap = 0;
499 		}
500 		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
501 	}
502 
503 	if (ipi_bitmap) {
504 		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
505 			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
506 		WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
507 	}
508 
509 	local_irq_restore(flags);
510 }
511 
512 static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
513 {
514 	__send_ipi_mask(mask, vector);
515 }
516 
517 static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
518 {
519 	unsigned int this_cpu = smp_processor_id();
520 	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
521 	const struct cpumask *local_mask;
522 
523 	cpumask_copy(new_mask, mask);
524 	cpumask_clear_cpu(this_cpu, new_mask);
525 	local_mask = new_mask;
526 	__send_ipi_mask(local_mask, vector);
527 }
528 
529 /*
530  * Set the IPI entry points
531  */
532 static void kvm_setup_pv_ipi(void)
533 {
534 	apic->send_IPI_mask = kvm_send_ipi_mask;
535 	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
536 	pr_info("KVM setup pv IPIs\n");
537 }
538 
539 static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
540 {
541 	int cpu;
542 
543 	native_send_call_func_ipi(mask);
544 
545 	/* Make sure other vCPUs get a chance to run if they need to. */
546 	for_each_cpu(cpu, mask) {
547 		if (vcpu_is_preempted(cpu)) {
548 			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
549 			break;
550 		}
551 	}
552 }
553 
554 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
555 {
556 	native_smp_prepare_cpus(max_cpus);
557 	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
558 		static_branch_disable(&virt_spin_lock_key);
559 }
560 
561 static void __init kvm_smp_prepare_boot_cpu(void)
562 {
563 	/*
564 	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
565 	 * shares the guest physical address with the hypervisor.
566 	 */
567 	sev_map_percpu_data();
568 
569 	kvm_guest_cpu_init();
570 	native_smp_prepare_boot_cpu();
571 	kvm_spinlock_init();
572 }
573 
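/*
 * Undo kvm_guest_cpu_init() when a CPU goes offline: unregister steal time,
 * PV EOI and async #PF, and wake any tasks still waiting for an async #PF on
 * this CPU, since no further wakeups will be delivered once async #PF is off.
 */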
574 static void kvm_guest_cpu_offline(void)
575 {
576 	kvm_disable_steal_time();
577 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
578 		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
579 	kvm_pv_disable_apf();
580 	apf_task_wake_all();
581 }
582 
583 static int kvm_cpu_online(unsigned int cpu)
584 {
585 	local_irq_disable();
586 	kvm_guest_cpu_init();
587 	local_irq_enable();
588 	return 0;
589 }
590 
591 static int kvm_cpu_down_prepare(unsigned int cpu)
592 {
593 	local_irq_disable();
594 	kvm_guest_cpu_offline();
595 	local_irq_enable();
596 	return 0;
597 }
598 #endif
599 
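/*
 * PV remote TLB flush: instead of sending a flush IPI to a vCPU that is
 * currently preempted by the host, set KVM_VCPU_FLUSH_TLB in its steal time
 * record and let the host flush that vCPU's TLB when it is scheduled back in.
 */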
600 static void kvm_flush_tlb_others(const struct cpumask *cpumask,
601 			const struct flush_tlb_info *info)
602 {
603 	u8 state;
604 	int cpu;
605 	struct kvm_steal_time *src;
606 	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
607 
608 	cpumask_copy(flushmask, cpumask);
609 	/*
610 	 * We only need to flush vCPUs that are actually running; for
611 	 * preempted vCPUs, queue a flush for VM-entry instead.
612 	 */
613 	for_each_cpu(cpu, flushmask) {
614 		src = &per_cpu(steal_time, cpu);
615 		state = READ_ONCE(src->preempted);
616 		if ((state & KVM_VCPU_PREEMPTED)) {
617 			if (try_cmpxchg(&src->preempted, &state,
618 					state | KVM_VCPU_FLUSH_TLB))
619 				__cpumask_clear_cpu(cpu, flushmask);
620 		}
621 	}
622 
623 	native_flush_tlb_others(flushmask, info);
624 }
625 
626 static void __init kvm_guest_init(void)
627 {
628 	int i;
629 
630 	paravirt_ops_setup();
631 	register_reboot_notifier(&kvm_pv_reboot_nb);
632 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
633 		raw_spin_lock_init(&async_pf_sleepers[i].lock);
634 
635 	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
636 		has_steal_clock = 1;
637 		pv_ops.time.steal_clock = kvm_steal_clock;
638 	}
639 
640 	if (pv_tlb_flush_supported()) {
641 		pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
642 		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
643 		pr_info("KVM setup pv remote TLB flush\n");
644 	}
645 
646 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
647 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
648 
649 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
650 		static_branch_enable(&kvm_async_pf_enabled);
651 
652 #ifdef CONFIG_SMP
653 	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
654 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
655 	if (pv_sched_yield_supported()) {
656 		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
657 		pr_info("KVM setup pv sched yield\n");
658 	}
659 	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
660 				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
661 		pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
662 #else
663 	sev_map_percpu_data();
664 	kvm_guest_cpu_init();
665 #endif
666 
667 	/*
668 	 * Hard lockup detection is enabled by default. Disable it, as guests
669 	 * can get false positives too easily, for example if the host is
670 	 * overcommitted.
671 	 */
672 	hardlockup_detector_disable();
673 }
674 
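/*
 * Locate the KVM CPUID leaf by scanning the hypervisor CPUID range for the
 * "KVMKVMKVM" signature.  Returns 0 on ancient CPUs without CPUID support or
 * when no hypervisor is advertised.
 */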
675 static noinline uint32_t __kvm_cpuid_base(void)
676 {
677 	if (boot_cpu_data.cpuid_level < 0)
678 		return 0;	/* So we don't blow up on old processors */
679 
680 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
681 		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
682 
683 	return 0;
684 }
685 
686 static inline uint32_t kvm_cpuid_base(void)
687 {
688 	static int kvm_cpuid_base = -1;
689 
690 	if (kvm_cpuid_base == -1)
691 		kvm_cpuid_base = __kvm_cpuid_base();
692 
693 	return kvm_cpuid_base;
694 }
695 
696 bool kvm_para_available(void)
697 {
698 	return kvm_cpuid_base() != 0;
699 }
700 EXPORT_SYMBOL_GPL(kvm_para_available);
701 
702 unsigned int kvm_arch_para_features(void)
703 {
704 	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
705 }
706 
707 unsigned int kvm_arch_para_hints(void)
708 {
709 	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
710 }
711 EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
712 
713 static uint32_t __init kvm_detect(void)
714 {
715 	return kvm_cpuid_base();
716 }
717 
718 static void __init kvm_apic_init(void)
719 {
720 #if defined(CONFIG_SMP)
721 	if (pv_ipi_supported())
722 		kvm_setup_pv_ipi();
723 #endif
724 }
725 
726 static void __init kvm_init_platform(void)
727 {
728 	kvmclock_init();
729 	x86_platform.apic_post_init = kvm_apic_init;
730 }
731 
732 const __initconst struct hypervisor_x86 x86_hyper_kvm = {
733 	.name			= "KVM",
734 	.detect			= kvm_detect,
735 	.type			= X86_HYPER_KVM,
736 	.init.guest_late_init	= kvm_guest_init,
737 	.init.x2apic_available	= kvm_para_available,
738 	.init.init_platform	= kvm_init_platform,
739 };
740 
741 static __init int activate_jump_labels(void)
742 {
743 	if (has_steal_clock) {
744 		static_key_slow_inc(&paravirt_steal_enabled);
745 		if (steal_acc)
746 			static_key_slow_inc(&paravirt_steal_rq_enabled);
747 	}
748 
749 	return 0;
750 }
751 arch_initcall(activate_jump_labels);
752 
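/*
 * Allocate the per-CPU scratch cpumask used by the PV TLB flush and PV IPI
 * paths for every possible CPU.
 */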
753 static __init int kvm_alloc_cpumask(void)
754 {
755 	int cpu;
756 	bool alloc = false;
757 
758 	if (!kvm_para_available() || nopv)
759 		return 0;
760 
761 	if (pv_tlb_flush_supported())
762 		alloc = true;
763 
764 #if defined(CONFIG_SMP)
765 	if (pv_ipi_supported())
766 		alloc = true;
767 #endif
768 
769 	if (alloc)
770 		for_each_possible_cpu(cpu) {
771 			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
772 				GFP_KERNEL, cpu_to_node(cpu));
773 		}
774 
775 	return 0;
776 }
777 arch_initcall(kvm_alloc_cpumask);
778 
779 #ifdef CONFIG_PARAVIRT_SPINLOCKS
780 
781 /* Kick a CPU by its APIC ID. Used to wake up a halted vCPU. */
782 static void kvm_kick_cpu(int cpu)
783 {
784 	int apicid;
785 	unsigned long flags = 0;
786 
787 	apicid = per_cpu(x86_cpu_to_apicid, cpu);
788 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
789 }
790 
791 #include <asm/qspinlock.h>
792 
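/*
 * PV spinlock wait: if *ptr still equals val, halt until we are kicked via
 * KVM_HC_KICK_CPU or an interrupt arrives.  Bails out in NMI context, where
 * halting is not safe.
 */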
793 static void kvm_wait(u8 *ptr, u8 val)
794 {
795 	unsigned long flags;
796 
797 	if (in_nmi())
798 		return;
799 
800 	local_irq_save(flags);
801 
802 	if (READ_ONCE(*ptr) != val)
803 		goto out;
804 
805 	/*
806 	 * Halt until it's our turn and we get kicked. We use a safe halt in the
807 	 * irqs-enabled case to avoid hanging when the lock word is overwritten in
808 	 * the irq spinlock slowpath and no spurious interrupt occurs to save us.
809 	 */
810 	if (arch_irqs_disabled_flags(flags))
811 		halt();
812 	else
813 		safe_halt();
814 
815 out:
816 	local_irq_restore(flags);
817 }
818 
819 #ifdef CONFIG_X86_32
820 __visible bool __kvm_vcpu_is_preempted(long cpu)
821 {
822 	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
823 
824 	return !!(src->preempted & KVM_VCPU_PREEMPTED);
825 }
826 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
827 
828 #else
829 
830 #include <asm/asm-offsets.h>
831 
832 extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
833 
834 /*
835  * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
836  * restoring to/from the stack.
837  */
838 asm(
839 ".pushsection .text;"
840 ".global __raw_callee_save___kvm_vcpu_is_preempted;"
841 ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
842 "__raw_callee_save___kvm_vcpu_is_preempted:"
843 "movq	__per_cpu_offset(,%rdi,8), %rax;"
844 "cmpb	$0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
845 "setne	%al;"
846 "ret;"
847 ".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
848 ".popsection");
849 
850 #endif
851 
852 /*
853  * Set up pv_ops.lock to exploit KVM_FEATURE_PV_UNHALT if present.
854  */
855 void __init kvm_spinlock_init(void)
856 {
857 	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
858 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
859 		return;
860 
861 	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
862 		return;
863 
864 	/* Don't use the pvqspinlock code if there is only 1 vCPU. */
865 	if (num_possible_cpus() == 1)
866 		return;
867 
868 	__pv_init_lock_hash();
869 	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
870 	pv_ops.lock.queued_spin_unlock =
871 		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
872 	pv_ops.lock.wait = kvm_wait;
873 	pv_ops.lock.kick = kvm_kick_cpu;
874 
875 	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
876 		pv_ops.lock.vcpu_is_preempted =
877 			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
878 	}
879 }
880 
881 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
882 
883 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
884 
885 static void kvm_disable_host_haltpoll(void *i)
886 {
887 	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
888 }
889 
890 static void kvm_enable_host_haltpoll(void *i)
891 {
892 	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
893 }
894 
895 void arch_haltpoll_enable(unsigned int cpu)
896 {
897 	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
898 		pr_err_once("kvm: host does not support poll control\n");
899 		pr_err_once("kvm: host upgrade recommended\n");
900 		return;
901 	}
902 
903 	/* Enabling guest halt polling disables host halt polling. */
904 	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
905 }
906 EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
907 
908 void arch_haltpoll_disable(unsigned int cpu)
909 {
910 	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
911 		return;
912 
913 	/* Disabling guest halt polling re-enables host halt polling. */
914 	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
915 }
916 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
917 #endif
918