1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Architecture neutral utility routines for interacting with
5 * Hyper-V. This file is specifically for code that must be
6 * built-in to the kernel image when CONFIG_HYPERV is set
7 * (vs. being in a module) because it is called from architecture
8 * specific code under arch/.
9 *
10 * Copyright (C) 2021, Microsoft, Inc.
11 *
12 * Author : Michael Kelley <mikelley@microsoft.com>
13 */
14
15 #include <linux/types.h>
16 #include <linux/acpi.h>
17 #include <linux/export.h>
18 #include <linux/bitfield.h>
19 #include <linux/cpumask.h>
20 #include <linux/sched/task_stack.h>
21 #include <linux/panic_notifier.h>
22 #include <linux/ptrace.h>
23 #include <linux/kdebug.h>
24 #include <linux/kmsg_dump.h>
25 #include <linux/slab.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/set_memory.h>
28 #include <asm/hyperv-tlfs.h>
29 #include <asm/mshyperv.h>
30
31 /*
32 * hv_root_partition, ms_hyperv and hv_nested are defined here with other
33 * Hyper-V specific globals so they are shared across all architectures and are
34 * built only when CONFIG_HYPERV is defined. But on x86,
35 * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
36 * defined, and it uses these three variables. So mark them as __weak
37 * here, allowing for an overriding definition in the module containing
38 * ms_hyperv_init_platform().
39 */
bool __weak hv_root_partition;
EXPORT_SYMBOL_GPL(hv_root_partition);

bool __weak hv_nested;
EXPORT_SYMBOL_GPL(hv_nested);

struct ms_hyperv_info __weak ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);

/*
 * Array, indexed by Linux CPU number, holding the Hyper-V VP index of
 * each CPU. Allocated in hv_common_init() with every entry preset to
 * VP_INVAL, then filled in by hv_common_cpu_init() from
 * HV_REGISTER_VP_INDEX.
 */
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);

/* Largest VP index recorded so far by hv_common_cpu_init(). */
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);

/*
 * Per-CPU pointer to the memory used as the hypercall input argument.
 * The per-CPU slot is allocated in hv_common_init(); the memory it
 * points to is allocated in hv_common_cpu_init().
 */
void * __percpu *hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);

/*
 * Per-CPU pointer to the hypercall output argument memory. Only set up
 * when hv_root_partition is true, in which case it points
 * HV_HYP_PAGE_SIZE bytes into the same allocation as the input argument.
 */
void * __percpu *hyperv_pcpu_output_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);

/* Defined further down; declared here because hv_common_free() calls it. */
static void hv_kmsg_dump_unregister(void);

/* Handle returned by register_sysctl(); NULL while nothing is registered. */
static struct ctl_table_header *hv_ctl_table_hdr;
64
65 /*
66 * Hyper-V specific initialization and shutdown code that is
67 * common across all architectures. Called from architecture
68 * specific initialization functions.
69 */
70
hv_common_free(void)71 void __init hv_common_free(void)
72 {
73 unregister_sysctl_table(hv_ctl_table_hdr);
74 hv_ctl_table_hdr = NULL;
75
76 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
77 hv_kmsg_dump_unregister();
78
79 kfree(hv_vp_index);
80 hv_vp_index = NULL;
81
82 free_percpu(hyperv_pcpu_output_arg);
83 hyperv_pcpu_output_arg = NULL;
84
85 free_percpu(hyperv_pcpu_input_arg);
86 hyperv_pcpu_input_arg = NULL;
87 }
88
89 /*
90 * Functions for allocating and freeing memory with size and
91 * alignment HV_HYP_PAGE_SIZE. These functions are needed because
92 * the guest page size may not be the same as the Hyper-V page
93 * size. We depend upon kmalloc() aligning power-of-two size
94 * allocations to the allocation size boundary, so that the
95 * allocated memory appears to Hyper-V as a page of the size
96 * it expects.
97 */
98
hv_alloc_hyperv_page(void)99 void *hv_alloc_hyperv_page(void)
100 {
101 BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
102
103 if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
104 return (void *)__get_free_page(GFP_KERNEL);
105 else
106 return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
107 }
108 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
109
hv_alloc_hyperv_zeroed_page(void)110 void *hv_alloc_hyperv_zeroed_page(void)
111 {
112 if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
113 return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
114 else
115 return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
116 }
117 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
118
hv_free_hyperv_page(void * addr)119 void hv_free_hyperv_page(void *addr)
120 {
121 if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
122 free_page((unsigned long)addr);
123 else
124 kfree(addr);
125 }
126 EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
127
/*
 * Page that hv_kmsg_dump() fills with kmsg data and whose physical
 * address is handed to Hyper-V on panic. Allocated by
 * hv_kmsg_dump_register(); NULL whenever kmsg reporting is not set up.
 */
static void *hv_panic_page;

/*
 * Boolean to control whether to report panic messages over Hyper-V.
 *
 * It can be set via /proc/sys/kernel/hyperv_record_panic_msg
 *
 * Defaults to 1; hv_common_init() clears it when
 * hv_is_isolation_supported() returns true.
 */
static int sysctl_record_panic_msg = 1;

/*
 * sysctl option to allow the user to control whether kmsg data should be
 * reported to Hyper-V on panic.
 *
 * Accepted values are limited to the range [0, 1] via
 * proc_dointvec_minmax and the extra1/extra2 bounds.
 */
static struct ctl_table hv_ctl_table[] = {
	{
		.procname	= "hyperv_record_panic_msg",
		.data		= &sysctl_record_panic_msg,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE
	},
	{}
};
153
/* Forward declaration: the notifier blocks below reference the callback. */
static int hv_die_panic_notify_crash(struct notifier_block *self,
				     unsigned long val, void *args);

/* Registered on the die notifier chain by hv_common_init(). */
static struct notifier_block hyperv_die_report_block = {
	.notifier_call = hv_die_panic_notify_crash,
};

/*
 * Registered on panic_notifier_list by hv_common_init(). It shares its
 * callback with the die block; the callback tells the two apart by
 * comparing its 'self' argument against the address of this block.
 */
static struct notifier_block hyperv_panic_report_block = {
	.notifier_call = hv_die_panic_notify_crash,
};
164
165 /*
166 * The following callback works both as die and panic notifier; its
167 * goal is to provide panic information to the hypervisor unless the
168 * kmsg dumper is used [see hv_kmsg_dump()], which provides more
169 * information but isn't always available.
170 *
171 * Notice that both the panic/die report notifiers are registered only
172 * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set.
173 */
hv_die_panic_notify_crash(struct notifier_block * self,unsigned long val,void * args)174 static int hv_die_panic_notify_crash(struct notifier_block *self,
175 unsigned long val, void *args)
176 {
177 struct pt_regs *regs;
178 bool is_die;
179
180 /* Don't notify Hyper-V unless we have a die oops event or panic. */
181 if (self == &hyperv_panic_report_block) {
182 is_die = false;
183 regs = current_pt_regs();
184 } else { /* die event */
185 if (val != DIE_OOPS)
186 return NOTIFY_DONE;
187
188 is_die = true;
189 regs = ((struct die_args *)args)->regs;
190 }
191
192 /*
193 * Hyper-V should be notified only once about a panic/die. If we will
194 * be calling hv_kmsg_dump() later with kmsg data, don't do the
195 * notification here.
196 */
197 if (!sysctl_record_panic_msg || !hv_panic_page)
198 hyperv_report_panic(regs, val, is_die);
199
200 return NOTIFY_DONE;
201 }
202
203 /*
204 * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
205 * buffer and call into Hyper-V to transfer the data.
206 */
hv_kmsg_dump(struct kmsg_dumper * dumper,enum kmsg_dump_reason reason)207 static void hv_kmsg_dump(struct kmsg_dumper *dumper,
208 enum kmsg_dump_reason reason)
209 {
210 struct kmsg_dump_iter iter;
211 size_t bytes_written;
212
213 /* We are only interested in panics. */
214 if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg)
215 return;
216
217 /*
218 * Write dump contents to the page. No need to synchronize; panic should
219 * be single-threaded.
220 */
221 kmsg_dump_rewind(&iter);
222 kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
223 &bytes_written);
224 if (!bytes_written)
225 return;
226 /*
227 * P3 to contain the physical address of the panic page & P4 to
228 * contain the size of the panic data in that page. Rest of the
229 * registers are no-op when the NOTIFY_MSG flag is set.
230 */
231 hv_set_register(HV_REGISTER_CRASH_P0, 0);
232 hv_set_register(HV_REGISTER_CRASH_P1, 0);
233 hv_set_register(HV_REGISTER_CRASH_P2, 0);
234 hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
235 hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
236
237 /*
238 * Let Hyper-V know there is crash data available along with
239 * the panic message.
240 */
241 hv_set_register(HV_REGISTER_CRASH_CTL,
242 (HV_CRASH_CTL_CRASH_NOTIFY |
243 HV_CRASH_CTL_CRASH_NOTIFY_MSG));
244 }
245
246 static struct kmsg_dumper hv_kmsg_dumper = {
247 .dump = hv_kmsg_dump,
248 };
249
hv_kmsg_dump_unregister(void)250 static void hv_kmsg_dump_unregister(void)
251 {
252 kmsg_dump_unregister(&hv_kmsg_dumper);
253 unregister_die_notifier(&hyperv_die_report_block);
254 atomic_notifier_chain_unregister(&panic_notifier_list,
255 &hyperv_panic_report_block);
256
257 hv_free_hyperv_page(hv_panic_page);
258 hv_panic_page = NULL;
259 }
260
hv_kmsg_dump_register(void)261 static void hv_kmsg_dump_register(void)
262 {
263 int ret;
264
265 hv_panic_page = hv_alloc_hyperv_zeroed_page();
266 if (!hv_panic_page) {
267 pr_err("Hyper-V: panic message page memory allocation failed\n");
268 return;
269 }
270
271 ret = kmsg_dump_register(&hv_kmsg_dumper);
272 if (ret) {
273 pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret);
274 hv_free_hyperv_page(hv_panic_page);
275 hv_panic_page = NULL;
276 }
277 }
278
hv_common_init(void)279 int __init hv_common_init(void)
280 {
281 int i;
282
283 if (hv_is_isolation_supported())
284 sysctl_record_panic_msg = 0;
285
286 /*
287 * Hyper-V expects to get crash register data or kmsg when
288 * crash enlightment is available and system crashes. Set
289 * crash_kexec_post_notifiers to be true to make sure that
290 * calling crash enlightment interface before running kdump
291 * kernel.
292 */
293 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
294 u64 hyperv_crash_ctl;
295
296 crash_kexec_post_notifiers = true;
297 pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n");
298
299 /*
300 * Panic message recording (sysctl_record_panic_msg)
301 * is enabled by default in non-isolated guests and
302 * disabled by default in isolated guests; the panic
303 * message recording won't be available in isolated
304 * guests should the following registration fail.
305 */
306 hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table);
307 if (!hv_ctl_table_hdr)
308 pr_err("Hyper-V: sysctl table register error");
309
310 /*
311 * Register for panic kmsg callback only if the right
312 * capability is supported by the hypervisor.
313 */
314 hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
315 if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
316 hv_kmsg_dump_register();
317
318 register_die_notifier(&hyperv_die_report_block);
319 atomic_notifier_chain_register(&panic_notifier_list,
320 &hyperv_panic_report_block);
321 }
322
323 /*
324 * Allocate the per-CPU state for the hypercall input arg.
325 * If this allocation fails, we will not be able to setup
326 * (per-CPU) hypercall input page and thus this failure is
327 * fatal on Hyper-V.
328 */
329 hyperv_pcpu_input_arg = alloc_percpu(void *);
330 BUG_ON(!hyperv_pcpu_input_arg);
331
332 /* Allocate the per-CPU state for output arg for root */
333 if (hv_root_partition) {
334 hyperv_pcpu_output_arg = alloc_percpu(void *);
335 BUG_ON(!hyperv_pcpu_output_arg);
336 }
337
338 hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
339 GFP_KERNEL);
340 if (!hv_vp_index) {
341 hv_common_free();
342 return -ENOMEM;
343 }
344
345 for (i = 0; i < num_possible_cpus(); i++)
346 hv_vp_index[i] = VP_INVAL;
347
348 return 0;
349 }
350
351 /*
352 * Hyper-V specific initialization and die code for
353 * individual CPUs that is common across all architectures.
354 * Called by the CPU hotplug mechanism.
355 */
356
hv_common_cpu_init(unsigned int cpu)357 int hv_common_cpu_init(unsigned int cpu)
358 {
359 void **inputarg, **outputarg;
360 u64 msr_vp_index;
361 gfp_t flags;
362 int pgcount = hv_root_partition ? 2 : 1;
363 void *mem;
364 int ret;
365
366 /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
367 flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
368
369 inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
370
371 /*
372 * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
373 * allocated if this CPU was previously online and then taken offline
374 */
375 if (!*inputarg) {
376 mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
377 if (!mem)
378 return -ENOMEM;
379
380 if (hv_root_partition) {
381 outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
382 *outputarg = (char *)mem + HV_HYP_PAGE_SIZE;
383 }
384
385 if (!ms_hyperv.paravisor_present &&
386 (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
387 ret = set_memory_decrypted((unsigned long)mem, pgcount);
388 if (ret) {
389 /* It may be unsafe to free 'mem' */
390 return ret;
391 }
392
393 memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE);
394 }
395
396 /*
397 * In a fully enlightened TDX/SNP VM with more than 64 VPs, if
398 * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() ->
399 * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to
400 * use hyperv_pcpu_input_arg as the hypercall input page, which
401 * must be a decrypted page in such a VM, but the page is still
402 * encrypted before set_memory_decrypted() returns. Fix this by
403 * setting *inputarg after the above set_memory_decrypted(): if
404 * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns
405 * HV_STATUS_INVALID_PARAMETER immediately, and the function
406 * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(),
407 * which may be slightly slower than the hypercall, but still
408 * works correctly in such a VM.
409 */
410 *inputarg = mem;
411 }
412
413 msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
414
415 hv_vp_index[cpu] = msr_vp_index;
416
417 if (msr_vp_index > hv_max_vp_index)
418 hv_max_vp_index = msr_vp_index;
419
420 return 0;
421 }
422
/*
 * Per-CPU Hyper-V teardown. Intentionally does nothing and always
 * returns 0.
 */
int hv_common_cpu_die(unsigned int cpu)
{
	/*
	 * Do not free the hyperv_pcpu_input_arg and hyperv_pcpu_output_arg
	 * memory here. The Hyper-V vPCI driver may still use
	 * hyperv_pcpu_input_arg while reassigning interrupts during
	 * offlining, and that reassignment happens *after* the
	 * CPUHP_AP_HYPERV_ONLINE state has run and called this function.
	 *
	 * Should the CPU come back online later, hv_common_cpu_init()
	 * simply reuses the memory that was allocated originally.
	 */

	return 0;
}
439
440 /* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
bool hv_query_ext_cap(u64 cap_query)
{
	/*
	 * The address of 'ext_caps' is passed as the output parameter of
	 * the hypercall below, so it has to be compatible with
	 * 'virt_to_phys', i.e. its address must be directly mapped. A
	 * 'static' variable satisfies that; a stack variable may be
	 * virtually mapped and would not. Hypercall input/output addresses
	 * must also be 8-byte aligned.
	 */
	static u64 ext_caps __aligned(8);
	static bool ext_caps_queried;
	u64 status;

	/*
	 * The capability query is itself an extended hypercall, so the
	 * partition has to support extended hypercalls in the first place.
	 */
	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
		return false;

	/* Extended capabilities never change at runtime: ask only once. */
	if (!ext_caps_queried) {
		status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
					 &ext_caps);

		/*
		 * This hypercall should not fail under any normal
		 * circumstances. Mark the query as done even on error so
		 * that the hypercall is not retried over and over.
		 */
		ext_caps_queried = true;
		if (!hv_result_success(status)) {
			pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
			       status);
			return false;
		}
	}

	return ext_caps & cap_query;
}
EXPORT_SYMBOL_GPL(hv_query_ext_cap);
485
/*
 * Configure DMA operations for @dev, with @coherent passed straight
 * through to arch_setup_dma_ops().
 */
void hv_setup_dma_ops(struct device *dev, bool coherent)
{
	/*
	 * A Hyper-V guest VM is not offered a vIOMMU, hence the 0/NULL
	 * values for the IOMMU related arguments.
	 */
	arch_setup_dma_ops(dev, 0, 0, NULL, coherent);
}
EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
495
hv_is_hibernation_supported(void)496 bool hv_is_hibernation_supported(void)
497 {
498 return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
499 }
500 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
501
502 /*
503 * Default function to read the Hyper-V reference counter, independent
504 * of whether Hyper-V enlightened clocks/timers are being used. But on
505 * architectures where it is used, Hyper-V enlightenment code in
506 * hyperv_timer.c may override this function.
507 */
__hv_read_ref_counter(void)508 static u64 __hv_read_ref_counter(void)
509 {
510 return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
511 }
512
513 u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter;
514 EXPORT_SYMBOL_GPL(hv_read_reference_counter);
515
516 /* These __weak functions provide default "no-op" behavior and
517 * may be overridden by architecture specific versions. Architectures
518 * for which the default "no-op" behavior is sufficient can leave
519 * them unimplemented and not be cluttered with a bunch of stub
520 * functions in arch-specific code.
521 */
522
/* Default: report that isolation is not supported. */
bool __weak hv_is_isolation_supported(void)
{
	return false;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);

/* Default: report that the isolation type is not SNP. */
bool __weak hv_isolation_type_snp(void)
{
	return false;
}
EXPORT_SYMBOL_GPL(hv_isolation_type_snp);

/* Default: report that the isolation type is not TDX. */
bool __weak hv_isolation_type_tdx(void)
{
	return false;
}
EXPORT_SYMBOL_GPL(hv_isolation_type_tdx);
540
/* Default no-op: @handler is ignored. */
void __weak hv_setup_vmbus_handler(void (*handler)(void))
{
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler);

/* Default no-op counterpart of hv_setup_vmbus_handler(). */
void __weak hv_remove_vmbus_handler(void)
{
}
EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);

/* Default no-op: @handler is ignored. */
void __weak hv_setup_kexec_handler(void (*handler)(void))
{
}
EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);

/* Default no-op counterpart of hv_setup_kexec_handler(). */
void __weak hv_remove_kexec_handler(void)
{
}
EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);

/* Default no-op: @handler is ignored. */
void __weak hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
{
}
EXPORT_SYMBOL_GPL(hv_setup_crash_handler);

/* Default no-op counterpart of hv_setup_crash_handler(). */
void __weak hv_remove_crash_handler(void)
{
}
EXPORT_SYMBOL_GPL(hv_remove_crash_handler);

/* Default no-op cleanup. */
void __weak hyperv_cleanup(void)
{
}
EXPORT_SYMBOL_GPL(hyperv_cleanup);
575
/*
 * Default: no hypercall is made; all arguments are ignored and
 * HV_STATUS_INVALID_PARAMETER is returned.
 */
u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
	return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);

/*
 * Default: no hypercall is made; all arguments are ignored and
 * HV_STATUS_INVALID_PARAMETER is returned.
 */
u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
{
	return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
587