xref: /openbmc/linux/arch/x86/xen/time.c (revision 89df62c3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Xen time implementation.
4  *
5  * This is implemented in terms of a clocksource driver which uses
6  * the hypervisor clock as a nanosecond timebase, and a clockevent
7  * driver which uses the hypervisor's timer mechanism.
8  *
9  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
10  */
11 #include <linux/kernel.h>
12 #include <linux/interrupt.h>
13 #include <linux/clocksource.h>
14 #include <linux/clockchips.h>
15 #include <linux/gfp.h>
16 #include <linux/slab.h>
17 #include <linux/pvclock_gtod.h>
18 #include <linux/timekeeper_internal.h>
19 
20 #include <asm/pvclock.h>
21 #include <asm/xen/hypervisor.h>
22 #include <asm/xen/hypercall.h>
23 #include <asm/xen/cpuid.h>
24 
25 #include <xen/events.h>
26 #include <xen/features.h>
27 #include <xen/interface/xen.h>
28 #include <xen/interface/vcpu.h>
29 
30 #include "xen-ops.h"
31 
32 /* Minimum amount of time until next clock event fires */
33 #define TIMER_SLOP	100000
34 
35 static u64 xen_sched_clock_offset __read_mostly;
36 
37 /* Get the TSC speed from Xen */
38 static unsigned long xen_tsc_khz(void)
39 {
40 	struct pvclock_vcpu_time_info *info =
41 		&HYPERVISOR_shared_info->vcpu_info[0].time;
42 
43 	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
44 	return pvclock_tsc_khz(info);
45 }
46 
47 static u64 xen_clocksource_read(void)
48 {
49         struct pvclock_vcpu_time_info *src;
50 	u64 ret;
51 
52 	preempt_disable_notrace();
53 	src = &__this_cpu_read(xen_vcpu)->time;
54 	ret = pvclock_clocksource_read(src);
55 	preempt_enable_notrace();
56 	return ret;
57 }
58 
59 static u64 xen_clocksource_get_cycles(struct clocksource *cs)
60 {
61 	return xen_clocksource_read();
62 }
63 
64 static noinstr u64 xen_sched_clock(void)
65 {
66         struct pvclock_vcpu_time_info *src;
67 	u64 ret;
68 
69 	preempt_disable_notrace();
70 	src = &__this_cpu_read(xen_vcpu)->time;
71 	ret = pvclock_clocksource_read_nowd(src);
72 	ret -= xen_sched_clock_offset;
73 	preempt_enable_notrace();
74 	return ret;
75 }
76 
77 static void xen_read_wallclock(struct timespec64 *ts)
78 {
79 	struct shared_info *s = HYPERVISOR_shared_info;
80 	struct pvclock_wall_clock *wall_clock = &(s->wc);
81         struct pvclock_vcpu_time_info *vcpu_time;
82 
83 	vcpu_time = &get_cpu_var(xen_vcpu)->time;
84 	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
85 	put_cpu_var(xen_vcpu);
86 }
87 
/* x86_platform.get_wallclock hook: thin wrapper around xen_read_wallclock(). */
static void xen_get_wallclock(struct timespec64 *now)
{
	xen_read_wallclock(now);
}
92 
/*
 * x86_platform.set_wallclock hook: setting the wall clock is not supported
 * through this path, so always report -ENODEV.  @now is ignored.
 */
static int xen_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}
97 
98 static int xen_pvclock_gtod_notify(struct notifier_block *nb,
99 				   unsigned long was_set, void *priv)
100 {
101 	/* Protected by the calling core code serialization */
102 	static struct timespec64 next_sync;
103 
104 	struct xen_platform_op op;
105 	struct timespec64 now;
106 	struct timekeeper *tk = priv;
107 	static bool settime64_supported = true;
108 	int ret;
109 
110 	now.tv_sec = tk->xtime_sec;
111 	now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
112 
113 	/*
114 	 * We only take the expensive HV call when the clock was set
115 	 * or when the 11 minutes RTC synchronization time elapsed.
116 	 */
117 	if (!was_set && timespec64_compare(&now, &next_sync) < 0)
118 		return NOTIFY_OK;
119 
120 again:
121 	if (settime64_supported) {
122 		op.cmd = XENPF_settime64;
123 		op.u.settime64.mbz = 0;
124 		op.u.settime64.secs = now.tv_sec;
125 		op.u.settime64.nsecs = now.tv_nsec;
126 		op.u.settime64.system_time = xen_clocksource_read();
127 	} else {
128 		op.cmd = XENPF_settime32;
129 		op.u.settime32.secs = now.tv_sec;
130 		op.u.settime32.nsecs = now.tv_nsec;
131 		op.u.settime32.system_time = xen_clocksource_read();
132 	}
133 
134 	ret = HYPERVISOR_platform_op(&op);
135 
136 	if (ret == -ENOSYS && settime64_supported) {
137 		settime64_supported = false;
138 		goto again;
139 	}
140 	if (ret < 0)
141 		return NOTIFY_BAD;
142 
143 	/*
144 	 * Move the next drift compensation time 11 minutes
145 	 * ahead. That's emulating the sync_cmos_clock() update for
146 	 * the hardware RTC.
147 	 */
148 	next_sync = now;
149 	next_sync.tv_sec += 11 * 60;
150 
151 	return NOTIFY_OK;
152 }
153 
154 static struct notifier_block xen_pvclock_gtod_notifier = {
155 	.notifier_call = xen_pvclock_gtod_notify,
156 };
157 
158 static int xen_cs_enable(struct clocksource *cs)
159 {
160 	vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
161 	return 0;
162 }
163 
164 static struct clocksource xen_clocksource __read_mostly = {
165 	.name	= "xen",
166 	.rating	= 400,
167 	.read	= xen_clocksource_get_cycles,
168 	.mask	= CLOCKSOURCE_MASK(64),
169 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
170 	.enable = xen_cs_enable,
171 };
172 
173 /*
174    Xen clockevent implementation
175 
176    Xen has two clockevent implementations:
177 
178    The old timer_op one works with all released versions of Xen prior
179    to version 3.0.4.  This version of the hypervisor provides a
180    single-shot timer with nanosecond resolution.  However, sharing the
181    same event channel is a 100Hz tick which is delivered while the
182    vcpu is running.  We don't care about or use this tick, but it will
183    cause the core time code to think the timer fired too soon, and
184    will end up resetting it each time.  It could be filtered, but
185    doing so has complications when the ktime clocksource is not yet
186    the xen clocksource (ie, at boot time).
187 
188    The new vcpu_op-based timer interface allows the tick timer period
189    to be changed or turned off.  The tick timer is not useful as a
190    periodic timer because events are only delivered to running vcpus.
191    The one-shot timer can report when a timeout is in the past, so
192    set_next_event is capable of returning -ETIME when appropriate.
193    This interface is used when available.
194 */
195 
196 
197 /*
198   Get a hypervisor absolute time.  In theory we could maintain an
199   offset between the kernel's time and the hypervisor's time, and
200   apply that to a kernel's absolute timeout.  Unfortunately the
201   hypervisor and kernel times can drift even if the kernel is using
202   the Xen clocksource, because ntp can warp the kernel's clocksource.
203 */
204 static s64 get_abs_timeout(unsigned long delta)
205 {
206 	return xen_clocksource_read() + delta;
207 }
208 
/*
 * set_state_shutdown callback for the timer_op based clockevent.
 * Always returns 0; @evt is not used.
 */
static int xen_timerop_shutdown(struct clock_event_device *evt)
{
	/* A timeout of 0 cancels the pending timeout. */
	HYPERVISOR_set_timer_op(0);
	return 0;
}
216 
217 static int xen_timerop_set_next_event(unsigned long delta,
218 				      struct clock_event_device *evt)
219 {
220 	WARN_ON(!clockevent_state_oneshot(evt));
221 
222 	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
223 		BUG();
224 
225 	/* We may have missed the deadline, but there's no real way of
226 	   knowing for sure.  If the event was in the past, then we'll
227 	   get an immediate interrupt. */
228 
229 	return 0;
230 }
231 
232 static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
233 	.name			= "xen",
234 	.features		= CLOCK_EVT_FEAT_ONESHOT,
235 
236 	.max_delta_ns		= 0xffffffff,
237 	.max_delta_ticks	= 0xffffffff,
238 	.min_delta_ns		= TIMER_SLOP,
239 	.min_delta_ticks	= TIMER_SLOP,
240 
241 	.mult			= 1,
242 	.shift			= 0,
243 	.rating			= 500,
244 
245 	.set_state_shutdown	= xen_timerop_shutdown,
246 	.set_next_event		= xen_timerop_set_next_event,
247 };
248 
249 static int xen_vcpuop_shutdown(struct clock_event_device *evt)
250 {
251 	int cpu = smp_processor_id();
252 
253 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
254 			       NULL) ||
255 	    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
256 			       NULL))
257 		BUG();
258 
259 	return 0;
260 }
261 
262 static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
263 {
264 	int cpu = smp_processor_id();
265 
266 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
267 			       NULL))
268 		BUG();
269 
270 	return 0;
271 }
272 
273 static int xen_vcpuop_set_next_event(unsigned long delta,
274 				     struct clock_event_device *evt)
275 {
276 	int cpu = smp_processor_id();
277 	struct vcpu_set_singleshot_timer single;
278 	int ret;
279 
280 	WARN_ON(!clockevent_state_oneshot(evt));
281 
282 	single.timeout_abs_ns = get_abs_timeout(delta);
283 	/* Get an event anyway, even if the timeout is already expired */
284 	single.flags = 0;
285 
286 	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
287 				 &single);
288 	BUG_ON(ret != 0);
289 
290 	return ret;
291 }
292 
293 static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
294 	.name = "xen",
295 	.features = CLOCK_EVT_FEAT_ONESHOT,
296 
297 	.max_delta_ns = 0xffffffff,
298 	.max_delta_ticks = 0xffffffff,
299 	.min_delta_ns = TIMER_SLOP,
300 	.min_delta_ticks = TIMER_SLOP,
301 
302 	.mult = 1,
303 	.shift = 0,
304 	.rating = 500,
305 
306 	.set_state_shutdown = xen_vcpuop_shutdown,
307 	.set_state_oneshot = xen_vcpuop_set_oneshot,
308 	.set_next_event = xen_vcpuop_set_next_event,
309 };
310 
311 static const struct clock_event_device *xen_clockevent =
312 	&xen_timerop_clockevent;
313 
314 struct xen_clock_event_device {
315 	struct clock_event_device evt;
316 	char name[16];
317 };
318 static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
319 
320 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
321 {
322 	struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
323 	irqreturn_t ret;
324 
325 	ret = IRQ_NONE;
326 	if (evt->event_handler) {
327 		evt->event_handler(evt);
328 		ret = IRQ_HANDLED;
329 	}
330 
331 	return ret;
332 }
333 
334 void xen_teardown_timer(int cpu)
335 {
336 	struct clock_event_device *evt;
337 	evt = &per_cpu(xen_clock_events, cpu).evt;
338 
339 	if (evt->irq >= 0) {
340 		unbind_from_irqhandler(evt->irq, NULL);
341 		evt->irq = -1;
342 	}
343 }
344 
345 void xen_setup_timer(int cpu)
346 {
347 	struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
348 	struct clock_event_device *evt = &xevt->evt;
349 	int irq;
350 
351 	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
352 	if (evt->irq >= 0)
353 		xen_teardown_timer(cpu);
354 
355 	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
356 
357 	snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);
358 
359 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
360 				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
361 				      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
362 				      xevt->name, NULL);
363 	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
364 
365 	memcpy(evt, xen_clockevent, sizeof(*evt));
366 
367 	evt->cpumask = cpumask_of(cpu);
368 	evt->irq = irq;
369 }
370 
371 
372 void xen_setup_cpu_clockevents(void)
373 {
374 	clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
375 }
376 
377 void xen_timer_resume(void)
378 {
379 	int cpu;
380 
381 	if (xen_clockevent != &xen_vcpuop_clockevent)
382 		return;
383 
384 	for_each_online_cpu(cpu) {
385 		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
386 				       xen_vcpu_nr(cpu), NULL))
387 			BUG();
388 	}
389 }
390 
391 static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
392 static u64 xen_clock_value_saved;
393 
394 void xen_save_time_memory_area(void)
395 {
396 	struct vcpu_register_time_memory_area t;
397 	int ret;
398 
399 	xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
400 
401 	if (!xen_clock)
402 		return;
403 
404 	t.addr.v = NULL;
405 
406 	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
407 	if (ret != 0)
408 		pr_notice("Cannot save secondary vcpu_time_info (err %d)",
409 			  ret);
410 	else
411 		clear_page(xen_clock);
412 }
413 
414 void xen_restore_time_memory_area(void)
415 {
416 	struct vcpu_register_time_memory_area t;
417 	int ret;
418 
419 	if (!xen_clock)
420 		goto out;
421 
422 	t.addr.v = &xen_clock->pvti;
423 
424 	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
425 
426 	/*
427 	 * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to
428 	 * register the secondary time info with Xen or if we migrated to a
429 	 * host without the necessary flags. On both of these cases what
430 	 * happens is either process seeing a zeroed out pvti or seeing no
431 	 * PVCLOCK_TSC_STABLE_BIT bit set. Userspace checks the latter and
432 	 * if 0, it discards the data in pvti and fallbacks to a system
433 	 * call for a reliable timestamp.
434 	 */
435 	if (ret != 0)
436 		pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
437 			  ret);
438 
439 out:
440 	/* Need pvclock_resume() before using xen_clocksource_read(). */
441 	pvclock_resume();
442 	xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
443 }
444 
445 static void xen_setup_vsyscall_time_info(void)
446 {
447 	struct vcpu_register_time_memory_area t;
448 	struct pvclock_vsyscall_time_info *ti;
449 	int ret;
450 
451 	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
452 	if (!ti)
453 		return;
454 
455 	t.addr.v = &ti->pvti;
456 
457 	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
458 	if (ret) {
459 		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
460 		free_page((unsigned long)ti);
461 		return;
462 	}
463 
464 	/*
465 	 * If primary time info had this bit set, secondary should too since
466 	 * it's the same data on both just different memory regions. But we
467 	 * still check it in case hypervisor is buggy.
468 	 */
469 	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
470 		t.addr.v = NULL;
471 		ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
472 					 0, &t);
473 		if (!ret)
474 			free_page((unsigned long)ti);
475 
476 		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
477 		return;
478 	}
479 
480 	xen_clock = ti;
481 	pvclock_set_pvti_cpu0_va(xen_clock);
482 
483 	xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
484 }
485 
486 /*
487  * Check if it is possible to safely use the tsc as a clocksource.  This is
488  * only true if the hypervisor notifies the guest that its tsc is invariant,
489  * the tsc is stable, and the tsc instruction will never be emulated.
490  */
491 static int __init xen_tsc_safe_clocksource(void)
492 {
493 	u32 eax, ebx, ecx, edx;
494 
495 	if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
496 		return 0;
497 
498 	if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
499 		return 0;
500 
501 	if (check_tsc_unstable())
502 		return 0;
503 
504 	/* Leaf 4, sub-leaf 0 (0x40000x03) */
505 	cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);
506 
507 	return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
508 }
509 
510 static void __init xen_time_init(void)
511 {
512 	struct pvclock_vcpu_time_info *pvti;
513 	int cpu = smp_processor_id();
514 	struct timespec64 tp;
515 
516 	/*
517 	 * As Dom0 is never moved, no penalty on using TSC there.
518 	 *
519 	 * If it is possible for the guest to determine that the tsc is a safe
520 	 * clocksource, then set xen_clocksource rating below that of the tsc
521 	 * so that the system prefers tsc instead.
522 	 */
523 	if (xen_initial_domain())
524 		xen_clocksource.rating = 275;
525 	else if (xen_tsc_safe_clocksource())
526 		xen_clocksource.rating = 299;
527 
528 	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
529 
530 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
531 			       NULL) == 0) {
532 		/* Successfully turned off 100Hz tick, so we have the
533 		   vcpuop-based timer interface */
534 		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
535 		xen_clockevent = &xen_vcpuop_clockevent;
536 	}
537 
538 	/* Set initial system time with full resolution */
539 	xen_read_wallclock(&tp);
540 	do_settimeofday64(&tp);
541 
542 	setup_force_cpu_cap(X86_FEATURE_TSC);
543 
544 	/*
545 	 * We check ahead on the primary time info if this
546 	 * bit is supported hence speeding up Xen clocksource.
547 	 */
548 	pvti = &__this_cpu_read(xen_vcpu)->time;
549 	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
550 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
551 		xen_setup_vsyscall_time_info();
552 	}
553 
554 	xen_setup_runstate_info(cpu);
555 	xen_setup_timer(cpu);
556 	xen_setup_cpu_clockevents();
557 
558 	xen_time_setup_guest();
559 
560 	if (xen_initial_domain())
561 		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
562 }
563 
564 static void __init xen_init_time_common(void)
565 {
566 	xen_sched_clock_offset = xen_clocksource_read();
567 	static_call_update(pv_steal_clock, xen_steal_clock);
568 	paravirt_set_sched_clock(xen_sched_clock);
569 
570 	x86_platform.calibrate_tsc = xen_tsc_khz;
571 	x86_platform.get_wallclock = xen_get_wallclock;
572 }
573 
574 void __init xen_init_time_ops(void)
575 {
576 	xen_init_time_common();
577 
578 	x86_init.timers.timer_init = xen_time_init;
579 	x86_init.timers.setup_percpu_clockev = x86_init_noop;
580 	x86_cpuinit.setup_percpu_clockev = x86_init_noop;
581 
582 	/* Dom0 uses the native method to set the hardware RTC. */
583 	if (!xen_initial_domain())
584 		x86_platform.set_wallclock = xen_set_wallclock;
585 }
586 
587 #ifdef CONFIG_XEN_PVHVM
/*
 * Per-cpu clockevent setup hook for HVM guests: set up runstate info for
 * the current CPU and register its clockevent device.
 */
static void xen_hvm_setup_cpu_clockevents(void)
{
	xen_setup_runstate_info(smp_processor_id());

	/*
	 * xen_setup_timer(cpu) is deliberately not called here: snprintf is
	 * bad in atomic context. Hence it is done in xen_hvm_cpu_notify
	 * (which gets called by smp_init during early bootup and also
	 * during CPU hotplug events).
	 */
	xen_setup_cpu_clockevents();
}
599 
600 void __init xen_hvm_init_time_ops(void)
601 {
602 	static bool hvm_time_initialized;
603 
604 	if (hvm_time_initialized)
605 		return;
606 
607 	/*
608 	 * vector callback is needed otherwise we cannot receive interrupts
609 	 * on cpu > 0 and at this point we don't know how many cpus are
610 	 * available.
611 	 */
612 	if (!xen_have_vector_callback)
613 		return;
614 
615 	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
616 		pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
617 		return;
618 	}
619 
620 	/*
621 	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
622 	 * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
623 	 * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
624 	 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
625 	 *
626 	 * The xen_hvm_init_time_ops() should be called again later after
627 	 * __this_cpu_read(xen_vcpu) is available.
628 	 */
629 	if (!__this_cpu_read(xen_vcpu)) {
630 		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
631 			xen_vcpu_nr(0));
632 		return;
633 	}
634 
635 	xen_init_time_common();
636 
637 	x86_init.timers.setup_percpu_clockev = xen_time_init;
638 	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
639 
640 	x86_platform.set_wallclock = xen_set_wallclock;
641 
642 	hvm_time_initialized = true;
643 }
644 #endif
645 
646 /* Kernel parameter to specify Xen timer slop */
647 static int __init parse_xen_timer_slop(char *ptr)
648 {
649 	unsigned long slop = memparse(ptr, NULL);
650 
651 	xen_timerop_clockevent.min_delta_ns = slop;
652 	xen_timerop_clockevent.min_delta_ticks = slop;
653 	xen_vcpuop_clockevent.min_delta_ns = slop;
654 	xen_vcpuop_clockevent.min_delta_ticks = slop;
655 
656 	return 0;
657 }
658 early_param("xen_timer_slop", parse_xen_timer_slop);
659