xref: /openbmc/linux/arch/x86/xen/time.c (revision 9c1f8594)
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <linux/gfp.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
static DEFINE_PER_CPU(u64, xen_residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(xen_runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}

void xen_setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

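	/*
	 * Hand the hypervisor a pointer to this cpu's runstate info;
	 * Xen keeps it up to date in place from then on.
	 */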
	area.addr.v = &per_cpu(xen_runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

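/*
 * Convert the accumulated runstate deltas into scheduler ticks: time a
 * vcpu spent runnable or offline counts as stolen, and time it spent
 * blocked counts as idle.  Sub-tick remainders are carried over in the
 * xen_residual_* variables so no time is lost between invocations.
 */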
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(xen_runstate_snapshot);

	/* work out how much time the VCPU has not been running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/*
	 * Add the appropriate number of ticks of stolen time,
	 * including any left-overs from last time.
	 */
	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__this_cpu_write(xen_residual_stolen, stolen);
	account_steal_ticks(ticks);

	/*
	 * Add the appropriate number of ticks of blocked time,
	 * including any left-overs from last time.
	 */
	blocked += __this_cpu_read(xen_residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__this_cpu_write(xen_residual_blocked, blocked);
	account_idle_ticks(ticks);
}

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	return pvclock_tsc_khz(info);
}

cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

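	/* the pvclock area is per-vcpu, so stay on this cpu while reading it */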
	preempt_disable_notrace();
	src = &__get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}

static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}

static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

static int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_get_cycles,
	.mask = ~0,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
 * Xen clockevent implementation
 *
 * Xen has two clockevent implementations:
 *
 * The old timer_op one works with all released versions of Xen prior
 * to version 3.0.4.  This version of the hypervisor provides a
 * single-shot timer with nanosecond resolution.  However, a 100Hz
 * tick, delivered while the vcpu is running, shares the same event
 * channel.  We don't care about or use this tick, but it will cause
 * the core time code to think the timer fired too soon, and it will
 * end up resetting the timer each time.  The tick could be filtered
 * out, but doing so has complications when the ktime clocksource is
 * not yet the xen clocksource (ie, at boot time).
 *
 * The new vcpu_op-based timer interface allows the tick timer period
 * to be changed or turned off.  The tick timer is not useful as a
 * periodic timer because events are only delivered to running vcpus.
 * The one-shot timer can report when a timeout is in the past, so
 * set_next_event is capable of returning -ETIME when appropriate.
 * This interface is used when available.
 */

/*
 * Get a hypervisor absolute time.  In theory we could maintain an
 * offset between the kernel's time and the hypervisor's time, and
 * apply that to the kernel's absolute timeout.  Unfortunately the
 * hypervisor and kernel times can drift even if the kernel is using
 * the Xen clocksource, because ntp can warp the kernel's clocksource.
 */
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/*
	 * We may have missed the deadline, but there's no real way of
	 * knowing for sure.  If the event was in the past, then we'll
	 * get an immediate interrupt.
	 */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

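	/* mult/shift of 1/0: clockevent deltas are passed through as plain ns */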
	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};

static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

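/*
 * Default to the timer_op interface; xen_time_init() switches to the
 * vcpu_op interface when the hypervisor supports it.
 */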
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

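/*
 * Per-cpu timer interrupt handler: run the registered clockevent
 * handler, if any, and fold stolen/blocked time into the scheduler
 * statistics on every tick.
 */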
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|
				      IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}

void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;

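	/* the boot cpu's timer is never torn down */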
	BUG_ON(cpu == 0);
	evt = &per_cpu(xen_clock_events, cpu);
	unbind_from_irqhandler(evt->irq, NULL);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

void xen_timer_resume(void)
{
	int cpu;

	pvclock_resume();

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}

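/* sched_clock is backed directly by the hypervisor's nanosecond timebase */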
static const struct pv_time_ops xen_time_ops __initconst = {
	.sched_clock = xen_clocksource_read,
};

static void __init xen_time_init(void)
{
	int cpu = smp_processor_id();
	struct timespec tp;

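	/* the Xen clocksource counts nanoseconds, i.e. it runs at NSEC_PER_SEC Hz */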
	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/*
		 * Successfully turned off 100Hz tick, so we have the
		 * vcpuop-based timer interface
		 */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday(&tp);

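	/*
	 * pvclock relies on the TSC, which Xen should always expose to
	 * its guests, so force the feature flag on.
	 */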
	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

void __init xen_init_time_ops(void)
{
	pv_time_ops = xen_time_ops;

	x86_init.timers.timer_init = xen_time_init;
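	/*
	 * xen_time_init() sets up the boot cpu's clockevents itself, and
	 * the Xen smp code does so for the secondary cpus, so the generic
	 * per-cpu clockevent hooks can be no-ops.
	 */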
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

void __init xen_hvm_init_time_ops(void)
{
	/*
	 * The vector callback is needed, otherwise we cannot receive
	 * interrupts on cpus > 0; and at this point we don't know how
	 * many cpus are available.
	 */
	if (!xen_have_vector_callback)
		return;
	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		printk(KERN_INFO "Xen doesn't support pvclock on HVM, "
				 "disabling the pv timer\n");
		return;
	}


	pv_time_ops = xen_time_ops;
	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;
}
#endif