/*
 * arch/x86/xen/time.c
 *
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <linux/gfp.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

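/*
 * The clocksource core computes time as (cycles * mult) >> shift.
 * The Xen clocksource already reports time directly in nanoseconds,
 * so the mult/shift pair used below makes that the identity:
 * mult = 1 << XEN_SHIFT with shift = XEN_SHIFT.
 */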
#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
static DEFINE_PER_CPU(u64, xen_residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		const u32 *p32 = (const u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else {
		ret = *p;
	}

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(xen_runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}

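/*
 * Register this CPU's runstate area with the hypervisor.  Xen keeps
 * the structure up to date with the time accumulated in each
 * RUNSTATE_* state; do_stolen_accounting() below works from it.
 */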
void xen_setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(xen_runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

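/*
 * Credit stolen (runnable/offline) and blocked time to the scheduler
 * statistics in whole-tick units.  Sub-tick remainders are carried
 * forward in xen_residual_stolen/xen_residual_blocked so no time is
 * lost to rounding.
 */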
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(xen_runstate_snapshot);

	/* work out how much time the VCPU has not been running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time. */
	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__this_cpu_write(xen_residual_stolen, stolen);
	account_steal_ticks(ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time. */
	blocked += __this_cpu_read(xen_residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__this_cpu_write(xen_residual_blocked, blocked);
	account_idle_ticks(ticks);
}

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	return pvclock_tsc_khz(info);
}

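/*
 * Read the current system time, in nanoseconds, from this CPU's
 * pvclock area.  get_cpu_var() disables preemption so we can't
 * migrate away from the vcpu whose time info we are reading.
 */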
cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}

static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

static int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_get_cycles,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, it shares
   its event channel with a 100Hz tick which is delivered while the
   vcpu is running.  We don't care about or use this tick, but it will
   cause the core time code to think the timer fired too soon, and
   will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (ie, at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/

/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};
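/*
 * vcpu_op-based clockevent: uses the per-vcpu single-shot timer and
 * explicitly stops the legacy 100Hz periodic tick.
 */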
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

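	/* -ETIME just means the deadline was already in the past.  It
	   is propagated to the caller, and the clockevents core knows
	   how to deal with it, so only other errors are fatal here. */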
	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

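/*
 * Default to the timer_op interface; xen_time_init() switches to the
 * vcpu_op one when the hypervisor supports it.
 */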
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

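/*
 * Per-cpu VIRQ_TIMER handler: run the registered clockevent handler,
 * if any, and fold in stolen-time accounting on every tick.
 */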
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

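/*
 * Bind VIRQ_TIMER for @cpu to xen_timer_interrupt() and install the
 * currently selected clockevent device for it.
 */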
void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}

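/*
 * Unbind @cpu's timer interrupt.  CPU0's timer is set up at boot and
 * is never torn down, hence the BUG_ON().
 */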
void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	BUG_ON(cpu == 0);
	evt = &per_cpu(xen_clock_events, cpu);
	unbind_from_irqhandler(evt->irq, NULL);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

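/*
 * Resume pvclock timekeeping and, when the vcpu_op timer interface is
 * in use, make sure the legacy periodic tick stays stopped on every
 * online cpu.
 */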
void xen_timer_resume(void)
{
	int cpu;

	pvclock_resume();

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}

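/* sched_clock is fed directly from the Xen clocksource, which already
   counts in nanoseconds. */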
static const struct pv_time_ops xen_time_ops __initdata = {
	.sched_clock = xen_clocksource_read,
};

static __init void xen_time_init(void)
{
	int cpu = smp_processor_id();
	struct timespec tp;

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday(&tp);

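	/* Xen's pvclock timekeeping is TSC-based, so make sure the
	   TSC feature bit is set. */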
	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

__init void xen_init_time_ops(void)
{
	pv_time_ops = xen_time_ops;

	x86_init.timers.timer_init = xen_time_init;
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();
	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

__init void xen_hvm_init_time_ops(void)
{
	/* vector callback is needed, otherwise we cannot receive interrupts
	 * on cpu > 0, and at this point we don't know how many cpus are
	 * available */
	if (!xen_have_vector_callback)
		return;
	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		printk(KERN_INFO "Xen doesn't support pvclock on HVM, "
				 "disabling pv timer\n");
		return;
	}

	pv_time_ops = xen_time_ops;
	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;
}
#endif