/* xref: /openbmc/linux/arch/x86/xen/time.c (revision 643d1f7f) */
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22
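/*
 * xen_clocksource_read() already yields nanoseconds, so the
 * clocksource below is registered with mult == 1 << XEN_SHIFT and
 * shift == XEN_SHIFT; the generic conversion
 *	ns = (cycles * mult) >> shift
 * then degenerates to the identity and readings pass through unscaled.
 */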

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

static cycle_t xen_clocksource_read(void);

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	int tsc_shift;
	u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		const u32 *p32 = (const u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}
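
/*
 * Illustrative torn read on a 32-bit kernel: if the counter advances
 * from 0x00000000ffffffff to 0x0000000100000000 between the two
 * reads, pairing the old high word (0) with the new low word (0)
 * would yield 0, off by 2^32.  Re-checking the high word catches the
 * carry and forces a retry.
 */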

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}
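
/*
 * The loop above is effectively a seqlock reader, with
 * state_entry_time acting as the sequence count: Xen refreshes it
 * whenever the runstate changes, so an unchanged value before and
 * after the copy means *res is a consistent snapshot.
 */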

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = 0;
	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = 0;
	while (blocked >= NS_PER_TICK) {
		ticks++;
		blocked -= NS_PER_TICK;
	}
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}
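
/*
 * Worked example (illustrative, assuming HZ == 100, so NS_PER_TICK ==
 * 10,000,000): if runnable + offline + residual_stolen comes to
 * 25,000,000 ns, two whole ticks are accounted as stolen and the
 * remaining 5,000,000 ns wait in residual_stolen until the next
 * timer interrupt.
 */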

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, i.e. the time the VCPU has spent in the RUNNING and
 * BLOCKED states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}


/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 cpu_khz = 1000000ULL << 32;
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	do_div(cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz <<= -info->tsc_shift;
	else
		cpu_khz >>= info->tsc_shift;

	return cpu_khz;
}
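
/*
 * Derivation: Xen converts TSC deltas to nanoseconds as
 *	ns = (tsc_delta << tsc_shift) * tsc_to_system_mul / 2^32
 * (a right shift when tsc_shift is negative), so
 *	tsc kHz = 10^6 * 2^32 / (tsc_to_system_mul * 2^tsc_shift)
 * which is exactly the do_div() plus shift sequence above.
 */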

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static unsigned get_time_values_from_xen(void)
{
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	/* src is shared memory with the hypervisor, so we need to
	   make sure we get a consistent snapshot, even in the face of
	   being preempted. */
	src = &__get_cpu_var(xen_vcpu)->time;
	dst = &__get_cpu_var(shadow_time);

	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp     = src->tsc_timestamp;
		dst->system_timestamp  = src->system_time;
		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
		dst->tsc_shift         = src->tsc_shift;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) | (dst->version ^ src->version));

	return dst->version;
}
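
/*
 * Xen bumps src->version once before it starts rewriting the time
 * fields and once after it finishes, so an odd version means an
 * update is in flight and a changed version means we raced with one;
 * the loop condition above retries in either case.
 */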

/*
 * Scale a 64-bit delta by shifting it and then multiplying by a 32-bit
 * fraction, yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
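
/*
 * After the shift adjustment this computes
 *	product = (delta * mul_frac) >> 32
 * i.e. mul_frac is a 0.32 fixed-point multiplier.  A portable sketch
 * of the same 64x32->96-bit multiply, for reference only:
 *
 *	u64 lo = (delta & 0xffffffff) * mul_frac;
 *	u64 hi = (delta >> 32) * mul_frac;
 *	product = hi + (lo >> 32);
 *
 * The i386 asm above performs exactly these two 32x32->64 multiplies
 * in edx:eax; the x86_64 version takes bits 32..95 of the full
 * 64x64-bit product.
 */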

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	now = native_read_tsc();
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;
	unsigned version;

	do {
		version = get_time_values_from_xen();
		barrier();
		ret = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
	} while (version != __get_cpu_var(xen_vcpu)->time.version);

	put_cpu_var(shadow_time);

	return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();		/* fetch version before time */
		now.tv_sec  = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();		/* fetch time before checking version */
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
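
/*
 * Note the two-step computation above: wc_sec/wc_nsec hold the wall
 * time at domain boot, xen_clocksource_read() supplies nanoseconds
 * since boot, and do_div() leaves the quotient (seconds) in delta
 * while returning the remainder (nanoseconds).
 */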

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);

	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, a 100Hz
   tick, delivered on the same event channel while the vcpu is
   running, comes along with it.  We don't care about or use this
   tick, but it will cause the core time code to think the timer
   fired too soon, and will end up resetting it each time.  It could
   be filtered, but doing so has complications when the ktime
   clocksource is not yet the xen clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/


/*
  Get an absolute timeout in the hypervisor's timebase.  In theory we
  could maintain an offset between the kernel's time and the
  hypervisor's time, and apply that to the kernel's absolute timeout.
  Unfortunately the hypervisor and kernel times can drift even if the
  kernel is using the Xen clocksource, because ntp can warp the
  kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};
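
/*
 * With mult == 1 and shift == 0 the clockevents core's ns-to-ticks
 * conversion is the identity, so set_next_event() receives its delta
 * directly in nanoseconds; max_delta_ns == 0xffffffff therefore caps
 * a single timeout at roughly 4.29 seconds.
 */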



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

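	/*
	 * VCPU_SSHOTTMR_future makes Xen return -ETIME instead of
	 * firing immediately if timeout_abs_ns is already in the
	 * past, so a missed deadline is reported to the clockevents
	 * core rather than silently absorbed.
	 */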
	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

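	/*
	 * VIRQ_TIMER is a per-vcpu virtual interrupt; IRQF_PERCPU and
	 * IRQF_NOBALANCING keep the handler pinned to this CPU, so the
	 * per-cpu shadow time and runstate data it touches stay local.
	 */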
	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	get_time_values_from_xen();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}