xref: /openbmc/linux/arch/x86/xen/time.c (revision 367b8112)
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}
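
/*
 * Illustrative torn read that get64() guards against (example values,
 * not from the source): if the counter advances from 0x00000001ffffffff
 * to 0x0000000200000000 between the two 32-bit loads, reading high then
 * low without the re-check could yield 0x0000000100000000.  Re-reading
 * the high word detects the carry and retries.
 */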

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

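/*
 * Point Xen at our per-cpu runstate area so the hypervisor keeps the
 * per-state time accounting for this vcpu up to date in guest memory.
 */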
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

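/*
 * Called from the timer interrupt below.  Converts the nanoseconds this
 * vcpu spent stolen (runnable + offline) and blocked since the previous
 * snapshot into whole scheduler ticks, carrying the sub-tick remainder
 * in residual_stolen/residual_blocked.  Illustrative arithmetic (values
 * assumed, not from the source): with HZ=1000, NS_PER_TICK is 1,000,000,
 * so 2,500,000 stolen ns account as 2 ticks with 500,000 ns carried over.
 */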
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has spent not running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 * states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}


/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	return pvclock_tsc_khz(info);
}

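/*
 * Read the current system time from this vcpu's shared pvclock area.
 * pvclock_clocksource_read() roughly extrapolates the hypervisor's last
 * (TSC timestamp, system time) pair forward using the current TSC, so
 * the result is a continuously advancing nanosecond count.
 */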
cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

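/*
 * The clocksource core converts raw counter deltas to nanoseconds as
 * (cycles * mult) >> shift.  Since pvclock already hands back nanoseconds,
 * mult = 1 << XEN_SHIFT with shift = XEN_SHIFT makes that conversion an
 * identity: (cycles << 22) >> 22 == cycles.
 */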
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, the same
   event channel also delivers a 100Hz tick while the vcpu is running.
   We don't care about or use this tick, but it will cause the core
   time code to think the timer fired too soon, and it will end up
   reprogramming the timer each time.  The tick could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

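/*
 * With mult = 1 and shift = 0 the clockevents core passes deltas to
 * set_next_event() directly in nanoseconds, so max_delta_ns of
 * 0xffffffff caps a single programmed timeout at roughly 4.29 seconds,
 * and min_delta_ns matches the TIMER_SLOP defined above.  The same
 * applies to the vcpuop device further down.
 */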
static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

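/*
 * Program the per-vcpu one-shot timer.  VCPU_SSHOTTMR_future asks the
 * hypervisor to reject a timeout that is already in the past rather than
 * firing immediately; the resulting -ETIME is passed back so the
 * clockevents core can pick a new expiry (see the comment block above).
 */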
static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

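/*
 * Per-cpu VIRQ_TIMER handler: hand the event to whatever handler the
 * clockevents core installed, and piggyback stolen/blocked time
 * accounting on every timer interrupt.
 */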
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

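/*
 * Per-cpu timer setup: bind VIRQ_TIMER to xen_timer_interrupt, copy the
 * chosen clockevent template into this cpu's device, and register the
 * runstate area.  The device itself is registered separately via
 * xen_setup_cpu_clockevents().
 */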
void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

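/* Unbind a cpu's timer irq; cpu 0's timer is never torn down. */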
void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	BUG_ON(cpu == 0);
	evt = &per_cpu(xen_clock_events, cpu);
	unbind_from_irqhandler(evt->irq, NULL);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

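/*
 * Resume hook: when the vcpuop interface is in use, make sure the legacy
 * periodic tick (presumably re-armed across save/restore) is stopped
 * again on every online vcpu.
 */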
void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}

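/*
 * Boot-time initialisation: register the clocksource, probe for the
 * vcpuop timer interface by trying to stop the periodic tick, seed
 * xtime/wall_to_monotonic from the Xen wallclock, force the TSC feature
 * flag (presumably because pvclock reads depend on a usable TSC), and
 * set up the boot cpu's timer and clockevent device.
 */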
__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}