/*
 * sched_clock() for unstable CPU clocks
 *
 *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
 *
 *  Updates and enhancements:
 *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
 *
 * Based on code by:
 *   Ingo Molnar <mingo@redhat.com>
 *   Guillaume Chazarain <guichaz@gmail.com>
 *
 *
 * What this file implements:
 *
 * cpu_clock(i) provides a fast (execution time) high resolution
 * clock with bounded drift between CPUs. The value of cpu_clock(i)
 * is monotonic for constant i. The timestamp returned is in nanoseconds.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 *
 * There is no strict promise about the base, although it tends to start
 * at 0 on boot (but people really shouldn't rely on that).
 *
 * cpu_clock(i)       -- can be used from any context, including NMI.
 * local_clock()      -- is cpu_clock() on the current CPU.
 *
 * sched_clock_cpu(i) -- the per-CPU primitive the two interfaces above are
 *                       built on.
 *
 * How it is implemented:
 *
 * When !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, the implementation simply uses
 * sched_clock(), which is then assumed to provide these properties by itself
 * (mostly it means the architecture provides a globally synchronized highres
 * time source).
 *
 * Otherwise it tries to create a semi-stable clock from a mixture of other
 * clocks, including:
 *
 *  - GTOD (clock monotonic)
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
 * deltas are filtered to provide monotonicity and to keep the result within an
 * expected window.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 */
#include "sched.h"

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __weak sched_clock(void)
{
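	/*
	 * Jiffies-based fallback: (jiffies - INITIAL_JIFFIES) ticks have
	 * elapsed since boot, each worth NSEC_PER_SEC / HZ nanoseconds, so
	 * the resolution of this default clock is one jiffy.
	 */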
	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
					* (NSEC_PER_SEC / HZ);
}
EXPORT_SYMBOL_GPL(sched_clock);

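/*
 * sched_clock_running: 0 until sched_clock_init() has run, 1 afterwards,
 * and 2 once sched_clock_init_late() has run.
 */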
__read_mostly int sched_clock_running;

void sched_clock_init(void)
{
	sched_clock_running = 1;
}

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * We must start with !__sched_clock_stable because the unstable -> stable
 * transition is accurate, while the stable -> unstable transition is not.
 *
 * Similarly we start with __sched_clock_stable_early = 1, thereby assuming we
 * will become stable, such that there's only a single 1 -> 0 transition.
 */
static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
static int __sched_clock_stable_early = 1;

/*
 * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
 */
__read_mostly u64 __sched_clock_offset;
static __read_mostly u64 __gtod_offset;

struct sched_clock_data {
	u64			tick_raw;
	u64			tick_gtod;
	u64			clock;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);

static inline struct sched_clock_data *this_scd(void)
{
	return this_cpu_ptr(&sched_clock_data);
}

static inline struct sched_clock_data *cpu_sdc(int cpu)
{
	return &per_cpu(sched_clock_data, cpu);
}

int sched_clock_stable(void)
{
	return static_branch_likely(&__sched_clock_stable);
}

static void __scd_stamp(struct sched_clock_data *scd)
{
	scd->tick_gtod = ktime_get_ns();
	scd->tick_raw = sched_clock();
}

static void __set_sched_clock_stable(void)
{
	struct sched_clock_data *scd;

	/*
	 * Since we're still unstable and the tick is already running, we have
	 * to disable IRQs in order to get a consistent scd->tick* reading.
	 */
	local_irq_disable();
	scd = this_scd();
	/*
	 * Attempt to make the (initial) unstable->stable transition continuous.
	 */
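	/*
	 * That is, pick __sched_clock_offset such that, as of the last tick,
	 * sched_clock() + __sched_clock_offset == ktime_get_ns() + __gtod_offset
	 * (using the scd->tick_raw / scd->tick_gtod pair sampled there).
	 */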
	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
	local_irq_enable();

	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
			scd->tick_gtod, __gtod_offset,
			scd->tick_raw,  __sched_clock_offset);

	static_branch_enable(&__sched_clock_stable);
	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}

/*
 * If we ever get here, we're screwed, because we found out -- typically after
 * the fact -- that TSC wasn't good. This means all our clocksources (including
 * ktime) could have reported wrong values.
 *
 * What we do here is an attempt to fix up and continue sort of where we left
 * off in a coherent manner.
 *
 * The only way to fully avoid random clock jumps is to boot with:
 * "tsc=unstable".
 */
static void __sched_clock_work(struct work_struct *work)
{
	struct sched_clock_data *scd;
	int cpu;

	/* take a current timestamp and set 'now' */
	preempt_disable();
	scd = this_scd();
	__scd_stamp(scd);
	scd->clock = scd->tick_gtod + __gtod_offset;
	preempt_enable();

	/* clone to all CPUs */
	for_each_possible_cpu(cpu)
		per_cpu(sched_clock_data, cpu) = *scd;

	printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
			scd->tick_gtod, __gtod_offset,
			scd->tick_raw,  __sched_clock_offset);

	static_branch_disable(&__sched_clock_stable);
}

static DECLARE_WORK(sched_clock_work, __sched_clock_work);

static void __clear_sched_clock_stable(void)
{
	if (!sched_clock_stable())
		return;

	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
	schedule_work(&sched_clock_work);
}

void clear_sched_clock_stable(void)
{
	__sched_clock_stable_early = 0;

	smp_mb(); /* matches sched_clock_init_late() */

	if (sched_clock_running == 2)
		__clear_sched_clock_stable();
}

/*
 * We run this as late_initcall() such that it runs after all built-in drivers,
 * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
 */
static int __init sched_clock_init_late(void)
{
	sched_clock_running = 2;
	/*
	 * Ensure that it is impossible to not do a static_key update.
	 *
	 * Either {set,clear}_sched_clock_stable() must see sched_clock_running
	 * and do the update, or we must see their __sched_clock_stable_early
	 * and do the update, or both.
	 */
	smp_mb(); /* matches {set,clear}_sched_clock_stable() */

	if (__sched_clock_stable_early)
		__set_sched_clock_stable();

	return 0;
}
late_initcall(sched_clock_init_late);

/*
 * min, max except they take wrapping into account
 */
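/*
 * I.e. the comparison is done on the signed difference: (s64)(x - y) is
 * negative exactly when x is "before" y modulo 2^64, which gives the right
 * answer across u64 wraparound as long as the two values are less than
 * 2^63 apart.
 */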

static inline u64 wrap_min(u64 x, u64 y)
{
	return (s64)(x - y) < 0 ? x : y;
}

static inline u64 wrap_max(u64 x, u64 y)
{
	return (s64)(x - y) > 0 ? x : y;
}

/*
 * Update the percpu scd from a fresh sched_clock() reading:
 *
 *  - filter out backward motion
 *  - use the GTOD tick value to create a window to filter crazy TSC values
 */
static u64 sched_clock_local(struct sched_clock_data *scd)
{
	u64 now, clock, old_clock, min_clock, max_clock, gtod;
	s64 delta;

again:
	now = sched_clock();
	delta = now - scd->tick_raw;
	if (unlikely(delta < 0))
		delta = 0;

	old_clock = scd->clock;

	/*
	 * scd->clock = clamp(scd->tick_gtod + delta,
	 *		      max(scd->tick_gtod, scd->clock),
	 *		      scd->tick_gtod + TICK_NSEC);
	 */
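	/*
	 * Worked example (hypothetical numbers, taking __gtod_offset == 0 and
	 * HZ == 1000, i.e. TICK_NSEC == 1000000): with tick_gtod == 10000000
	 * and a raw delta of 3000000 since the last tick, the candidate clock
	 * 13000000 gets clamped to 11000000 (one tick past the GTOD base),
	 * provided old_clock does not already exceed that.
	 */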

	gtod = scd->tick_gtod + __gtod_offset;
	clock = gtod + delta;
	min_clock = wrap_max(gtod, old_clock);
	max_clock = wrap_max(old_clock, gtod + TICK_NSEC);

	clock = wrap_max(clock, min_clock);
	clock = wrap_min(clock, max_clock);

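	/* If scd->clock changed under us (e.g. from NMI context), recompute. */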
	if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
		goto again;

	return clock;
}

static u64 sched_clock_remote(struct sched_clock_data *scd)
{
	struct sched_clock_data *my_scd = this_scd();
	u64 this_clock, remote_clock;
	u64 *ptr, old_val, val;

#if BITS_PER_LONG != 64
again:
	/*
	 * Careful here: The local and the remote clock values need to
	 * be read out atomically as we need to compare the values and
	 * then update either the local or the remote side. So the
	 * cmpxchg64 below only protects one readout.
	 *
	 * We must reread via sched_clock_local() in the retry case on
	 * 32-bit kernels as an NMI could use sched_clock_local() via the
	 * tracer and hit between the readout of the low 32-bit and the
	 * high 32-bit portion.
	 */
	this_clock = sched_clock_local(my_scd);
	/*
	 * We must enforce atomic readout on 32-bit, otherwise the
	 * update on the remote CPU can hit in between the readout of
	 * the low 32-bit and the high 32-bit portion.
	 */
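	/*
	 * cmpxchg64() with old == new == 0 either performs a harmless 0 -> 0
	 * swap or fails and returns the current value; either way it yields
	 * an atomic 64-bit readout of scd->clock.
	 */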
	remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
	/*
	 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
	 * update, so we can avoid the above 32-bit dance.
	 */
	sched_clock_local(my_scd);
again:
	this_clock = my_scd->clock;
	remote_clock = scd->clock;
#endif

	/*
	 * Use the opportunity that we have both clock values read out to
	 * couple the two clocks: we take the larger time as the latest time
	 * for both CPUs. (This creates monotonic movement.)
	 */
	if (likely((s64)(remote_clock - this_clock) < 0)) {
		ptr = &scd->clock;
		old_val = remote_clock;
		val = this_clock;
	} else {
		/*
		 * Should be rare, but possible:
		 */
		ptr = &my_scd->clock;
		old_val = this_clock;
		val = remote_clock;
	}

	if (cmpxchg64(ptr, old_val, val) != old_val)
		goto again;

	return val;
}

/*
 * Similar to cpu_clock() -- return the (possibly remote) per-CPU clock.
 *
 * See cpu_clock().
 */
u64 sched_clock_cpu(int cpu)
{
	struct sched_clock_data *scd;
	u64 clock;

	if (sched_clock_stable())
		return sched_clock() + __sched_clock_offset;

	if (unlikely(!sched_clock_running))
		return 0ull;

	preempt_disable_notrace();
	scd = cpu_sdc(cpu);

	if (cpu != smp_processor_id())
		clock = sched_clock_remote(scd);
	else
		clock = sched_clock_local(scd);
	preempt_enable_notrace();

	return clock;
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);

void sched_clock_tick(void)
{
	struct sched_clock_data *scd;

	if (sched_clock_stable())
		return;

	if (unlikely(!sched_clock_running))
		return;

	lockdep_assert_irqs_disabled();

	scd = this_scd();
	__scd_stamp(scd);
	sched_clock_local(scd);
}

void sched_clock_tick_stable(void)
{
	u64 gtod, clock;

	if (!sched_clock_stable())
		return;

	/*
	 * Called under watchdog_lock.
	 *
	 * The watchdog just found this TSC to (still) be stable, so now is a
	 * good moment to update our __gtod_offset. Because once we find the
	 * TSC to be unstable, any computation will be computing crap.
	 */
	local_irq_disable();
	gtod = ktime_get_ns();
	clock = sched_clock();
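	/*
	 * Re-establish the invariant from above:
	 * ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
	 */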
	__gtod_offset = (clock + __sched_clock_offset) - gtod;
	local_irq_enable();
}

/*
 * We are going deep-idle (irqs are disabled):
 */
void sched_clock_idle_sleep_event(void)
{
	sched_clock_cpu(smp_processor_id());
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled; resync with ktime.
 */
void sched_clock_idle_wakeup_event(void)
{
	unsigned long flags;

	if (sched_clock_stable())
		return;

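	/* ktime is not usable while timekeeping is suspended (suspend/resume path). */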
	if (unlikely(timekeeping_suspended))
		return;

	local_irq_save(flags);
	sched_clock_tick();
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

u64 sched_clock_cpu(int cpu)
{
	if (unlikely(!sched_clock_running))
		return 0;

	return sched_clock();
}

#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

/*
 * Running clock - returns the time that has elapsed while the guest has been
 * running.
 * On a guest this value should be local_clock() minus the time the guest was
 * suspended by the hypervisor (for any reason).
 * On bare metal this function should return the same as local_clock().
 * Architectures and sub-architectures can override this.
 */
u64 __weak running_clock(void)
{
	return local_clock();
}
459