xref: /openbmc/linux/kernel/time/clocksource.c (revision ed84ef1c)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * This file contains the functions which manage clocksource drivers.
4  *
5  * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
6  */
7 
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/device.h>
11 #include <linux/clocksource.h>
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
15 #include <linux/tick.h>
16 #include <linux/kthread.h>
17 #include <linux/prandom.h>
18 #include <linux/cpu.h>
19 
20 #include "tick-internal.h"
21 #include "timekeeping_internal.h"
22 
23 /**
24  * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
25  * @mult:	pointer to mult variable
26  * @shift:	pointer to shift variable
27  * @from:	frequency to convert from
28  * @to:		frequency to convert to
29  * @maxsec:	guaranteed runtime conversion range in seconds
30  *
31  * The function evaluates the shift/mult pair for the scaled math
32  * operations of clocksources and clockevents.
33  *
34  * @to and @from are frequency values in HZ. For clock sources @to is
35  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
36  * event @to is the counter frequency and @from is NSEC_PER_SEC.
37  *
38  * The @maxsec conversion range argument controls the time frame in
39  * seconds which must be covered by the runtime conversion with the
40  * calculated mult and shift factors. This guarantees that no 64bit
41  * overflow happens when the input value of the conversion is
42  * multiplied with the calculated mult factor. Larger ranges may
43  * reduce the conversion accuracy by choosing smaller mult and shift
44  * factors.
45  */
46 void
47 clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
48 {
49 	u64 tmp;
50 	u32 sft, sftacc= 32;
51 
52 	/*
53 	 * Calculate the shift factor which is limiting the conversion
54 	 * range:
55 	 */
56 	tmp = ((u64)maxsec * from) >> 32;
57 	while (tmp) {
58 		tmp >>=1;
59 		sftacc--;
60 	}
61 
62 	/*
63 	 * Find the conversion shift/mult pair which has the best
64 	 * accuracy and fits the maxsec conversion range:
65 	 */
66 	for (sft = 32; sft > 0; sft--) {
67 		tmp = (u64) to << sft;
68 		tmp += from / 2;
69 		do_div(tmp, from);
70 		if ((tmp >> sftacc) == 0)
71 			break;
72 	}
73 	*mult = tmp;
74 	*shift = sft;
75 }
76 EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
77 
78 /*[Clocksource internal variables]---------
79  * curr_clocksource:
80  *	currently selected clocksource.
81  * suspend_clocksource:
82  *	used to calculate the suspend time.
83  * clocksource_list:
84  *	linked list with the registered clocksources
85  * clocksource_mutex:
86  *	protects manipulations to curr_clocksource and the clocksource_list
87  * override_name:
88  *	Name of the user-specified clocksource.
89  */
90 static struct clocksource *curr_clocksource;
91 static struct clocksource *suspend_clocksource;
92 static LIST_HEAD(clocksource_list);
93 static DEFINE_MUTEX(clocksource_mutex);
94 static char override_name[CS_NAME_LEN];
95 static int finished_booting;
96 static u64 suspend_start;
97 
/*
 * Watchdog threshold: 1/32 of a second (0.0312s), 0.0625s when doubled.
 * It is also the default for cs->uncertainty_margin when registering
 * clocks.
 */
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC / 32)

/*
 * Largest delay tolerated between the two watchdog clocksource readouts
 * that surround a read of the clocksource being validated. SMIs, NMIs
 * or VCPU preemptions can cause such a delay. It is also the lower
 * bound for cs->uncertainty_margin values when registering clocks.
 */
#define WATCHDOG_MAX_SKEW (NSEC_PER_USEC * 50)
111 
112 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
113 static void clocksource_watchdog_work(struct work_struct *work);
114 static void clocksource_select(void);
115 
116 static LIST_HEAD(watchdog_list);
117 static struct clocksource *watchdog;
118 static struct timer_list watchdog_timer;
119 static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
120 static DEFINE_SPINLOCK(watchdog_lock);
121 static int watchdog_running;
122 static atomic_t watchdog_reset_pending;
123 
124 static inline void clocksource_watchdog_lock(unsigned long *flags)
125 {
126 	spin_lock_irqsave(&watchdog_lock, *flags);
127 }
128 
129 static inline void clocksource_watchdog_unlock(unsigned long *flags)
130 {
131 	spin_unlock_irqrestore(&watchdog_lock, *flags);
132 }
133 
/* Both are used by the watchdog code before they are defined. */
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);

/*
 * Watchdog timer interval, in jiffies: half a second.
 */
#define WATCHDOG_INTERVAL (HZ / 2)
141 
142 static void clocksource_watchdog_work(struct work_struct *work)
143 {
144 	/*
145 	 * We cannot directly run clocksource_watchdog_kthread() here, because
146 	 * clocksource_select() calls timekeeping_notify() which uses
147 	 * stop_machine(). One cannot use stop_machine() from a workqueue() due
148 	 * lock inversions wrt CPU hotplug.
149 	 *
150 	 * Also, we only ever run this work once or twice during the lifetime
151 	 * of the kernel, so there is no point in creating a more permanent
152 	 * kthread for this.
153 	 *
154 	 * If kthread_run fails the next watchdog scan over the
155 	 * watchdog_list will find the unstable clock again.
156 	 */
157 	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
158 }
159 
160 static void __clocksource_unstable(struct clocksource *cs)
161 {
162 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
163 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
164 
165 	/*
166 	 * If the clocksource is registered clocksource_watchdog_kthread() will
167 	 * re-rate and re-select.
168 	 */
169 	if (list_empty(&cs->list)) {
170 		cs->rating = 0;
171 		return;
172 	}
173 
174 	if (cs->mark_unstable)
175 		cs->mark_unstable(cs);
176 
177 	/* kick clocksource_watchdog_kthread() */
178 	if (finished_booting)
179 		schedule_work(&watchdog_work);
180 }
181 
182 /**
183  * clocksource_mark_unstable - mark clocksource unstable via watchdog
184  * @cs:		clocksource to be marked unstable
185  *
186  * This function is called by the x86 TSC code to mark clocksources as unstable;
187  * it defers demotion and re-selection to a kthread.
188  */
189 void clocksource_mark_unstable(struct clocksource *cs)
190 {
191 	unsigned long flags;
192 
193 	spin_lock_irqsave(&watchdog_lock, flags);
194 	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
195 		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
196 			list_add(&cs->wd_list, &watchdog_list);
197 		__clocksource_unstable(cs);
198 	}
199 	spin_unlock_irqrestore(&watchdog_lock, flags);
200 }
201 
202 ulong max_cswd_read_retries = 3;
203 module_param(max_cswd_read_retries, ulong, 0644);
204 EXPORT_SYMBOL_GPL(max_cswd_read_retries);
205 static int verify_n_cpus = 8;
206 module_param(verify_n_cpus, int, 0644);
207 
208 static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
209 {
210 	unsigned int nretries;
211 	u64 wd_end, wd_delta;
212 	int64_t wd_delay;
213 
214 	for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
215 		local_irq_disable();
216 		*wdnow = watchdog->read(watchdog);
217 		*csnow = cs->read(cs);
218 		wd_end = watchdog->read(watchdog);
219 		local_irq_enable();
220 
221 		wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
222 		wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
223 					      watchdog->shift);
224 		if (wd_delay <= WATCHDOG_MAX_SKEW) {
225 			if (nretries > 1 || nretries >= max_cswd_read_retries) {
226 				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
227 					smp_processor_id(), watchdog->name, nretries);
228 			}
229 			return true;
230 		}
231 	}
232 
233 	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
234 		smp_processor_id(), watchdog->name, wd_delay, nretries);
235 	return false;
236 }
237 
238 static u64 csnow_mid;
239 static cpumask_t cpus_ahead;
240 static cpumask_t cpus_behind;
241 static cpumask_t cpus_chosen;
242 
243 static void clocksource_verify_choose_cpus(void)
244 {
245 	int cpu, i, n = verify_n_cpus;
246 
247 	if (n < 0) {
248 		/* Check all of the CPUs. */
249 		cpumask_copy(&cpus_chosen, cpu_online_mask);
250 		cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
251 		return;
252 	}
253 
254 	/* If no checking desired, or no other CPU to check, leave. */
255 	cpumask_clear(&cpus_chosen);
256 	if (n == 0 || num_online_cpus() <= 1)
257 		return;
258 
259 	/* Make sure to select at least one CPU other than the current CPU. */
260 	cpu = cpumask_next(-1, cpu_online_mask);
261 	if (cpu == smp_processor_id())
262 		cpu = cpumask_next(cpu, cpu_online_mask);
263 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
264 		return;
265 	cpumask_set_cpu(cpu, &cpus_chosen);
266 
267 	/* Force a sane value for the boot parameter. */
268 	if (n > nr_cpu_ids)
269 		n = nr_cpu_ids;
270 
271 	/*
272 	 * Randomly select the specified number of CPUs.  If the same
273 	 * CPU is selected multiple times, that CPU is checked only once,
274 	 * and no replacement CPU is selected.  This gracefully handles
275 	 * situations where verify_n_cpus is greater than the number of
276 	 * CPUs that are currently online.
277 	 */
278 	for (i = 1; i < n; i++) {
279 		cpu = prandom_u32() % nr_cpu_ids;
280 		cpu = cpumask_next(cpu - 1, cpu_online_mask);
281 		if (cpu >= nr_cpu_ids)
282 			cpu = cpumask_next(-1, cpu_online_mask);
283 		if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
284 			cpumask_set_cpu(cpu, &cpus_chosen);
285 	}
286 
287 	/* Don't verify ourselves. */
288 	cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
289 }
290 
291 static void clocksource_verify_one_cpu(void *csin)
292 {
293 	struct clocksource *cs = (struct clocksource *)csin;
294 
295 	csnow_mid = cs->read(cs);
296 }
297 
298 void clocksource_verify_percpu(struct clocksource *cs)
299 {
300 	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
301 	u64 csnow_begin, csnow_end;
302 	int cpu, testcpu;
303 	s64 delta;
304 
305 	if (verify_n_cpus == 0)
306 		return;
307 	cpumask_clear(&cpus_ahead);
308 	cpumask_clear(&cpus_behind);
309 	cpus_read_lock();
310 	preempt_disable();
311 	clocksource_verify_choose_cpus();
312 	if (cpumask_weight(&cpus_chosen) == 0) {
313 		preempt_enable();
314 		cpus_read_unlock();
315 		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
316 		return;
317 	}
318 	testcpu = smp_processor_id();
319 	pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
320 	for_each_cpu(cpu, &cpus_chosen) {
321 		if (cpu == testcpu)
322 			continue;
323 		csnow_begin = cs->read(cs);
324 		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
325 		csnow_end = cs->read(cs);
326 		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
327 		if (delta < 0)
328 			cpumask_set_cpu(cpu, &cpus_behind);
329 		delta = (csnow_end - csnow_mid) & cs->mask;
330 		if (delta < 0)
331 			cpumask_set_cpu(cpu, &cpus_ahead);
332 		delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
333 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
334 		if (cs_nsec > cs_nsec_max)
335 			cs_nsec_max = cs_nsec;
336 		if (cs_nsec < cs_nsec_min)
337 			cs_nsec_min = cs_nsec;
338 	}
339 	preempt_enable();
340 	cpus_read_unlock();
341 	if (!cpumask_empty(&cpus_ahead))
342 		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
343 			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
344 	if (!cpumask_empty(&cpus_behind))
345 		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
346 			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
347 	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
348 		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
349 			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
350 }
351 EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
352 
353 static void clocksource_watchdog(struct timer_list *unused)
354 {
355 	u64 csnow, wdnow, cslast, wdlast, delta;
356 	int next_cpu, reset_pending;
357 	int64_t wd_nsec, cs_nsec;
358 	struct clocksource *cs;
359 	u32 md;
360 
361 	spin_lock(&watchdog_lock);
362 	if (!watchdog_running)
363 		goto out;
364 
365 	reset_pending = atomic_read(&watchdog_reset_pending);
366 
367 	list_for_each_entry(cs, &watchdog_list, wd_list) {
368 
369 		/* Clocksource already marked unstable? */
370 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
371 			if (finished_booting)
372 				schedule_work(&watchdog_work);
373 			continue;
374 		}
375 
376 		if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
377 			/* Clock readout unreliable, so give it up. */
378 			__clocksource_unstable(cs);
379 			continue;
380 		}
381 
382 		/* Clocksource initialized ? */
383 		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
384 		    atomic_read(&watchdog_reset_pending)) {
385 			cs->flags |= CLOCK_SOURCE_WATCHDOG;
386 			cs->wd_last = wdnow;
387 			cs->cs_last = csnow;
388 			continue;
389 		}
390 
391 		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
392 		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
393 					     watchdog->shift);
394 
395 		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
396 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
397 		wdlast = cs->wd_last; /* save these in case we print them */
398 		cslast = cs->cs_last;
399 		cs->cs_last = csnow;
400 		cs->wd_last = wdnow;
401 
402 		if (atomic_read(&watchdog_reset_pending))
403 			continue;
404 
405 		/* Check the deviation from the watchdog clocksource. */
406 		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
407 		if (abs(cs_nsec - wd_nsec) > md) {
408 			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
409 				smp_processor_id(), cs->name);
410 			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
411 				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
412 			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
413 				cs->name, cs_nsec, csnow, cslast, cs->mask);
414 			if (curr_clocksource == cs)
415 				pr_warn("                      '%s' is current clocksource.\n", cs->name);
416 			else if (curr_clocksource)
417 				pr_warn("                      '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
418 			else
419 				pr_warn("                      No current clocksource.\n");
420 			__clocksource_unstable(cs);
421 			continue;
422 		}
423 
424 		if (cs == curr_clocksource && cs->tick_stable)
425 			cs->tick_stable(cs);
426 
427 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
428 		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
429 		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
430 			/* Mark it valid for high-res. */
431 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
432 
433 			/*
434 			 * clocksource_done_booting() will sort it if
435 			 * finished_booting is not set yet.
436 			 */
437 			if (!finished_booting)
438 				continue;
439 
440 			/*
441 			 * If this is not the current clocksource let
442 			 * the watchdog thread reselect it. Due to the
443 			 * change to high res this clocksource might
444 			 * be preferred now. If it is the current
445 			 * clocksource let the tick code know about
446 			 * that change.
447 			 */
448 			if (cs != curr_clocksource) {
449 				cs->flags |= CLOCK_SOURCE_RESELECT;
450 				schedule_work(&watchdog_work);
451 			} else {
452 				tick_clock_notify();
453 			}
454 		}
455 	}
456 
457 	/*
458 	 * We only clear the watchdog_reset_pending, when we did a
459 	 * full cycle through all clocksources.
460 	 */
461 	if (reset_pending)
462 		atomic_dec(&watchdog_reset_pending);
463 
464 	/*
465 	 * Cycle through CPUs to check if the CPUs stay synchronized
466 	 * to each other.
467 	 */
468 	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
469 	if (next_cpu >= nr_cpu_ids)
470 		next_cpu = cpumask_first(cpu_online_mask);
471 
472 	/*
473 	 * Arm timer if not already pending: could race with concurrent
474 	 * pair clocksource_stop_watchdog() clocksource_start_watchdog().
475 	 */
476 	if (!timer_pending(&watchdog_timer)) {
477 		watchdog_timer.expires += WATCHDOG_INTERVAL;
478 		add_timer_on(&watchdog_timer, next_cpu);
479 	}
480 out:
481 	spin_unlock(&watchdog_lock);
482 }
483 
484 static inline void clocksource_start_watchdog(void)
485 {
486 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
487 		return;
488 	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
489 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
490 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
491 	watchdog_running = 1;
492 }
493 
494 static inline void clocksource_stop_watchdog(void)
495 {
496 	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
497 		return;
498 	del_timer(&watchdog_timer);
499 	watchdog_running = 0;
500 }
501 
502 static inline void clocksource_reset_watchdog(void)
503 {
504 	struct clocksource *cs;
505 
506 	list_for_each_entry(cs, &watchdog_list, wd_list)
507 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
508 }
509 
510 static void clocksource_resume_watchdog(void)
511 {
512 	atomic_inc(&watchdog_reset_pending);
513 }
514 
515 static void clocksource_enqueue_watchdog(struct clocksource *cs)
516 {
517 	INIT_LIST_HEAD(&cs->wd_list);
518 
519 	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
520 		/* cs is a clocksource to be watched. */
521 		list_add(&cs->wd_list, &watchdog_list);
522 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
523 	} else {
524 		/* cs is a watchdog. */
525 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
526 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
527 	}
528 }
529 
530 static void clocksource_select_watchdog(bool fallback)
531 {
532 	struct clocksource *cs, *old_wd;
533 	unsigned long flags;
534 
535 	spin_lock_irqsave(&watchdog_lock, flags);
536 	/* save current watchdog */
537 	old_wd = watchdog;
538 	if (fallback)
539 		watchdog = NULL;
540 
541 	list_for_each_entry(cs, &clocksource_list, list) {
542 		/* cs is a clocksource to be watched. */
543 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
544 			continue;
545 
546 		/* Skip current if we were requested for a fallback. */
547 		if (fallback && cs == old_wd)
548 			continue;
549 
550 		/* Pick the best watchdog. */
551 		if (!watchdog || cs->rating > watchdog->rating)
552 			watchdog = cs;
553 	}
554 	/* If we failed to find a fallback restore the old one. */
555 	if (!watchdog)
556 		watchdog = old_wd;
557 
558 	/* If we changed the watchdog we need to reset cycles. */
559 	if (watchdog != old_wd)
560 		clocksource_reset_watchdog();
561 
562 	/* Check if the watchdog timer needs to be started. */
563 	clocksource_start_watchdog();
564 	spin_unlock_irqrestore(&watchdog_lock, flags);
565 }
566 
567 static void clocksource_dequeue_watchdog(struct clocksource *cs)
568 {
569 	if (cs != watchdog) {
570 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
571 			/* cs is a watched clocksource. */
572 			list_del_init(&cs->wd_list);
573 			/* Check if the watchdog timer needs to be stopped. */
574 			clocksource_stop_watchdog();
575 		}
576 	}
577 }
578 
579 static int __clocksource_watchdog_kthread(void)
580 {
581 	struct clocksource *cs, *tmp;
582 	unsigned long flags;
583 	int select = 0;
584 
585 	/* Do any required per-CPU skew verification. */
586 	if (curr_clocksource &&
587 	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
588 	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
589 		clocksource_verify_percpu(curr_clocksource);
590 
591 	spin_lock_irqsave(&watchdog_lock, flags);
592 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
593 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
594 			list_del_init(&cs->wd_list);
595 			__clocksource_change_rating(cs, 0);
596 			select = 1;
597 		}
598 		if (cs->flags & CLOCK_SOURCE_RESELECT) {
599 			cs->flags &= ~CLOCK_SOURCE_RESELECT;
600 			select = 1;
601 		}
602 	}
603 	/* Check if the watchdog timer needs to be stopped. */
604 	clocksource_stop_watchdog();
605 	spin_unlock_irqrestore(&watchdog_lock, flags);
606 
607 	return select;
608 }
609 
610 static int clocksource_watchdog_kthread(void *data)
611 {
612 	mutex_lock(&clocksource_mutex);
613 	if (__clocksource_watchdog_kthread())
614 		clocksource_select();
615 	mutex_unlock(&clocksource_mutex);
616 	return 0;
617 }
618 
619 static bool clocksource_is_watchdog(struct clocksource *cs)
620 {
621 	return cs == watchdog;
622 }
623 
624 #else /* CONFIG_CLOCKSOURCE_WATCHDOG */
625 
626 static void clocksource_enqueue_watchdog(struct clocksource *cs)
627 {
628 	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
629 		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
630 }
631 
632 static void clocksource_select_watchdog(bool fallback) { }
633 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
634 static inline void clocksource_resume_watchdog(void) { }
635 static inline int __clocksource_watchdog_kthread(void) { return 0; }
636 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
637 void clocksource_mark_unstable(struct clocksource *cs) { }
638 
639 static inline void clocksource_watchdog_lock(unsigned long *flags) { }
640 static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
641 
642 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
643 
644 static bool clocksource_is_suspend(struct clocksource *cs)
645 {
646 	return cs == suspend_clocksource;
647 }
648 
649 static void __clocksource_suspend_select(struct clocksource *cs)
650 {
651 	/*
652 	 * Skip the clocksource which will be stopped in suspend state.
653 	 */
654 	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
655 		return;
656 
657 	/*
658 	 * The nonstop clocksource can be selected as the suspend clocksource to
659 	 * calculate the suspend time, so it should not supply suspend/resume
660 	 * interfaces to suspend the nonstop clocksource when system suspends.
661 	 */
662 	if (cs->suspend || cs->resume) {
663 		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
664 			cs->name);
665 	}
666 
667 	/* Pick the best rating. */
668 	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
669 		suspend_clocksource = cs;
670 }
671 
672 /**
673  * clocksource_suspend_select - Select the best clocksource for suspend timing
674  * @fallback:	if select a fallback clocksource
675  */
676 static void clocksource_suspend_select(bool fallback)
677 {
678 	struct clocksource *cs, *old_suspend;
679 
680 	old_suspend = suspend_clocksource;
681 	if (fallback)
682 		suspend_clocksource = NULL;
683 
684 	list_for_each_entry(cs, &clocksource_list, list) {
685 		/* Skip current if we were requested for a fallback. */
686 		if (fallback && cs == old_suspend)
687 			continue;
688 
689 		__clocksource_suspend_select(cs);
690 	}
691 }
692 
693 /**
694  * clocksource_start_suspend_timing - Start measuring the suspend timing
695  * @cs:			current clocksource from timekeeping
696  * @start_cycles:	current cycles from timekeeping
697  *
698  * This function will save the start cycle values of suspend timer to calculate
699  * the suspend time when resuming system.
700  *
701  * This function is called late in the suspend process from timekeeping_suspend(),
702  * that means processes are frozen, non-boot cpus and interrupts are disabled
703  * now. It is therefore possible to start the suspend timer without taking the
704  * clocksource mutex.
705  */
706 void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
707 {
708 	if (!suspend_clocksource)
709 		return;
710 
711 	/*
712 	 * If current clocksource is the suspend timer, we should use the
713 	 * tkr_mono.cycle_last value as suspend_start to avoid same reading
714 	 * from suspend timer.
715 	 */
716 	if (clocksource_is_suspend(cs)) {
717 		suspend_start = start_cycles;
718 		return;
719 	}
720 
721 	if (suspend_clocksource->enable &&
722 	    suspend_clocksource->enable(suspend_clocksource)) {
723 		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
724 		return;
725 	}
726 
727 	suspend_start = suspend_clocksource->read(suspend_clocksource);
728 }
729 
730 /**
731  * clocksource_stop_suspend_timing - Stop measuring the suspend timing
732  * @cs:		current clocksource from timekeeping
733  * @cycle_now:	current cycles from timekeeping
734  *
735  * This function will calculate the suspend time from suspend timer.
736  *
737  * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
738  *
739  * This function is called early in the resume process from timekeeping_resume(),
740  * that means there is only one cpu, no processes are running and the interrupts
741  * are disabled. It is therefore possible to stop the suspend timer without
742  * taking the clocksource mutex.
743  */
744 u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
745 {
746 	u64 now, delta, nsec = 0;
747 
748 	if (!suspend_clocksource)
749 		return 0;
750 
751 	/*
752 	 * If current clocksource is the suspend timer, we should use the
753 	 * tkr_mono.cycle_last value from timekeeping as current cycle to
754 	 * avoid same reading from suspend timer.
755 	 */
756 	if (clocksource_is_suspend(cs))
757 		now = cycle_now;
758 	else
759 		now = suspend_clocksource->read(suspend_clocksource);
760 
761 	if (now > suspend_start) {
762 		delta = clocksource_delta(now, suspend_start,
763 					  suspend_clocksource->mask);
764 		nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
765 				       suspend_clocksource->shift);
766 	}
767 
768 	/*
769 	 * Disable the suspend timer to save power if current clocksource is
770 	 * not the suspend timer.
771 	 */
772 	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
773 		suspend_clocksource->disable(suspend_clocksource);
774 
775 	return nsec;
776 }
777 
778 /**
779  * clocksource_suspend - suspend the clocksource(s)
780  */
781 void clocksource_suspend(void)
782 {
783 	struct clocksource *cs;
784 
785 	list_for_each_entry_reverse(cs, &clocksource_list, list)
786 		if (cs->suspend)
787 			cs->suspend(cs);
788 }
789 
790 /**
791  * clocksource_resume - resume the clocksource(s)
792  */
793 void clocksource_resume(void)
794 {
795 	struct clocksource *cs;
796 
797 	list_for_each_entry(cs, &clocksource_list, list)
798 		if (cs->resume)
799 			cs->resume(cs);
800 
801 	clocksource_resume_watchdog();
802 }
803 
/**
 * clocksource_touch_watchdog - Update watchdog
 *
 * To be called after exception contexts such as kgdb, so that the time
 * spent there does not incorrectly trip the watchdog. This might fail
 * when the kernel was stopped in code which holds watchdog_lock.
 */
void clocksource_touch_watchdog(void)
{
	clocksource_resume_watchdog();
}
815 
816 /**
817  * clocksource_max_adjustment- Returns max adjustment amount
818  * @cs:         Pointer to clocksource
819  *
820  */
821 static u32 clocksource_max_adjustment(struct clocksource *cs)
822 {
823 	u64 ret;
824 	/*
825 	 * We won't try to correct for more than 11% adjustments (110,000 ppm),
826 	 */
827 	ret = (u64)cs->mult * 11;
828 	do_div(ret,100);
829 	return (u32)ret;
830 }
831 
832 /**
833  * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
834  * @mult:	cycle to nanosecond multiplier
835  * @shift:	cycle to nanosecond divisor (power of two)
836  * @maxadj:	maximum adjustment value to mult (~11%)
837  * @mask:	bitmask for two's complement subtraction of non 64 bit counters
838  * @max_cyc:	maximum cycle value before potential overflow (does not include
839  *		any safety margin)
840  *
841  * NOTE: This function includes a safety margin of 50%, in other words, we
842  * return half the number of nanoseconds the hardware counter can technically
843  * cover. This is done so that we can potentially detect problems caused by
844  * delayed timers or bad hardware, which might result in time intervals that
845  * are larger than what the math used can handle without overflows.
846  */
847 u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
848 {
849 	u64 max_nsecs, max_cycles;
850 
851 	/*
852 	 * Calculate the maximum number of cycles that we can pass to the
853 	 * cyc2ns() function without overflowing a 64-bit result.
854 	 */
855 	max_cycles = ULLONG_MAX;
856 	do_div(max_cycles, mult+maxadj);
857 
858 	/*
859 	 * The actual maximum number of cycles we can defer the clocksource is
860 	 * determined by the minimum of max_cycles and mask.
861 	 * Note: Here we subtract the maxadj to make sure we don't sleep for
862 	 * too long if there's a large negative adjustment.
863 	 */
864 	max_cycles = min(max_cycles, mask);
865 	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
866 
867 	/* return the max_cycles value as well if requested */
868 	if (max_cyc)
869 		*max_cyc = max_cycles;
870 
871 	/* Return 50% of the actual maximum, so we can detect bad values */
872 	max_nsecs >>= 1;
873 
874 	return max_nsecs;
875 }
876 
877 /**
878  * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
879  * @cs:         Pointer to clocksource to be updated
880  *
881  */
882 static inline void clocksource_update_max_deferment(struct clocksource *cs)
883 {
884 	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
885 						cs->maxadj, cs->mask,
886 						&cs->max_cycles);
887 }
888 
889 static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
890 {
891 	struct clocksource *cs;
892 
893 	if (!finished_booting || list_empty(&clocksource_list))
894 		return NULL;
895 
896 	/*
897 	 * We pick the clocksource with the highest rating. If oneshot
898 	 * mode is active, we pick the highres valid clocksource with
899 	 * the best rating.
900 	 */
901 	list_for_each_entry(cs, &clocksource_list, list) {
902 		if (skipcur && cs == curr_clocksource)
903 			continue;
904 		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
905 			continue;
906 		return cs;
907 	}
908 	return NULL;
909 }
910 
911 static void __clocksource_select(bool skipcur)
912 {
913 	bool oneshot = tick_oneshot_mode_active();
914 	struct clocksource *best, *cs;
915 
916 	/* Find the best suitable clocksource */
917 	best = clocksource_find_best(oneshot, skipcur);
918 	if (!best)
919 		return;
920 
921 	if (!strlen(override_name))
922 		goto found;
923 
924 	/* Check for the override clocksource. */
925 	list_for_each_entry(cs, &clocksource_list, list) {
926 		if (skipcur && cs == curr_clocksource)
927 			continue;
928 		if (strcmp(cs->name, override_name) != 0)
929 			continue;
930 		/*
931 		 * Check to make sure we don't switch to a non-highres
932 		 * capable clocksource if the tick code is in oneshot
933 		 * mode (highres or nohz)
934 		 */
935 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
936 			/* Override clocksource cannot be used. */
937 			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
938 				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
939 					cs->name);
940 				override_name[0] = 0;
941 			} else {
942 				/*
943 				 * The override cannot be currently verified.
944 				 * Deferring to let the watchdog check.
945 				 */
946 				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
947 					cs->name);
948 			}
949 		} else
950 			/* Override clocksource can be used. */
951 			best = cs;
952 		break;
953 	}
954 
955 found:
956 	if (curr_clocksource != best && !timekeeping_notify(best)) {
957 		pr_info("Switched to clocksource %s\n", best->name);
958 		curr_clocksource = best;
959 	}
960 }
961 
/**
 * clocksource_select - Select the best clocksource available
 *
 * Private function. Must hold clocksource_mutex when called.
 *
 * Select the clocksource with the best rating, or the clocksource,
 * which is selected by userspace override.
 *
 * Calls __clocksource_select() with skipcur == false, so the current
 * clocksource remains an eligible candidate.
 */
static void clocksource_select(void)
{
	__clocksource_select(false);
}
974 
/*
 * Select a replacement for the current clocksource.
 *
 * Calls __clocksource_select() with skipcur == true, so the current
 * clocksource is excluded from the candidates. If no other suitable
 * clocksource is found, curr_clocksource is left unchanged.
 */
static void clocksource_select_fallback(void)
{
	__clocksource_select(true);
}
979 
/*
 * clocksource_done_booting - Called near the end of core bootup
 *
 * Hack to avoid lots of clocksource churn at boot time.
 * We use fs_initcall because we want this to start before
 * device_initcall but after subsys_initcall.
 *
 * With clocksource_mutex held, this resets curr_clocksource to the
 * default clock, sets finished_booting, calls
 * __clocksource_watchdog_kthread() and then performs a regular
 * clocksource selection.
 *
 * Always returns 0.
 */
static int __init clocksource_done_booting(void)
{
	mutex_lock(&clocksource_mutex);
	/* Start from the default clock so the selection below can switch away */
	curr_clocksource = clocksource_default_clock();
	finished_booting = 1;
	/*
	 * Run the watchdog first to eliminate unstable clock sources
	 */
	__clocksource_watchdog_kthread();
	clocksource_select();
	mutex_unlock(&clocksource_mutex);
	return 0;
}
fs_initcall(clocksource_done_booting);
1001 
1002 /*
1003  * Enqueue the clocksource sorted by rating
1004  */
1005 static void clocksource_enqueue(struct clocksource *cs)
1006 {
1007 	struct list_head *entry = &clocksource_list;
1008 	struct clocksource *tmp;
1009 
1010 	list_for_each_entry(tmp, &clocksource_list, list) {
1011 		/* Keep track of the place, where to insert */
1012 		if (tmp->rating < cs->rating)
1013 			break;
1014 		entry = &tmp->list;
1015 	}
1016 	list_add(&cs->list, entry);
1017 }
1018 
1019 /**
1020  * __clocksource_update_freq_scale - Used update clocksource with new freq
1021  * @cs:		clocksource to be registered
1022  * @scale:	Scale factor multiplied against freq to get clocksource hz
1023  * @freq:	clocksource frequency (cycles per second) divided by scale
1024  *
1025  * This should only be called from the clocksource->enable() method.
1026  *
1027  * This *SHOULD NOT* be called directly! Please use the
1028  * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
1029  * functions.
1030  */
1031 void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
1032 {
1033 	u64 sec;
1034 
1035 	/*
1036 	 * Default clocksources are *special* and self-define their mult/shift.
1037 	 * But, you're not special, so you should specify a freq value.
1038 	 */
1039 	if (freq) {
1040 		/*
1041 		 * Calc the maximum number of seconds which we can run before
1042 		 * wrapping around. For clocksources which have a mask > 32-bit
1043 		 * we need to limit the max sleep time to have a good
1044 		 * conversion precision. 10 minutes is still a reasonable
1045 		 * amount. That results in a shift value of 24 for a
1046 		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
1047 		 * ~ 0.06ppm granularity for NTP.
1048 		 */
1049 		sec = cs->mask;
1050 		do_div(sec, freq);
1051 		do_div(sec, scale);
1052 		if (!sec)
1053 			sec = 1;
1054 		else if (sec > 600 && cs->mask > UINT_MAX)
1055 			sec = 600;
1056 
1057 		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
1058 				       NSEC_PER_SEC / scale, sec * scale);
1059 	}
1060 
1061 	/*
1062 	 * If the uncertainty margin is not specified, calculate it.
1063 	 * If both scale and freq are non-zero, calculate the clock
1064 	 * period, but bound below at 2*WATCHDOG_MAX_SKEW.  However,
1065 	 * if either of scale or freq is zero, be very conservative and
1066 	 * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
1067 	 * uncertainty margin.  Allow stupidly small uncertainty margins
1068 	 * to be specified by the caller for testing purposes, but warn
1069 	 * to discourage production use of this capability.
1070 	 */
1071 	if (scale && freq && !cs->uncertainty_margin) {
1072 		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
1073 		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
1074 			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
1075 	} else if (!cs->uncertainty_margin) {
1076 		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
1077 	}
1078 	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
1079 
1080 	/*
1081 	 * Ensure clocksources that have large 'mult' values don't overflow
1082 	 * when adjusted.
1083 	 */
1084 	cs->maxadj = clocksource_max_adjustment(cs);
1085 	while (freq && ((cs->mult + cs->maxadj < cs->mult)
1086 		|| (cs->mult - cs->maxadj > cs->mult))) {
1087 		cs->mult >>= 1;
1088 		cs->shift--;
1089 		cs->maxadj = clocksource_max_adjustment(cs);
1090 	}
1091 
1092 	/*
1093 	 * Only warn for *special* clocksources that self-define
1094 	 * their mult/shift values and don't specify a freq.
1095 	 */
1096 	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
1097 		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
1098 		cs->name);
1099 
1100 	clocksource_update_max_deferment(cs);
1101 
1102 	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
1103 		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
1104 }
1105 EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
1106 
/**
 * __clocksource_register_scale - Used to install new clocksources
 * @cs:		clocksource to be registered
 * @scale:	Scale factor multiplied against freq to get clocksource hz
 * @freq:	clocksource frequency (cycles per second) divided by scale
 *
 * Returns zero. No failure path exists in this function; an out-of-range
 * @cs->id or @cs->vdso_clock_mode is corrected in place rather than
 * rejected.
 *
 * This *SHOULD NOT* be called directly! Please use the
 * clocksource_register_hz() or clocksource_register_khz helper functions.
 */
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
	unsigned long flags;

	clocksource_arch_init(cs);

	/* Fall back to the generic id when the given one is out of range */
	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
		cs->id = CSID_GENERIC;
	if (cs->vdso_clock_mode < 0 ||
	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
			cs->name, cs->vdso_clock_mode);
		cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
	}

	/* Initialize mult/shift and max_idle_ns */
	__clocksource_update_freq_scale(cs, scale, freq);

	/* Add clocksource to the clocksource list */
	mutex_lock(&clocksource_mutex);

	/* List insertion happens with the watchdog lock held as well */
	clocksource_watchdog_lock(&flags);
	clocksource_enqueue(cs);
	clocksource_enqueue_watchdog(cs);
	clocksource_watchdog_unlock(&flags);

	/* Re-run the selections now that a new candidate is on the list */
	clocksource_select();
	clocksource_select_watchdog(false);
	__clocksource_suspend_select(cs);
	mutex_unlock(&clocksource_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
1151 
/*
 * Set the rating of @cs to @rating and move it to the matching position
 * in the rating-sorted clocksource list (remove, update, re-enqueue).
 * The caller in this file holds clocksource_mutex and the watchdog lock.
 */
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
	list_del(&cs->list);
	cs->rating = rating;
	clocksource_enqueue(cs);
}
1158 
/**
 * clocksource_change_rating - Change the rating of a registered clocksource
 * @cs:		clocksource to be changed
 * @rating:	new rating
 *
 * Takes clocksource_mutex, re-sorts @cs in the clocksource list under the
 * watchdog lock and then re-runs the clocksource, watchdog and suspend
 * clocksource selections.
 */
void clocksource_change_rating(struct clocksource *cs, int rating)
{
	unsigned long flags;

	mutex_lock(&clocksource_mutex);
	clocksource_watchdog_lock(&flags);
	__clocksource_change_rating(cs, rating);
	clocksource_watchdog_unlock(&flags);

	/* The list order may have changed, so redo all selections */
	clocksource_select();
	clocksource_select_watchdog(false);
	clocksource_suspend_select(false);
	mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
1179 
/*
 * Unbind clocksource @cs. Called with clocksource_mutex held
 *
 * Returns -EBUSY if @cs is still the watchdog or still the current
 * clocksource after a replacement was attempted. Otherwise @cs is removed
 * from the watchdog and clocksource lists and 0 is returned.
 */
static int clocksource_unbind(struct clocksource *cs)
{
	unsigned long flags;

	if (clocksource_is_watchdog(cs)) {
		/* Select and try to install a replacement watchdog. */
		clocksource_select_watchdog(true);
		if (clocksource_is_watchdog(cs))
			return -EBUSY;
	}

	if (cs == curr_clocksource) {
		/* Select and try to install a replacement clock source */
		clocksource_select_fallback();
		if (curr_clocksource == cs)
			return -EBUSY;
	}

	if (clocksource_is_suspend(cs)) {
		/*
		 * Select and try to install a replacement suspend clocksource.
		 * If no replacement suspend clocksource, we will just let the
		 * clocksource go and have no suspend clocksource.
		 */
		clocksource_suspend_select(true);
	}

	clocksource_watchdog_lock(&flags);
	clocksource_dequeue_watchdog(cs);
	/* Re-initialise the node so a later list_empty(&cs->list) is true */
	list_del_init(&cs->list);
	clocksource_watchdog_unlock(&flags);

	return 0;
}
1217 
1218 /**
1219  * clocksource_unregister - remove a registered clocksource
1220  * @cs:	clocksource to be unregistered
1221  */
1222 int clocksource_unregister(struct clocksource *cs)
1223 {
1224 	int ret = 0;
1225 
1226 	mutex_lock(&clocksource_mutex);
1227 	if (!list_empty(&cs->list))
1228 		ret = clocksource_unbind(cs);
1229 	mutex_unlock(&clocksource_mutex);
1230 	return ret;
1231 }
1232 EXPORT_SYMBOL(clocksource_unregister);
1233 
1234 #ifdef CONFIG_SYSFS
1235 /**
1236  * current_clocksource_show - sysfs interface for current clocksource
1237  * @dev:	unused
1238  * @attr:	unused
1239  * @buf:	char buffer to be filled with clocksource list
1240  *
1241  * Provides sysfs interface for listing current clocksource.
1242  */
1243 static ssize_t current_clocksource_show(struct device *dev,
1244 					struct device_attribute *attr,
1245 					char *buf)
1246 {
1247 	ssize_t count = 0;
1248 
1249 	mutex_lock(&clocksource_mutex);
1250 	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
1251 	mutex_unlock(&clocksource_mutex);
1252 
1253 	return count;
1254 }
1255 
1256 ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
1257 {
1258 	size_t ret = cnt;
1259 
1260 	/* strings from sysfs write are not 0 terminated! */
1261 	if (!cnt || cnt >= CS_NAME_LEN)
1262 		return -EINVAL;
1263 
1264 	/* strip of \n: */
1265 	if (buf[cnt-1] == '\n')
1266 		cnt--;
1267 	if (cnt > 0)
1268 		memcpy(dst, buf, cnt);
1269 	dst[cnt] = 0;
1270 	return ret;
1271 }
1272 
/**
 * current_clocksource_store - interface for manually overriding clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	name of override clocksource
 * @count:	length of buffer
 *
 * Takes input from sysfs interface for manually overriding the default
 * clocksource selection.
 *
 * Returns the value of sysfs_get_uname(): @count on success, or a
 * negative error code, in which case no new selection is triggered.
 */
static ssize_t current_clocksource_store(struct device *dev,
					 struct device_attribute *attr,
					 const char *buf, size_t count)
{
	ssize_t ret;

	mutex_lock(&clocksource_mutex);

	/* The name is stored straight into override_name */
	ret = sysfs_get_uname(buf, override_name, count);
	if (ret >= 0)
		clocksource_select();

	mutex_unlock(&clocksource_mutex);

	return ret;
}
static DEVICE_ATTR_RW(current_clocksource);
1300 
1301 /**
1302  * unbind_clocksource_store - interface for manually unbinding clocksource
1303  * @dev:	unused
1304  * @attr:	unused
1305  * @buf:	unused
1306  * @count:	length of buffer
1307  *
1308  * Takes input from sysfs interface for manually unbinding a clocksource.
1309  */
1310 static ssize_t unbind_clocksource_store(struct device *dev,
1311 					struct device_attribute *attr,
1312 					const char *buf, size_t count)
1313 {
1314 	struct clocksource *cs;
1315 	char name[CS_NAME_LEN];
1316 	ssize_t ret;
1317 
1318 	ret = sysfs_get_uname(buf, name, count);
1319 	if (ret < 0)
1320 		return ret;
1321 
1322 	ret = -ENODEV;
1323 	mutex_lock(&clocksource_mutex);
1324 	list_for_each_entry(cs, &clocksource_list, list) {
1325 		if (strcmp(cs->name, name))
1326 			continue;
1327 		ret = clocksource_unbind(cs);
1328 		break;
1329 	}
1330 	mutex_unlock(&clocksource_mutex);
1331 
1332 	return ret ? ret : count;
1333 }
1334 static DEVICE_ATTR_WO(unbind_clocksource);
1335 
1336 /**
1337  * available_clocksource_show - sysfs interface for listing clocksource
1338  * @dev:	unused
1339  * @attr:	unused
1340  * @buf:	char buffer to be filled with clocksource list
1341  *
1342  * Provides sysfs interface for listing registered clocksources
1343  */
1344 static ssize_t available_clocksource_show(struct device *dev,
1345 					  struct device_attribute *attr,
1346 					  char *buf)
1347 {
1348 	struct clocksource *src;
1349 	ssize_t count = 0;
1350 
1351 	mutex_lock(&clocksource_mutex);
1352 	list_for_each_entry(src, &clocksource_list, list) {
1353 		/*
1354 		 * Don't show non-HRES clocksource if the tick code is
1355 		 * in one shot mode (highres=on or nohz=on)
1356 		 */
1357 		if (!tick_oneshot_mode_active() ||
1358 		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
1359 			count += snprintf(buf + count,
1360 				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
1361 				  "%s ", src->name);
1362 	}
1363 	mutex_unlock(&clocksource_mutex);
1364 
1365 	count += snprintf(buf + count,
1366 			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
1367 
1368 	return count;
1369 }
1370 static DEVICE_ATTR_RO(available_clocksource);
1371 
/*
 * NULL-terminated list of the sysfs attributes defined above:
 * current_clocksource, unbind_clocksource and available_clocksource.
 */
static struct attribute *clocksource_attrs[] = {
	&dev_attr_current_clocksource.attr,
	&dev_attr_unbind_clocksource.attr,
	&dev_attr_available_clocksource.attr,
	NULL
};
/* Provides the clocksource_groups referenced by device_clocksource */
ATTRIBUTE_GROUPS(clocksource);
1379 
/* Subsystem registered by init_clocksource_sysfs(); bus of device_clocksource */
static struct bus_type clocksource_subsys = {
	.name = "clocksource",
	.dev_name = "clocksource",
};
1384 
/*
 * The single clocksource device (id 0) on clocksource_subsys. Its
 * attribute groups are the clocksource_groups built from clocksource_attrs.
 */
static struct device device_clocksource = {
	.id	= 0,
	.bus	= &clocksource_subsys,
	.groups	= clocksource_groups,
};
1390 
1391 static int __init init_clocksource_sysfs(void)
1392 {
1393 	int error = subsys_system_register(&clocksource_subsys, NULL);
1394 
1395 	if (!error)
1396 		error = device_register(&device_clocksource);
1397 
1398 	return error;
1399 }
1400 
1401 device_initcall(init_clocksource_sysfs);
1402 #endif /* CONFIG_SYSFS */
1403 
1404 /**
1405  * boot_override_clocksource - boot clock override
1406  * @str:	override name
1407  *
1408  * Takes a clocksource= boot argument and uses it
1409  * as the clocksource override name.
1410  */
1411 static int __init boot_override_clocksource(char* str)
1412 {
1413 	mutex_lock(&clocksource_mutex);
1414 	if (str)
1415 		strlcpy(override_name, str, sizeof(override_name));
1416 	mutex_unlock(&clocksource_mutex);
1417 	return 1;
1418 }
1419 
1420 __setup("clocksource=", boot_override_clocksource);
1421 
1422 /**
1423  * boot_override_clock - Compatibility layer for deprecated boot option
1424  * @str:	override name
1425  *
1426  * DEPRECATED! Takes a clock= boot argument and uses it
1427  * as the clocksource override name
1428  */
1429 static int __init boot_override_clock(char* str)
1430 {
1431 	if (!strcmp(str, "pmtmr")) {
1432 		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
1433 		return boot_override_clocksource("acpi_pm");
1434 	}
1435 	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
1436 	return boot_override_clocksource(str);
1437 }
1438 
1439 __setup("clock=", boot_override_clock);
1440