xref: /openbmc/linux/arch/powerpc/kernel/time.c (revision 07c7c6bf)
1 /*
2  * Common time routines among all ppc machines.
3  *
4  * Written by Cort Dougan (cort@cs.nmt.edu) to merge
5  * Paul Mackerras' version and mine for PReP and Pmac.
6  * MPC8xx/MBX changes by Dan Malek (dmalek@jlc.net).
7  * Converted for 64-bit by Mike Corrigan (mikejc@us.ibm.com)
8  *
9  * First round of bugfixes by Gabriel Paubert (paubert@iram.es)
10  * to make clock more stable (2.4.0-test5). The only thing
11  * that this code assumes is that the timebases have been synchronized
12  * by firmware on SMP and are never stopped (never do sleep
13  * on SMP then, nap and doze are OK).
14  *
15  * Speeded up do_gettimeofday by getting rid of references to
16  * xtime (which required locks for consistency). (mikejc@us.ibm.com)
17  *
18  * TODO (not necessarily in this file):
19  * - improve precision and reproducibility of timebase frequency
20  * measurement at boot time.
21  * - for astronomical applications: add a new function to get
22  * non-ambiguous timestamps even around leap seconds. This needs
23  * a new timestamp format and a good name.
24  *
25  * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
26  *             "A Kernel Model for Precision Timekeeping" by Dave Mills
27  *
28  *      This program is free software; you can redistribute it and/or
29  *      modify it under the terms of the GNU General Public License
30  *      as published by the Free Software Foundation; either version
31  *      2 of the License, or (at your option) any later version.
32  */
33 
34 #include <linux/errno.h>
35 #include <linux/export.h>
36 #include <linux/sched.h>
37 #include <linux/sched/clock.h>
38 #include <linux/kernel.h>
39 #include <linux/param.h>
40 #include <linux/string.h>
41 #include <linux/mm.h>
42 #include <linux/interrupt.h>
43 #include <linux/timex.h>
44 #include <linux/kernel_stat.h>
45 #include <linux/time.h>
46 #include <linux/init.h>
47 #include <linux/profile.h>
48 #include <linux/cpu.h>
49 #include <linux/security.h>
50 #include <linux/percpu.h>
51 #include <linux/rtc.h>
52 #include <linux/jiffies.h>
53 #include <linux/posix-timers.h>
54 #include <linux/irq.h>
55 #include <linux/delay.h>
56 #include <linux/irq_work.h>
57 #include <linux/clk-provider.h>
58 #include <linux/suspend.h>
59 #include <linux/sched/cputime.h>
60 #include <linux/processor.h>
61 #include <asm/trace.h>
62 
63 #include <asm/io.h>
64 #include <asm/nvram.h>
65 #include <asm/cache.h>
66 #include <asm/machdep.h>
67 #include <linux/uaccess.h>
68 #include <asm/time.h>
69 #include <asm/prom.h>
70 #include <asm/irq.h>
71 #include <asm/div64.h>
72 #include <asm/smp.h>
73 #include <asm/vdso_datapage.h>
74 #include <asm/firmware.h>
75 #include <asm/asm-prototypes.h>
76 
77 /* powerpc clocksource/clockevent code */
78 
79 #include <linux/clockchips.h>
80 #include <linux/timekeeper_internal.h>
81 
82 static u64 rtc_read(struct clocksource *);
83 static struct clocksource clocksource_rtc = {
84 	.name         = "rtc",
85 	.rating       = 400,
86 	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
87 	.mask         = CLOCKSOURCE_MASK(64),
88 	.read         = rtc_read,
89 };
90 
91 static u64 timebase_read(struct clocksource *);
92 static struct clocksource clocksource_timebase = {
93 	.name         = "timebase",
94 	.rating       = 400,
95 	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
96 	.mask         = CLOCKSOURCE_MASK(64),
97 	.read         = timebase_read,
98 };
99 
100 #define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
101 u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
102 
103 static int decrementer_set_next_event(unsigned long evt,
104 				      struct clock_event_device *dev);
105 static int decrementer_shutdown(struct clock_event_device *evt);
106 
107 struct clock_event_device decrementer_clockevent = {
108 	.name			= "decrementer",
109 	.rating			= 200,
110 	.irq			= 0,
111 	.set_next_event		= decrementer_set_next_event,
112 	.set_state_oneshot_stopped = decrementer_shutdown,
113 	.set_state_shutdown	= decrementer_shutdown,
114 	.tick_resume		= decrementer_shutdown,
115 	.features		= CLOCK_EVT_FEAT_ONESHOT |
116 				  CLOCK_EVT_FEAT_C3STOP,
117 };
118 EXPORT_SYMBOL(decrementer_clockevent);
119 
120 DEFINE_PER_CPU(u64, decrementers_next_tb);
121 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
122 
123 #define XSEC_PER_SEC (1024*1024)
124 
125 #ifdef CONFIG_PPC64
126 #define SCALE_XSEC(xsec, max)	(((xsec) * max) / XSEC_PER_SEC)
127 #else
128 /* compute ((xsec << 12) * max) >> 32 */
129 #define SCALE_XSEC(xsec, max)	mulhwu((xsec) << 12, max)
130 #endif
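/*
 * Worked example: xsec counts time in units of 2^-20 s (XSEC_PER_SEC per
 * second), so SCALE_XSEC(xsec, max) computes xsec * max / 2^20.  The 32-bit
 * form obtains the same result as ((xsec * 2^12) * max) >> 32.  For instance,
 * with xsec = 2^19 (half a second) and max = 1000, both forms yield 500.
 */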
131 
132 unsigned long tb_ticks_per_jiffy;
133 unsigned long tb_ticks_per_usec = 100; /* sane default */
134 EXPORT_SYMBOL(tb_ticks_per_usec);
135 unsigned long tb_ticks_per_sec;
136 EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
137 
138 DEFINE_SPINLOCK(rtc_lock);
139 EXPORT_SYMBOL_GPL(rtc_lock);
140 
141 static u64 tb_to_ns_scale __read_mostly;
142 static unsigned tb_to_ns_shift __read_mostly;
143 static u64 boot_tb __read_mostly;
144 
145 extern struct timezone sys_tz;
146 static long timezone_offset;
147 
148 unsigned long ppc_proc_freq;
149 EXPORT_SYMBOL_GPL(ppc_proc_freq);
150 unsigned long ppc_tb_freq;
151 EXPORT_SYMBOL_GPL(ppc_tb_freq);
152 
153 bool tb_invalid;
154 
155 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
156 /*
157  * Factor for converting from cputime_t (timebase ticks) to
158  * microseconds. This is stored as 0.64 fixed-point binary fraction.
159  */
160 u64 __cputime_usec_factor;
161 EXPORT_SYMBOL(__cputime_usec_factor);
162 
163 #ifdef CONFIG_PPC_SPLPAR
164 void (*dtl_consumer)(struct dtl_entry *, u64);
165 #endif
166 
167 static void calc_cputime_factors(void)
168 {
169 	struct div_result res;
170 
171 	div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
172 	__cputime_usec_factor = res.result_low;
173 }
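/*
 * For example, with a 512 MHz timebase (tb_ticks_per_sec = 512000000) the
 * division above computes 1e6 * 2^64 / 512e6, whose low 64 bits are 2^55,
 * so __cputime_usec_factor = 2^55.  Taking the high 64 bits of
 * ticks * __cputime_usec_factor then gives ticks >> 9 = ticks / 512,
 * i.e. the number of microseconds in that many timebase ticks.
 */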
174 
175 /*
176  * Read the SPURR on systems that have it, otherwise the PURR,
177  * or if that doesn't exist return the timebase value passed in.
178  */
179 static inline unsigned long read_spurr(unsigned long tb)
180 {
181 	if (cpu_has_feature(CPU_FTR_SPURR))
182 		return mfspr(SPRN_SPURR);
183 	if (cpu_has_feature(CPU_FTR_PURR))
184 		return mfspr(SPRN_PURR);
185 	return tb;
186 }
187 
188 #ifdef CONFIG_PPC_SPLPAR
189 
190 /*
191  * Scan the dispatch trace log and count up the stolen time.
192  * Should be called with interrupts disabled.
193  */
194 static u64 scan_dispatch_log(u64 stop_tb)
195 {
196 	u64 i = local_paca->dtl_ridx;
197 	struct dtl_entry *dtl = local_paca->dtl_curr;
198 	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
199 	struct lppaca *vpa = local_paca->lppaca_ptr;
200 	u64 tb_delta;
201 	u64 stolen = 0;
202 	u64 dtb;
203 
204 	if (!dtl)
205 		return 0;
206 
207 	if (i == be64_to_cpu(vpa->dtl_idx))
208 		return 0;
209 	while (i < be64_to_cpu(vpa->dtl_idx)) {
210 		dtb = be64_to_cpu(dtl->timebase);
211 		tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
212 			be32_to_cpu(dtl->ready_to_enqueue_time);
213 		barrier();
214 		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
215 			/* buffer has overflowed */
216 			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
217 			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
218 			continue;
219 		}
220 		if (dtb > stop_tb)
221 			break;
222 		if (dtl_consumer)
223 			dtl_consumer(dtl, i);
224 		stolen += tb_delta;
225 		++i;
226 		++dtl;
227 		if (dtl == dtl_end)
228 			dtl = local_paca->dispatch_log;
229 	}
230 	local_paca->dtl_ridx = i;
231 	local_paca->dtl_curr = dtl;
232 	return stolen;
233 }
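/*
 * Note: dtl_ridx and vpa->dtl_idx above are monotonically increasing entry
 * counts, not buffer offsets; (i % N_DISPATCH_LOG) selects the slot within
 * the ring.  If the hypervisor's write index has advanced more than
 * N_DISPATCH_LOG entries past our read index, the unread entries have been
 * overwritten, so the loop resynchronises to the oldest entry still present.
 */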
234 
235 /*
236  * Accumulate stolen time by scanning the dispatch trace log.
237  * Called on entry from user mode.
238  */
239 void accumulate_stolen_time(void)
240 {
241 	u64 sst, ust;
242 	unsigned long save_irq_soft_mask = irq_soft_mask_return();
243 	struct cpu_accounting_data *acct = &local_paca->accounting;
244 
245 	/* We are called early in the exception entry, before
246 	 * soft/hard_enabled are sync'ed to the expected state
247 	 * for the exception. We are hard disabled but the PACA
248 	 * needs to reflect that so various debug stuff doesn't
249 	 * complain
250 	 */
251 	irq_soft_mask_set(IRQS_DISABLED);
252 
253 	sst = scan_dispatch_log(acct->starttime_user);
254 	ust = scan_dispatch_log(acct->starttime);
255 	acct->stime -= sst;
256 	acct->utime -= ust;
257 	acct->steal_time += ust + sst;
258 
259 	irq_soft_mask_set(save_irq_soft_mask);
260 }
261 
262 static inline u64 calculate_stolen_time(u64 stop_tb)
263 {
264 	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
265 		return 0;
266 
267 	if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx))
268 		return scan_dispatch_log(stop_tb);
269 
270 	return 0;
271 }
272 
273 #else /* CONFIG_PPC_SPLPAR */
274 static inline u64 calculate_stolen_time(u64 stop_tb)
275 {
276 	return 0;
277 }
278 
279 #endif /* CONFIG_PPC_SPLPAR */
280 
281 /*
282  * Account time for a transition between system, hard irq
283  * or soft irq state.
284  */
285 static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct,
286 					unsigned long now, unsigned long stime)
287 {
288 	unsigned long stime_scaled = 0;
289 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
290 	unsigned long nowscaled, deltascaled;
291 	unsigned long utime, utime_scaled;
292 
293 	nowscaled = read_spurr(now);
294 	deltascaled = nowscaled - acct->startspurr;
295 	acct->startspurr = nowscaled;
296 	utime = acct->utime - acct->utime_sspurr;
297 	acct->utime_sspurr = acct->utime;
298 
299 	/*
300 	 * Because we don't read the SPURR on every kernel entry/exit,
301 	 * deltascaled includes both user and system SPURR ticks.
302 	 * Apportion these ticks to system SPURR ticks and user
303 	 * SPURR ticks in the same ratio as the system time (delta)
304 	 * and user time (udelta) values obtained from the timebase
305 	 * over the same interval.  The system ticks get accounted here;
306 	 * the user ticks get saved up in paca->user_time_scaled to be
307 	 * used by account_process_tick.
308 	 */
309 	stime_scaled = stime;
310 	utime_scaled = utime;
311 	if (deltascaled != stime + utime) {
312 		if (utime) {
313 			stime_scaled = deltascaled * stime / (stime + utime);
314 			utime_scaled = deltascaled - stime_scaled;
315 		} else {
316 			stime_scaled = deltascaled;
317 		}
318 	}
319 	acct->utime_scaled += utime_scaled;
320 #endif
321 
322 	return stime_scaled;
323 }
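/*
 * Numeric illustration of the apportioning above: if the timebase shows
 * stime = 300 and utime = 100 ticks since the last update, but only
 * deltascaled = 200 SPURR ticks elapsed (the thread ran at half speed),
 * then stime_scaled = 200 * 300 / 400 = 150 and utime_scaled = 50, keeping
 * the same 3:1 system/user ratio as the timebase-based times.
 */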
324 
325 static unsigned long vtime_delta(struct task_struct *tsk,
326 				 unsigned long *stime_scaled,
327 				 unsigned long *steal_time)
328 {
329 	unsigned long now, stime;
330 	struct cpu_accounting_data *acct = get_accounting(tsk);
331 
332 	WARN_ON_ONCE(!irqs_disabled());
333 
334 	now = mftb();
335 	stime = now - acct->starttime;
336 	acct->starttime = now;
337 
338 	*stime_scaled = vtime_delta_scaled(acct, now, stime);
339 
340 	*steal_time = calculate_stolen_time(now);
341 
342 	return stime;
343 }
344 
345 void vtime_account_system(struct task_struct *tsk)
346 {
347 	unsigned long stime, stime_scaled, steal_time;
348 	struct cpu_accounting_data *acct = get_accounting(tsk);
349 
350 	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
351 
352 	stime -= min(stime, steal_time);
353 	acct->steal_time += steal_time;
354 
355 	if ((tsk->flags & PF_VCPU) && !irq_count()) {
356 		acct->gtime += stime;
357 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
358 		acct->utime_scaled += stime_scaled;
359 #endif
360 	} else {
361 		if (hardirq_count())
362 			acct->hardirq_time += stime;
363 		else if (in_serving_softirq())
364 			acct->softirq_time += stime;
365 		else
366 			acct->stime += stime;
367 
368 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
369 		acct->stime_scaled += stime_scaled;
370 #endif
371 	}
372 }
373 EXPORT_SYMBOL_GPL(vtime_account_system);
374 
375 void vtime_account_idle(struct task_struct *tsk)
376 {
377 	unsigned long stime, stime_scaled, steal_time;
378 	struct cpu_accounting_data *acct = get_accounting(tsk);
379 
380 	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
381 	acct->idle_time += stime + steal_time;
382 }
383 
384 static void vtime_flush_scaled(struct task_struct *tsk,
385 			       struct cpu_accounting_data *acct)
386 {
387 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
388 	if (acct->utime_scaled)
389 		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
390 	if (acct->stime_scaled)
391 		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
392 
393 	acct->utime_scaled = 0;
394 	acct->utime_sspurr = 0;
395 	acct->stime_scaled = 0;
396 #endif
397 }
398 
399 /*
400  * Account the whole cputime accumulated in the paca
401  * Must be called with interrupts disabled.
402  * Assumes that vtime_account_system/idle() has been called
403  * recently (i.e. since the last entry from usermode) so that
404  * get_paca()->user_time_scaled is up to date.
405  */
406 void vtime_flush(struct task_struct *tsk)
407 {
408 	struct cpu_accounting_data *acct = get_accounting(tsk);
409 
410 	if (acct->utime)
411 		account_user_time(tsk, cputime_to_nsecs(acct->utime));
412 
413 	if (acct->gtime)
414 		account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
415 
416 	if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) {
417 		account_steal_time(cputime_to_nsecs(acct->steal_time));
418 		acct->steal_time = 0;
419 	}
420 
421 	if (acct->idle_time)
422 		account_idle_time(cputime_to_nsecs(acct->idle_time));
423 
424 	if (acct->stime)
425 		account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
426 					  CPUTIME_SYSTEM);
427 
428 	if (acct->hardirq_time)
429 		account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
430 					  CPUTIME_IRQ);
431 	if (acct->softirq_time)
432 		account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time),
433 					  CPUTIME_SOFTIRQ);
434 
435 	vtime_flush_scaled(tsk, acct);
436 
437 	acct->utime = 0;
438 	acct->gtime = 0;
439 	acct->idle_time = 0;
440 	acct->stime = 0;
441 	acct->hardirq_time = 0;
442 	acct->softirq_time = 0;
443 }
444 
445 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
446 #define calc_cputime_factors()
447 #endif
448 
449 void __delay(unsigned long loops)
450 {
451 	unsigned long start;
452 	int diff;
453 
454 	spin_begin();
455 	if (__USE_RTC()) {
456 		start = get_rtcl();
457 		do {
458 			/* the RTCL register wraps at 1000000000 */
459 			diff = get_rtcl() - start;
460 			if (diff < 0)
461 				diff += 1000000000;
462 			spin_cpu_relax();
463 		} while (diff < loops);
464 	} else if (tb_invalid) {
465 		/*
466 		 * TB is in error state and isn't ticking anymore.
467 		 * HMI handler was unable to recover from TB error.
468 		 * Return immediately, so that kernel won't get stuck here.
469 		 */
470 		spin_cpu_relax();
471 	} else {
472 		start = get_tbl();
473 		while (get_tbl() - start < loops)
474 			spin_cpu_relax();
475 	}
476 	spin_end();
477 }
478 EXPORT_SYMBOL(__delay);
479 
480 void udelay(unsigned long usecs)
481 {
482 	__delay(tb_ticks_per_usec * usecs);
483 }
484 EXPORT_SYMBOL(udelay);
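/*
 * For example, udelay(10) spins for 10 * tb_ticks_per_usec timebase ticks
 * (5120 ticks on a 512 MHz timebase); no calibration loop is needed since
 * the delay is derived directly from the timebase frequency.
 */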
485 
486 #ifdef CONFIG_SMP
487 unsigned long profile_pc(struct pt_regs *regs)
488 {
489 	unsigned long pc = instruction_pointer(regs);
490 
491 	if (in_lock_functions(pc))
492 		return regs->link;
493 
494 	return pc;
495 }
496 EXPORT_SYMBOL(profile_pc);
497 #endif
498 
499 #ifdef CONFIG_IRQ_WORK
500 
501 /*
502  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
503  */
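/*
 * On 64-bit, register r13 always holds the address of this CPU's
 * paca_struct, so the accessors below read and write the irq_work_pending
 * byte directly via "lbz/stb %0,offset(13)".
 */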
504 #ifdef CONFIG_PPC64
505 static inline unsigned long test_irq_work_pending(void)
506 {
507 	unsigned long x;
508 
509 	asm volatile("lbz %0,%1(13)"
510 		: "=r" (x)
511 		: "i" (offsetof(struct paca_struct, irq_work_pending)));
512 	return x;
513 }
514 
515 static inline void set_irq_work_pending_flag(void)
516 {
517 	asm volatile("stb %0,%1(13)" : :
518 		"r" (1),
519 		"i" (offsetof(struct paca_struct, irq_work_pending)));
520 }
521 
522 static inline void clear_irq_work_pending(void)
523 {
524 	asm volatile("stb %0,%1(13)" : :
525 		"r" (0),
526 		"i" (offsetof(struct paca_struct, irq_work_pending)));
527 }
528 
529 void arch_irq_work_raise(void)
530 {
531 	preempt_disable();
532 	set_irq_work_pending_flag();
533 	/*
534 	 * Non-nmi code running with interrupts disabled will replay
535 	 * irq_happened before it re-enables interrupts, so set the
536 	 * decrementer there instead of causing a hardware exception
537 	 * which would immediately hit the masked interrupt handler
538 	 * and have the net effect of setting the decrementer in
539 	 * irq_happened.
540 	 *
541 	 * NMI interrupts cannot check this when they return, so the
542 	 * decrementer hardware exception is raised, which will fire
543 	 * when interrupts are next enabled.
544 	 *
545 	 * BookE does not support this yet, it must audit all NMI
546 	 * interrupt handlers to ensure they call nmi_enter() so this
547 	 * check would be correct.
548 	 */
549 	if (IS_ENABLED(CONFIG_BOOKE) || !irqs_disabled() || in_nmi()) {
550 		set_dec(1);
551 	} else {
552 		hard_irq_disable();
553 		local_paca->irq_happened |= PACA_IRQ_DEC;
554 	}
555 	preempt_enable();
556 }
557 
558 #else /* 32-bit */
559 
560 DEFINE_PER_CPU(u8, irq_work_pending);
561 
562 #define set_irq_work_pending_flag()	__this_cpu_write(irq_work_pending, 1)
563 #define test_irq_work_pending()		__this_cpu_read(irq_work_pending)
564 #define clear_irq_work_pending()	__this_cpu_write(irq_work_pending, 0)
565 
566 void arch_irq_work_raise(void)
567 {
568 	preempt_disable();
569 	set_irq_work_pending_flag();
570 	set_dec(1);
571 	preempt_enable();
572 }
573 
574 #endif /* 32 vs 64 bit */
575 
576 #else  /* CONFIG_IRQ_WORK */
577 
578 #define test_irq_work_pending()	0
579 #define clear_irq_work_pending()
580 
581 #endif /* CONFIG_IRQ_WORK */
582 
583 /*
584  * timer_interrupt - gets called when the decrementer overflows,
585  * with interrupts disabled.
586  */
587 void timer_interrupt(struct pt_regs *regs)
588 {
589 	struct clock_event_device *evt = this_cpu_ptr(&decrementers);
590 	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
591 	struct pt_regs *old_regs;
592 	u64 now;
593 
594 	/* Some implementations of hotplug will get timer interrupts while
595 	 * offline, so just ignore these. We also need to set
596 	 * decrementers_next_tb to MAX to make sure __check_irq_replay
597 	 * doesn't replay the timer interrupt on return, otherwise we'll
598 	 * trap here infinitely :(
599 	 */
600 	if (unlikely(!cpu_online(smp_processor_id()))) {
601 		*next_tb = ~(u64)0;
602 		set_dec(decrementer_max);
603 		return;
604 	}
605 
606 	/* Ensure a positive value is written to the decrementer, or else
607 	 * some CPUs will continue to take decrementer exceptions. When the
608 	 * PPC_WATCHDOG (decrementer based) is configured, keep this at most
609 	 * 31 bits, which is about 4 seconds on most systems, which gives
610 	 * the watchdog a chance of catching timer interrupt hard lockups.
611 	 */
612 	if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
613 		set_dec(0x7fffffff);
614 	else
615 		set_dec(decrementer_max);
616 
617 	/* Conditionally hard-enable interrupts now that the DEC has been
618 	 * bumped to its maximum value
619 	 */
620 	may_hard_irq_enable();
621 
622 
623 #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
624 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
625 		do_IRQ(regs);
626 #endif
627 
628 	old_regs = set_irq_regs(regs);
629 	irq_enter();
630 	trace_timer_interrupt_entry(regs);
631 
632 	if (test_irq_work_pending()) {
633 		clear_irq_work_pending();
634 		irq_work_run();
635 	}
636 
637 	now = get_tb_or_rtc();
638 	if (now >= *next_tb) {
639 		*next_tb = ~(u64)0;
640 		if (evt->event_handler)
641 			evt->event_handler(evt);
642 		__this_cpu_inc(irq_stat.timer_irqs_event);
643 	} else {
644 		now = *next_tb - now;
645 		if (now <= decrementer_max)
646 			set_dec(now);
647 		/* We may have raced with new irq work */
648 		if (test_irq_work_pending())
649 			set_dec(1);
650 		__this_cpu_inc(irq_stat.timer_irqs_others);
651 	}
652 
653 	trace_timer_interrupt_exit(regs);
654 	irq_exit();
655 	set_irq_regs(old_regs);
656 }
657 EXPORT_SYMBOL(timer_interrupt);
658 
659 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
660 void timer_broadcast_interrupt(void)
661 {
662 	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
663 
664 	*next_tb = ~(u64)0;
665 	tick_receive_broadcast();
666 	__this_cpu_inc(irq_stat.broadcast_irqs_event);
667 }
668 #endif
669 
670 /*
671  * Hypervisor decrementer interrupts shouldn't occur but are sometimes
672  * left pending on exit from a KVM guest.  We don't need to do anything
673  * to clear them, as they are edge-triggered.
674  */
675 void hdec_interrupt(struct pt_regs *regs)
676 {
677 }
678 
679 #ifdef CONFIG_SUSPEND
680 static void generic_suspend_disable_irqs(void)
681 {
682 	/* Disable the decrementer, so that it doesn't interfere
683 	 * with suspending.
684 	 */
685 
686 	set_dec(decrementer_max);
687 	local_irq_disable();
688 	set_dec(decrementer_max);
689 }
690 
691 static void generic_suspend_enable_irqs(void)
692 {
693 	local_irq_enable();
694 }
695 
696 /* Overrides the weak version in kernel/power/main.c */
697 void arch_suspend_disable_irqs(void)
698 {
699 	if (ppc_md.suspend_disable_irqs)
700 		ppc_md.suspend_disable_irqs();
701 	generic_suspend_disable_irqs();
702 }
703 
704 /* Overrides the weak version in kernel/power/main.c */
705 void arch_suspend_enable_irqs(void)
706 {
707 	generic_suspend_enable_irqs();
708 	if (ppc_md.suspend_enable_irqs)
709 		ppc_md.suspend_enable_irqs();
710 }
711 #endif
712 
713 unsigned long long tb_to_ns(unsigned long long ticks)
714 {
715 	return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
716 }
717 EXPORT_SYMBOL_GPL(tb_to_ns);
718 
719 /*
720  * Scheduler clock - returns current time in nanosec units.
721  *
722  * Note: mulhdu(a, b) (multiply high double unsigned) returns
723  * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
724  * are 64-bit unsigned numbers.
725  */
726 notrace unsigned long long sched_clock(void)
727 {
728 	if (__USE_RTC())
729 		return get_rtc();
730 	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
731 }
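/*
 * Worked example (assuming a 512 MHz timebase): time_init() computes
 * 1e9 * 2^64 / 512e6 = 1.953125 * 2^64, which does not fit in 64 bits, so
 * it is halved once, giving tb_to_ns_scale = 0.9765625 * 2^64 and
 * tb_to_ns_shift = 1.  sched_clock() then returns
 * mulhdu(delta, scale) << 1 = delta * 0.9765625 * 2 = delta * 1.953125 ns,
 * i.e. exactly delta timebase ticks expressed in nanoseconds.
 */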
732 
733 
734 #ifdef CONFIG_PPC_PSERIES
735 
736 /*
737  * Running clock - attempts to give a view of time passing for virtualised
738  * kernels.
739  * Uses the VTB register if available, otherwise a next-best guess.
740  */
741 unsigned long long running_clock(void)
742 {
743 	/*
744 	 * Don't read the VTB as a host, since KVM does not switch the host
745 	 * timebase into the VTB when it takes a guest off the CPU; reading the
746 	 * VTB would return the 'last switched out' guest VTB.
747 	 *
748 	 * Host kernels are often compiled with CONFIG_PPC_PSERIES enabled, so it
749 	 * would be unsafe to rely only on the #ifdef above.
750 	 */
751 	if (firmware_has_feature(FW_FEATURE_LPAR) &&
752 	    cpu_has_feature(CPU_FTR_ARCH_207S))
753 		return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
754 
755 	/*
756 	 * This is the next-best approximation without a VTB.
757 	 * On a host running bare metal there should never be any stolen
758 	 * time, and on a host which doesn't do any virtualisation the TB *should*
759 	 * equal the VTB, so it makes no difference anyway.
760 	 */
761 	return local_clock() - kcpustat_this_cpu->cpustat[CPUTIME_STEAL];
762 }
763 #endif
764 
765 static int __init get_freq(char *name, int cells, unsigned long *val)
766 {
767 	struct device_node *cpu;
768 	const __be32 *fp;
769 	int found = 0;
770 
771 	/* The cpu node should have timebase and clock frequency properties */
772 	cpu = of_find_node_by_type(NULL, "cpu");
773 
774 	if (cpu) {
775 		fp = of_get_property(cpu, name, NULL);
776 		if (fp) {
777 			found = 1;
778 			*val = of_read_ulong(fp, cells);
779 		}
780 
781 		of_node_put(cpu);
782 	}
783 
784 	return found;
785 }
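/*
 * Illustrative (hypothetical) device tree fragment showing the properties
 * get_freq() reads -- the values are examples, not taken from real firmware:
 *
 *	cpu@0 {
 *		device_type = "cpu";
 *		timebase-frequency = <33333333>;			// 1 cell
 *		clock-frequency = <1000000000>;				// 1 cell
 *		ibm,extended-timebase-frequency = <0 512000000>;	// 2 cells
 *	};
 */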
786 
787 static void start_cpu_decrementer(void)
788 {
789 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
790 	unsigned int tcr;
791 
792 	/* Clear any pending timer interrupts */
793 	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
794 
795 	tcr = mfspr(SPRN_TCR);
796 	/*
797 	 * The watchdog may have already been enabled by u-boot. So leave
798 	 * TRC[WP] (Watchdog Period) alone.
799 	 * TCR[WP] (Watchdog Period) alone.
800 	tcr &= TCR_WP_MASK;	/* Clear all bits except for TCR[WP] */
801 	tcr |= TCR_DIE;		/* Enable decrementer */
802 	mtspr(SPRN_TCR, tcr);
803 #endif
804 }
805 
806 void __init generic_calibrate_decr(void)
807 {
808 	ppc_tb_freq = DEFAULT_TB_FREQ;		/* hardcoded default */
809 
810 	if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) &&
811 	    !get_freq("timebase-frequency", 1, &ppc_tb_freq)) {
812 
813 		printk(KERN_ERR "WARNING: Estimating decrementer frequency "
814 				"(not found)\n");
815 	}
816 
817 	ppc_proc_freq = DEFAULT_PROC_FREQ;	/* hardcoded default */
818 
819 	if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) &&
820 	    !get_freq("clock-frequency", 1, &ppc_proc_freq)) {
821 
822 		printk(KERN_ERR "WARNING: Estimating processor frequency "
823 				"(not found)\n");
824 	}
825 }
826 
827 int update_persistent_clock64(struct timespec64 now)
828 {
829 	struct rtc_time tm;
830 
831 	if (!ppc_md.set_rtc_time)
832 		return -ENODEV;
833 
834 	rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm);
835 
836 	return ppc_md.set_rtc_time(&tm);
837 }
838 
839 static void __read_persistent_clock(struct timespec64 *ts)
840 {
841 	struct rtc_time tm;
842 	static int first = 1;
843 
844 	ts->tv_nsec = 0;
845 	/* XXX this is a little fragile but will work okay in the short term */
846 	if (first) {
847 		first = 0;
848 		if (ppc_md.time_init)
849 			timezone_offset = ppc_md.time_init();
850 
851 		/* get_boot_time() isn't guaranteed to be safe to call late */
852 		if (ppc_md.get_boot_time) {
853 			ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
854 			return;
855 		}
856 	}
857 	if (!ppc_md.get_rtc_time) {
858 		ts->tv_sec = 0;
859 		return;
860 	}
861 	ppc_md.get_rtc_time(&tm);
862 
863 	ts->tv_sec = rtc_tm_to_time64(&tm);
864 }
865 
866 void read_persistent_clock64(struct timespec64 *ts)
867 {
868 	__read_persistent_clock(ts);
869 
870 	/* Sanitize it in case the real time clock is set below the EPOCH */
871 	if (ts->tv_sec < 0) {
872 		ts->tv_sec = 0;
873 		ts->tv_nsec = 0;
874 	}
875 
876 }
877 
878 /* clocksource code */
879 static notrace u64 rtc_read(struct clocksource *cs)
880 {
881 	return (u64)get_rtc();
882 }
883 
884 static notrace u64 timebase_read(struct clocksource *cs)
885 {
886 	return (u64)get_tb();
887 }
888 
889 
890 void update_vsyscall(struct timekeeper *tk)
891 {
892 	struct timespec xt;
893 	struct clocksource *clock = tk->tkr_mono.clock;
894 	u32 mult = tk->tkr_mono.mult;
895 	u32 shift = tk->tkr_mono.shift;
896 	u64 cycle_last = tk->tkr_mono.cycle_last;
897 	u64 new_tb_to_xs, new_stamp_xsec;
898 	u64 frac_sec;
899 
900 	if (clock != &clocksource_timebase)
901 		return;
902 
903 	xt.tv_sec = tk->xtime_sec;
904 	xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
905 
906 	/* Make userspace gettimeofday spin until we're done. */
907 	++vdso_data->tb_update_count;
908 	smp_mb();
909 
910 	/*
911 	 * This computes ((2^20 / 1e9) * mult) >> shift as a
912 	 * 0.64 fixed-point fraction.
913 	 * The computation in the else clause below won't overflow
914 	 * (as long as the timebase frequency is >= 1.049 MHz)
915 	 * but loses precision because we lose the low bits of the constant
916 	 * in the shift.  Note that 19342813113834067 ~= 2^(20+64) / 1e9.
917 	 * For a shift of 24 the error is about 0.5e-9, or about 0.5ns
918 	 * over a second.  (Shift values are usually 22, 23 or 24.)
919 	 * For high frequency clocks such as the 512MHz timebase clock
920 	 * on POWER[6789], the mult value is small (e.g. 32768000)
921 	 * and so we can shift the constant by 16 initially
922 	 * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
923 	 * remaining shifts after the multiplication, which gives a
924 	 * more accurate result (e.g. with mult = 32768000, shift = 24,
925 	 * the error is only about 1.2e-12, or 0.7ns over 10 minutes).
926 	 */
927 	if (mult <= 62500000 && clock->shift >= 16)
928 		new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
929 	else
930 		new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
931 
932 	/*
933 	 * Compute the fractional second in units of 2^-32 seconds.
934 	 * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
935 	 * in nanoseconds, so multiplying that by 2^32 / 1e9 gives
936 	 * it in units of 2^-32 seconds.
937 	 * We assume shift <= 32 because clocks_calc_mult_shift()
938 	 * generates shift values in the range 0 - 32.
939 	 */
940 	frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
941 	do_div(frac_sec, NSEC_PER_SEC);
942 
943 	/*
944 	 * Work out new stamp_xsec value for any legacy users of systemcfg.
945 	 * stamp_xsec is in units of 2^-20 seconds.
946 	 */
947 	new_stamp_xsec = frac_sec >> 12;
948 	new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
949 
950 	/*
951 	 * tb_update_count is used to allow the userspace gettimeofday code
952 	 * to assure itself that it sees a consistent view of the tb_to_xs and
953 	 * stamp_xsec variables.  It reads the tb_update_count, then reads
954 	 * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
955 	 * the two values of tb_update_count match and are even then the
956 	 * tb_to_xs and stamp_xsec values are consistent.  If not, then it
957 	 * loops back and reads them again until this criterion is met.
958 	 */
959 	vdso_data->tb_orig_stamp = cycle_last;
960 	vdso_data->stamp_xsec = new_stamp_xsec;
961 	vdso_data->tb_to_xs = new_tb_to_xs;
962 	vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
963 	vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
964 	vdso_data->stamp_xtime = xt;
965 	vdso_data->stamp_sec_fraction = frac_sec;
966 	smp_wmb();
967 	++(vdso_data->tb_update_count);
968 }
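/*
 * Unit check for the conversions above: frac_sec is the fractional second
 * in units of 2^-32 s, while stamp_xsec uses units of 2^-20 s (XSEC_PER_SEC
 * per second), so frac_sec >> 12 converts between the two.  For example,
 * half a second gives frac_sec = 2^31 and a stamp_xsec contribution of 2^19.
 */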
969 
970 void update_vsyscall_tz(void)
971 {
972 	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
973 	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
974 }
975 
976 static void __init clocksource_init(void)
977 {
978 	struct clocksource *clock;
979 
980 	if (__USE_RTC())
981 		clock = &clocksource_rtc;
982 	else
983 		clock = &clocksource_timebase;
984 
985 	if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
986 		printk(KERN_ERR "clocksource: %s is already registered\n",
987 		       clock->name);
988 		return;
989 	}
990 
991 	printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n",
992 	       clock->name, clock->mult, clock->shift);
993 }
994 
995 static int decrementer_set_next_event(unsigned long evt,
996 				      struct clock_event_device *dev)
997 {
998 	__this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt);
999 	set_dec(evt);
1000 
1001 	/* We may have raced with new irq work */
1002 	if (test_irq_work_pending())
1003 		set_dec(1);
1004 
1005 	return 0;
1006 }
1007 
1008 static int decrementer_shutdown(struct clock_event_device *dev)
1009 {
1010 	decrementer_set_next_event(decrementer_max, dev);
1011 	return 0;
1012 }
1013 
1014 static void register_decrementer_clockevent(int cpu)
1015 {
1016 	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
1017 
1018 	*dec = decrementer_clockevent;
1019 	dec->cpumask = cpumask_of(cpu);
1020 
1021 	clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max);
1022 
1023 	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
1024 		    dec->name, dec->mult, dec->shift, cpu);
1025 
1026 	/* Set values for KVM, see kvm_emulate_dec() */
1027 	decrementer_clockevent.mult = dec->mult;
1028 	decrementer_clockevent.shift = dec->shift;
1029 }
1030 
1031 static void enable_large_decrementer(void)
1032 {
1033 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
1034 		return;
1035 
1036 	if (decrementer_max <= DECREMENTER_DEFAULT_MAX)
1037 		return;
1038 
1039 	/*
1040 	 * If we're running as the hypervisor we need to enable the LD manually;
1041 	 * otherwise firmware should have done it for us.
1042 	 */
1043 	if (cpu_has_feature(CPU_FTR_HVMODE))
1044 		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_LD);
1045 }
1046 
1047 static void __init set_decrementer_max(void)
1048 {
1049 	struct device_node *cpu;
1050 	u32 bits = 32;
1051 
1052 	/* Prior to ISAv3 the decrementer is always 32 bit */
1053 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
1054 		return;
1055 
1056 	cpu = of_find_node_by_type(NULL, "cpu");
1057 
1058 	if (of_property_read_u32(cpu, "ibm,dec-bits", &bits) == 0) {
1059 		if (bits > 64 || bits < 32) {
1060 			pr_warn("time_init: firmware supplied invalid ibm,dec-bits\n");
1061 			bits = 32;
1062 		}
1063 
1064 		/* calculate the signed maximum given this many bits */
1065 		decrementer_max = (1ul << (bits - 1)) - 1;
1066 	}
1067 
1068 	of_node_put(cpu);
1069 
1070 	pr_info("time_init: %u bit decrementer (max: %llx)\n",
1071 		bits, decrementer_max);
1072 }
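/*
 * For example, a firmware-supplied ibm,dec-bits of 56 gives
 * decrementer_max = (1ul << 55) - 1 = 0x007fffffffffffff, the largest
 * value a 56-bit signed decrementer can count down from without going
 * negative.
 */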
1073 
1074 static void __init init_decrementer_clockevent(void)
1075 {
1076 	register_decrementer_clockevent(smp_processor_id());
1077 }
1078 
1079 void secondary_cpu_time_init(void)
1080 {
1081 	/* Enable and test the large decrementer for this cpu */
1082 	enable_large_decrementer();
1083 
1084 	/* Start the decrementer on CPUs that have manual control
1085 	 * such as BookE
1086 	 */
1087 	start_cpu_decrementer();
1088 
1089 	/* FIXME: Should make an unrelated change to move the snapshot_timebase
1090 	 * call here! */
1091 	register_decrementer_clockevent(smp_processor_id());
1092 }
1093 
1094 /* This function is only called on the boot processor */
1095 void __init time_init(void)
1096 {
1097 	struct div_result res;
1098 	u64 scale;
1099 	unsigned shift;
1100 
1101 	if (__USE_RTC()) {
1102 		/* 601 processor: dec counts down by 128 every 128ns */
1103 		ppc_tb_freq = 1000000000;
1104 	} else {
1105 		/* Normal PowerPC with timebase register */
1106 		ppc_md.calibrate_decr();
1107 		printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n",
1108 		       ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
1109 		printk(KERN_DEBUG "time_init: processor frequency   = %lu.%.6lu MHz\n",
1110 		       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
1111 	}
1112 
1113 	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
1114 	tb_ticks_per_sec = ppc_tb_freq;
1115 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
1116 	calc_cputime_factors();
1117 
1118 	/*
1119 	 * Compute scale factor for sched_clock.
1120 	 * The calibrate_decr() function has set tb_ticks_per_sec,
1121 	 * which is the timebase frequency.
1122 	 * We compute 1e9 * 2^64 / tb_ticks_per_sec and interpret
1123 	 * the 128-bit result as a 64.64 fixed-point number.
1124 	 * We then shift that number right until it is less than 1.0,
1125 	 * giving us the scale factor and shift count to use in
1126 	 * sched_clock().
1127 	 */
1128 	div128_by_32(1000000000, 0, tb_ticks_per_sec, &res);
1129 	scale = res.result_low;
1130 	for (shift = 0; res.result_high != 0; ++shift) {
1131 		scale = (scale >> 1) | (res.result_high << 63);
1132 		res.result_high >>= 1;
1133 	}
1134 	tb_to_ns_scale = scale;
1135 	tb_to_ns_shift = shift;
1136 	/* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
1137 	boot_tb = get_tb_or_rtc();
1138 
1139 	/* If platform provided a timezone (pmac), we correct the time */
1140 	if (timezone_offset) {
1141 		sys_tz.tz_minuteswest = -timezone_offset / 60;
1142 		sys_tz.tz_dsttime = 0;
1143 	}
1144 
1145 	vdso_data->tb_update_count = 0;
1146 	vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
1147 
1148 	/* initialise and enable the large decrementer (if we have one) */
1149 	set_decrementer_max();
1150 	enable_large_decrementer();
1151 
1152 	/* Start the decrementer on CPUs that have manual control
1153 	 * such as BookE
1154 	 */
1155 	start_cpu_decrementer();
1156 
1157 	/* Register the clocksource */
1158 	clocksource_init();
1159 
1160 	init_decrementer_clockevent();
1161 	tick_setup_hrtimer_broadcast();
1162 
1163 #ifdef CONFIG_COMMON_CLK
1164 	of_clk_init(NULL);
1165 #endif
1166 }
1167 
1168 /*
1169  * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
1170  * result.
1171  */
1172 void div128_by_32(u64 dividend_high, u64 dividend_low,
1173 		  unsigned divisor, struct div_result *dr)
1174 {
1175 	unsigned long a, b, c, d;
1176 	unsigned long w, x, y, z;
1177 	u64 ra, rb, rc;
1178 
1179 	a = dividend_high >> 32;
1180 	b = dividend_high & 0xffffffff;
1181 	c = dividend_low >> 32;
1182 	d = dividend_low & 0xffffffff;
1183 
1184 	w = a / divisor;
1185 	ra = ((u64)(a - (w * divisor)) << 32) + b;
1186 
1187 	rb = ((u64) do_div(ra, divisor) << 32) + c;
1188 	x = ra;
1189 
1190 	rc = ((u64) do_div(rb, divisor) << 32) + d;
1191 	y = rb;
1192 
1193 	do_div(rc, divisor);
1194 	z = rc;
1195 
1196 	dr->result_high = ((u64)w << 32) + x;
1197 	dr->result_low  = ((u64)y << 32) + z;
1198 
1199 }
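/*
 * Worked example: with a 512 MHz timebase, time_init() calls
 * div128_by_32(1000000000, 0, 512000000, &res) to divide 1e9 * 2^64 by
 * 512e6.  The schoolbook long division above works 32 bits at a time
 * (a:b:c:d are the four 32-bit digits of the dividend) and yields
 * result_high = 1, result_low = 0xf400000000000000, i.e. the quotient
 * 1.953125 * 2^64 as a 64.64 fixed-point number.
 */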
1200 
1201 /* We don't need to calibrate delay, we use the CPU timebase for that */
1202 void calibrate_delay(void)
1203 {
1204 	/* Some generic code (such as spinlock debug) uses loops_per_jiffy
1205 	 * as the number of __delay(1) in a jiffy, so make it so
1206 	 */
1207 	loops_per_jiffy = tb_ticks_per_jiffy;
1208 }
1209 
1210 #if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
1211 static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
1212 {
1213 	ppc_md.get_rtc_time(tm);
1214 	return 0;
1215 }
1216 
1217 static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
1218 {
1219 	if (!ppc_md.set_rtc_time)
1220 		return -EOPNOTSUPP;
1221 
1222 	if (ppc_md.set_rtc_time(tm) < 0)
1223 		return -EOPNOTSUPP;
1224 
1225 	return 0;
1226 }
1227 
1228 static const struct rtc_class_ops rtc_generic_ops = {
1229 	.read_time = rtc_generic_get_time,
1230 	.set_time = rtc_generic_set_time,
1231 };
1232 
1233 static int __init rtc_init(void)
1234 {
1235 	struct platform_device *pdev;
1236 
1237 	if (!ppc_md.get_rtc_time)
1238 		return -ENODEV;
1239 
1240 	pdev = platform_device_register_data(NULL, "rtc-generic", -1,
1241 					     &rtc_generic_ops,
1242 					     sizeof(rtc_generic_ops));
1243 
1244 	return PTR_ERR_OR_ZERO(pdev);
1245 }
1246 
1247 device_initcall(rtc_init);
1248 #endif
1249