xref: /openbmc/linux/kernel/watchdog_perf.c (revision b17aa959)
16ea0d042SDouglas Anderson // SPDX-License-Identifier: GPL-2.0
26ea0d042SDouglas Anderson /*
36ea0d042SDouglas Anderson  * Detect hard lockups on a system using perf
46ea0d042SDouglas Anderson  *
56ea0d042SDouglas Anderson  * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
66ea0d042SDouglas Anderson  *
76ea0d042SDouglas Anderson  * Note: Most of this code is borrowed heavily from the original softlockup
86ea0d042SDouglas Anderson  * detector, so thanks to Ingo for the initial implementation.
96ea0d042SDouglas Anderson  * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
106ea0d042SDouglas Anderson  * to those contributors as well.
116ea0d042SDouglas Anderson  */
126ea0d042SDouglas Anderson 
136ea0d042SDouglas Anderson #define pr_fmt(fmt) "NMI watchdog: " fmt
146ea0d042SDouglas Anderson 
156ea0d042SDouglas Anderson #include <linux/nmi.h>
166ea0d042SDouglas Anderson #include <linux/atomic.h>
176ea0d042SDouglas Anderson #include <linux/module.h>
186ea0d042SDouglas Anderson #include <linux/sched/debug.h>
196ea0d042SDouglas Anderson 
206ea0d042SDouglas Anderson #include <asm/irq_regs.h>
216ea0d042SDouglas Anderson #include <linux/perf_event.h>
226ea0d042SDouglas Anderson 
236ea0d042SDouglas Anderson static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
246ea0d042SDouglas Anderson static DEFINE_PER_CPU(struct perf_event *, dead_event);
256ea0d042SDouglas Anderson static struct cpumask dead_events_mask;
266ea0d042SDouglas Anderson 
276ea0d042SDouglas Anderson static atomic_t watchdog_cpus = ATOMIC_INIT(0);
286ea0d042SDouglas Anderson 
296ea0d042SDouglas Anderson #ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
306ea0d042SDouglas Anderson static DEFINE_PER_CPU(ktime_t, last_timestamp);
316ea0d042SDouglas Anderson static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
326ea0d042SDouglas Anderson static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
336ea0d042SDouglas Anderson 
void watchdog_update_hrtimer_threshold(u64 period)
{
	/*
	 * The hrtimer runs with a period of (watchdog_threshold * 2) / 5,
	 * i.e. effectively at 2.5 times the rate of the NMI watchdog, so
	 * it should fire 2-3 times before the NMI watchdog expires.
	 *
	 * On x86 the NMI watchdog counts unhalted CPU cycles. With
	 * Turbo-Mode enabled the CPU can run way faster than the nominal
	 * frequency, which shrinks the real NMI period below the one
	 * deduced from the nominal frequency. Depending on the Turbo-Mode
	 * factor that can make the NMI period smaller than the hrtimer
	 * watchdog period and trigger false positives.
	 *
	 * The NMI handler checks against this sample threshold whether
	 * the minimum time between two NMI samples has elapsed, which
	 * filters out those false positives.
	 *
	 * period * 2 is 4/5 of the actual watchdog threshold period, so
	 * the hrtimer is guaranteed to fire at least once within the
	 * real watchdog threshold.
	 */
	watchdog_hrtimer_sample_threshold = period * 2;
}
596ea0d042SDouglas Anderson 
watchdog_check_timestamp(void)606ea0d042SDouglas Anderson static bool watchdog_check_timestamp(void)
616ea0d042SDouglas Anderson {
626ea0d042SDouglas Anderson 	ktime_t delta, now = ktime_get_mono_fast_ns();
636ea0d042SDouglas Anderson 
646ea0d042SDouglas Anderson 	delta = now - __this_cpu_read(last_timestamp);
656ea0d042SDouglas Anderson 	if (delta < watchdog_hrtimer_sample_threshold) {
666ea0d042SDouglas Anderson 		/*
676ea0d042SDouglas Anderson 		 * If ktime is jiffies based, a stalled timer would prevent
686ea0d042SDouglas Anderson 		 * jiffies from being incremented and the filter would look
696ea0d042SDouglas Anderson 		 * at a stale timestamp and never trigger.
706ea0d042SDouglas Anderson 		 */
716ea0d042SDouglas Anderson 		if (__this_cpu_inc_return(nmi_rearmed) < 10)
726ea0d042SDouglas Anderson 			return false;
736ea0d042SDouglas Anderson 	}
746ea0d042SDouglas Anderson 	__this_cpu_write(nmi_rearmed, 0);
756ea0d042SDouglas Anderson 	__this_cpu_write(last_timestamp, now);
766ea0d042SDouglas Anderson 	return true;
776ea0d042SDouglas Anderson }
786ea0d042SDouglas Anderson #else
/* Without CONFIG_HARDLOCKUP_CHECK_TIMESTAMP every NMI sample is accepted. */
static inline bool watchdog_check_timestamp(void)
{
	return true;
}
836ea0d042SDouglas Anderson #endif
846ea0d042SDouglas Anderson 
/*
 * Template attribute for the watchdog perf event: a pinned, initially
 * disabled hardware cycle counter. ->sample_period is filled in by
 * hardlockup_detector_event_create() from the current watchdog_thresh.
 */
static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};
926ea0d042SDouglas Anderson 
936ea0d042SDouglas Anderson /* Callback function for perf event subsystem */
watchdog_overflow_callback(struct perf_event * event,struct perf_sample_data * data,struct pt_regs * regs)946ea0d042SDouglas Anderson static void watchdog_overflow_callback(struct perf_event *event,
956ea0d042SDouglas Anderson 				       struct perf_sample_data *data,
966ea0d042SDouglas Anderson 				       struct pt_regs *regs)
976ea0d042SDouglas Anderson {
986ea0d042SDouglas Anderson 	/* Ensure the watchdog never gets throttled */
996ea0d042SDouglas Anderson 	event->hw.interrupts = 0;
1006ea0d042SDouglas Anderson 
1016ea0d042SDouglas Anderson 	if (!watchdog_check_timestamp())
1026ea0d042SDouglas Anderson 		return;
1036ea0d042SDouglas Anderson 
10477c12fc9SDouglas Anderson 	watchdog_hardlockup_check(smp_processor_id(), regs);
1056ea0d042SDouglas Anderson }
1066ea0d042SDouglas Anderson 
hardlockup_detector_event_create(void)1076ea0d042SDouglas Anderson static int hardlockup_detector_event_create(void)
1086ea0d042SDouglas Anderson {
1096ea0d042SDouglas Anderson 	unsigned int cpu;
1106ea0d042SDouglas Anderson 	struct perf_event_attr *wd_attr;
1116ea0d042SDouglas Anderson 	struct perf_event *evt;
1126ea0d042SDouglas Anderson 
1136ea0d042SDouglas Anderson 	/*
1146ea0d042SDouglas Anderson 	 * Preemption is not disabled because memory will be allocated.
1156ea0d042SDouglas Anderson 	 * Ensure CPU-locality by calling this in per-CPU kthread.
1166ea0d042SDouglas Anderson 	 */
1176ea0d042SDouglas Anderson 	WARN_ON(!is_percpu_thread());
1186ea0d042SDouglas Anderson 	cpu = raw_smp_processor_id();
1196ea0d042SDouglas Anderson 	wd_attr = &wd_hw_attr;
1206ea0d042SDouglas Anderson 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
1216ea0d042SDouglas Anderson 
1226ea0d042SDouglas Anderson 	/* Try to register using hardware perf events */
1236ea0d042SDouglas Anderson 	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
1246ea0d042SDouglas Anderson 					       watchdog_overflow_callback, NULL);
1256ea0d042SDouglas Anderson 	if (IS_ERR(evt)) {
1266ea0d042SDouglas Anderson 		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
1276ea0d042SDouglas Anderson 			 PTR_ERR(evt));
1286ea0d042SDouglas Anderson 		return PTR_ERR(evt);
1296ea0d042SDouglas Anderson 	}
1306ea0d042SDouglas Anderson 	this_cpu_write(watchdog_ev, evt);
1316ea0d042SDouglas Anderson 	return 0;
1326ea0d042SDouglas Anderson }
1336ea0d042SDouglas Anderson 
1346ea0d042SDouglas Anderson /**
135d9b3629aSDouglas Anderson  * watchdog_hardlockup_enable - Enable the local event
136d9b3629aSDouglas Anderson  *
137d9b3629aSDouglas Anderson  * @cpu: The CPU to enable hard lockup on.
1386ea0d042SDouglas Anderson  */
watchdog_hardlockup_enable(unsigned int cpu)139d9b3629aSDouglas Anderson void watchdog_hardlockup_enable(unsigned int cpu)
1406ea0d042SDouglas Anderson {
141d9b3629aSDouglas Anderson 	WARN_ON_ONCE(cpu != smp_processor_id());
142d9b3629aSDouglas Anderson 
1436ea0d042SDouglas Anderson 	if (hardlockup_detector_event_create())
1446ea0d042SDouglas Anderson 		return;
1456ea0d042SDouglas Anderson 
1466ea0d042SDouglas Anderson 	/* use original value for check */
1476ea0d042SDouglas Anderson 	if (!atomic_fetch_inc(&watchdog_cpus))
1486ea0d042SDouglas Anderson 		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
1496ea0d042SDouglas Anderson 
1506ea0d042SDouglas Anderson 	perf_event_enable(this_cpu_read(watchdog_ev));
1516ea0d042SDouglas Anderson }
1526ea0d042SDouglas Anderson 
1536ea0d042SDouglas Anderson /**
154d9b3629aSDouglas Anderson  * watchdog_hardlockup_disable - Disable the local event
155d9b3629aSDouglas Anderson  *
156d9b3629aSDouglas Anderson  * @cpu: The CPU to enable hard lockup on.
1576ea0d042SDouglas Anderson  */
watchdog_hardlockup_disable(unsigned int cpu)158d9b3629aSDouglas Anderson void watchdog_hardlockup_disable(unsigned int cpu)
1596ea0d042SDouglas Anderson {
1606ea0d042SDouglas Anderson 	struct perf_event *event = this_cpu_read(watchdog_ev);
1616ea0d042SDouglas Anderson 
162d9b3629aSDouglas Anderson 	WARN_ON_ONCE(cpu != smp_processor_id());
163d9b3629aSDouglas Anderson 
1646ea0d042SDouglas Anderson 	if (event) {
1656ea0d042SDouglas Anderson 		perf_event_disable(event);
1666ea0d042SDouglas Anderson 		this_cpu_write(watchdog_ev, NULL);
1676ea0d042SDouglas Anderson 		this_cpu_write(dead_event, event);
1686ea0d042SDouglas Anderson 		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
1696ea0d042SDouglas Anderson 		atomic_dec(&watchdog_cpus);
1706ea0d042SDouglas Anderson 	}
1716ea0d042SDouglas Anderson }
1726ea0d042SDouglas Anderson 
1736ea0d042SDouglas Anderson /**
1746ea0d042SDouglas Anderson  * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
1756ea0d042SDouglas Anderson  *
1766ea0d042SDouglas Anderson  * Called from lockup_detector_cleanup(). Serialized by the caller.
1776ea0d042SDouglas Anderson  */
hardlockup_detector_perf_cleanup(void)1786ea0d042SDouglas Anderson void hardlockup_detector_perf_cleanup(void)
1796ea0d042SDouglas Anderson {
1806ea0d042SDouglas Anderson 	int cpu;
1816ea0d042SDouglas Anderson 
1826ea0d042SDouglas Anderson 	for_each_cpu(cpu, &dead_events_mask) {
1836ea0d042SDouglas Anderson 		struct perf_event *event = per_cpu(dead_event, cpu);
1846ea0d042SDouglas Anderson 
1856ea0d042SDouglas Anderson 		/*
1866ea0d042SDouglas Anderson 		 * Required because for_each_cpu() reports  unconditionally
1876ea0d042SDouglas Anderson 		 * CPU0 as set on UP kernels. Sigh.
1886ea0d042SDouglas Anderson 		 */
1896ea0d042SDouglas Anderson 		if (event)
1906ea0d042SDouglas Anderson 			perf_event_release_kernel(event);
1916ea0d042SDouglas Anderson 		per_cpu(dead_event, cpu) = NULL;
1926ea0d042SDouglas Anderson 	}
1936ea0d042SDouglas Anderson 	cpumask_clear(&dead_events_mask);
1946ea0d042SDouglas Anderson }
1956ea0d042SDouglas Anderson 
1966ea0d042SDouglas Anderson /**
1976ea0d042SDouglas Anderson  * hardlockup_detector_perf_stop - Globally stop watchdog events
1986ea0d042SDouglas Anderson  *
1996ea0d042SDouglas Anderson  * Special interface for x86 to handle the perf HT bug.
2006ea0d042SDouglas Anderson  */
hardlockup_detector_perf_stop(void)2016ea0d042SDouglas Anderson void __init hardlockup_detector_perf_stop(void)
2026ea0d042SDouglas Anderson {
2036ea0d042SDouglas Anderson 	int cpu;
2046ea0d042SDouglas Anderson 
2056ea0d042SDouglas Anderson 	lockdep_assert_cpus_held();
2066ea0d042SDouglas Anderson 
2076ea0d042SDouglas Anderson 	for_each_online_cpu(cpu) {
2086ea0d042SDouglas Anderson 		struct perf_event *event = per_cpu(watchdog_ev, cpu);
2096ea0d042SDouglas Anderson 
2106ea0d042SDouglas Anderson 		if (event)
2116ea0d042SDouglas Anderson 			perf_event_disable(event);
2126ea0d042SDouglas Anderson 	}
2136ea0d042SDouglas Anderson }
2146ea0d042SDouglas Anderson 
2156ea0d042SDouglas Anderson /**
2166ea0d042SDouglas Anderson  * hardlockup_detector_perf_restart - Globally restart watchdog events
2176ea0d042SDouglas Anderson  *
2186ea0d042SDouglas Anderson  * Special interface for x86 to handle the perf HT bug.
2196ea0d042SDouglas Anderson  */
hardlockup_detector_perf_restart(void)2206ea0d042SDouglas Anderson void __init hardlockup_detector_perf_restart(void)
2216ea0d042SDouglas Anderson {
2226ea0d042SDouglas Anderson 	int cpu;
2236ea0d042SDouglas Anderson 
2246ea0d042SDouglas Anderson 	lockdep_assert_cpus_held();
2256ea0d042SDouglas Anderson 
226df95d308SDouglas Anderson 	if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
2276ea0d042SDouglas Anderson 		return;
2286ea0d042SDouglas Anderson 
2296ea0d042SDouglas Anderson 	for_each_online_cpu(cpu) {
2306ea0d042SDouglas Anderson 		struct perf_event *event = per_cpu(watchdog_ev, cpu);
2316ea0d042SDouglas Anderson 
2326ea0d042SDouglas Anderson 		if (event)
2336ea0d042SDouglas Anderson 			perf_event_enable(event);
2346ea0d042SDouglas Anderson 	}
2356ea0d042SDouglas Anderson }
2366ea0d042SDouglas Anderson 
/*
 * Weak default: assume a perf NMI source is available. Architectures
 * can override this to report otherwise.
 */
bool __weak __init arch_perf_nmi_is_available(void)
{
	return true;
}
241*b17aa959SDouglas Anderson 
2426ea0d042SDouglas Anderson /**
243d9b3629aSDouglas Anderson  * watchdog_hardlockup_probe - Probe whether NMI event is available at all
2446ea0d042SDouglas Anderson  */
watchdog_hardlockup_probe(void)245d9b3629aSDouglas Anderson int __init watchdog_hardlockup_probe(void)
2466ea0d042SDouglas Anderson {
247*b17aa959SDouglas Anderson 	int ret;
248*b17aa959SDouglas Anderson 
249*b17aa959SDouglas Anderson 	if (!arch_perf_nmi_is_available())
250*b17aa959SDouglas Anderson 		return -ENODEV;
251*b17aa959SDouglas Anderson 
252*b17aa959SDouglas Anderson 	ret = hardlockup_detector_event_create();
2536ea0d042SDouglas Anderson 
2546ea0d042SDouglas Anderson 	if (ret) {
2556ea0d042SDouglas Anderson 		pr_info("Perf NMI watchdog permanently disabled\n");
2566ea0d042SDouglas Anderson 	} else {
2576ea0d042SDouglas Anderson 		perf_event_release_kernel(this_cpu_read(watchdog_ev));
2586ea0d042SDouglas Anderson 		this_cpu_write(watchdog_ev, NULL);
2596ea0d042SDouglas Anderson 	}
2606ea0d042SDouglas Anderson 	return ret;
2616ea0d042SDouglas Anderson }
262