xref: /openbmc/linux/drivers/char/hangcheck-timer.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
1*1da177e4SLinus Torvalds /*
2*1da177e4SLinus Torvalds  * hangcheck-timer.c
3*1da177e4SLinus Torvalds  *
4*1da177e4SLinus Torvalds  * Driver for a little io fencing timer.
5*1da177e4SLinus Torvalds  *
6*1da177e4SLinus Torvalds  * Copyright (C) 2002 Oracle Corporation.  All rights reserved.
7*1da177e4SLinus Torvalds  *
8*1da177e4SLinus Torvalds  * Author: Joel Becker <joel.becker@oracle.com>
9*1da177e4SLinus Torvalds  *
10*1da177e4SLinus Torvalds  * This program is free software; you can redistribute it and/or
11*1da177e4SLinus Torvalds  * modify it under the terms of the GNU General Public
12*1da177e4SLinus Torvalds  * License version 2 as published by the Free Software Foundation.
13*1da177e4SLinus Torvalds  *
14*1da177e4SLinus Torvalds  * This program is distributed in the hope that it will be useful,
15*1da177e4SLinus Torvalds  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16*1da177e4SLinus Torvalds  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17*1da177e4SLinus Torvalds  * General Public License for more details.
18*1da177e4SLinus Torvalds  *
19*1da177e4SLinus Torvalds  * You should have received a copy of the GNU General Public
20*1da177e4SLinus Torvalds  * License along with this program; if not, write to the
21*1da177e4SLinus Torvalds  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22*1da177e4SLinus Torvalds  * Boston, MA 021110-1307, USA.
23*1da177e4SLinus Torvalds  */
24*1da177e4SLinus Torvalds 
/*
 * The hangcheck-timer driver uses the TSC to catch delays that
 * jiffies does not notice.  A timer is set.  When the timer fires, it
 * checks whether it was delayed and if that delay exceeds a given
 * margin of error.  The hangcheck_tick module parameter takes the timer
 * duration in seconds.  The hangcheck_margin parameter defines the
 * margin of error, in seconds.  The defaults are 180 seconds for the
 * timer and 60 seconds for the margin of error.  IOW, a timer is set
 * for 180 seconds.  When the timer fires, the callback checks the
 * actual duration that the timer waited.  If the duration exceeds the
 * allotted time and margin (here 180 + 60, or 240 seconds), the machine
 * is restarted (when hangcheck_reboot is set; otherwise a critical
 * message is logged).  A healthy machine will have the duration match
 * the expected timeout very closely.
 */
39*1da177e4SLinus Torvalds 
40*1da177e4SLinus Torvalds #include <linux/module.h>
41*1da177e4SLinus Torvalds #include <linux/moduleparam.h>
42*1da177e4SLinus Torvalds #include <linux/types.h>
43*1da177e4SLinus Torvalds #include <linux/kernel.h>
44*1da177e4SLinus Torvalds #include <linux/fs.h>
45*1da177e4SLinus Torvalds #include <linux/mm.h>
46*1da177e4SLinus Torvalds #include <linux/reboot.h>
47*1da177e4SLinus Torvalds #include <linux/init.h>
48*1da177e4SLinus Torvalds #include <asm/uaccess.h>
49*1da177e4SLinus Torvalds 
50*1da177e4SLinus Torvalds 
/* Driver version; reported in the startup message printed by hangcheck_init(). */
#define VERSION_STR "0.5.0"

/* Compile-time defaults for the hangcheck_margin and hangcheck_tick parameters. */
#define DEFAULT_IOFENCE_MARGIN 60	/* Default fudge factor, in seconds */
#define DEFAULT_IOFENCE_TICK 180	/* Default timer timeout, in seconds */

/*
 * Runtime settings, overridable through the module parameters declared
 * below.  tick and margin are expressed in seconds.
 */
static int hangcheck_tick = DEFAULT_IOFENCE_TICK;
static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN;
static int hangcheck_reboot;  /* Defaults to not reboot */

/* Driver options */
module_param(hangcheck_tick, int, 0);
MODULE_PARM_DESC(hangcheck_tick, "Timer delay.");
module_param(hangcheck_margin, int, 0);
MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire.");
module_param(hangcheck_reboot, int, 0);
MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded.");

MODULE_AUTHOR("Joel Becker");
MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin.");
MODULE_LICENSE("GPL");


/*
 * hangcheck_tsc:        monotonic_clock() reading taken each time the timer
 *                       is armed (in hangcheck_init() and hangcheck_fire()).
 * hangcheck_tsc_margin: largest elapsed clock delta tolerated between two
 *                       firings; computed in hangcheck_init() as
 *                       (hangcheck_margin + hangcheck_tick) * 1000000000.
 */
static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;

static void hangcheck_fire(unsigned long);

/* The periodic timer; its callback is hangcheck_fire(), which re-arms it. */
static struct timer_list hangcheck_ticktock =
		TIMER_INITIALIZER(hangcheck_fire, 0, 0);

/*
 * Clock source used for the delay measurement.  Declared locally; the
 * definition lives outside this file.
 * NOTE(review): presumably returns nanoseconds, given the 1000000000
 * scaling of the margin -- confirm against the implementation.
 */
extern unsigned long long monotonic_clock(void);
82*1da177e4SLinus Torvalds 
83*1da177e4SLinus Torvalds static void hangcheck_fire(unsigned long data)
84*1da177e4SLinus Torvalds {
85*1da177e4SLinus Torvalds 	unsigned long long cur_tsc, tsc_diff;
86*1da177e4SLinus Torvalds 
87*1da177e4SLinus Torvalds 	cur_tsc = monotonic_clock();
88*1da177e4SLinus Torvalds 
89*1da177e4SLinus Torvalds 	if (cur_tsc > hangcheck_tsc)
90*1da177e4SLinus Torvalds 		tsc_diff = cur_tsc - hangcheck_tsc;
91*1da177e4SLinus Torvalds 	else
92*1da177e4SLinus Torvalds 		tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */
93*1da177e4SLinus Torvalds 
94*1da177e4SLinus Torvalds 	if (tsc_diff > hangcheck_tsc_margin) {
95*1da177e4SLinus Torvalds 		if (hangcheck_reboot) {
96*1da177e4SLinus Torvalds 			printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n");
97*1da177e4SLinus Torvalds 			machine_restart(NULL);
98*1da177e4SLinus Torvalds 		} else {
99*1da177e4SLinus Torvalds 			printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n");
100*1da177e4SLinus Torvalds 		}
101*1da177e4SLinus Torvalds 	}
102*1da177e4SLinus Torvalds 	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
103*1da177e4SLinus Torvalds 	hangcheck_tsc = monotonic_clock();
104*1da177e4SLinus Torvalds }
105*1da177e4SLinus Torvalds 
106*1da177e4SLinus Torvalds 
107*1da177e4SLinus Torvalds static int __init hangcheck_init(void)
108*1da177e4SLinus Torvalds {
109*1da177e4SLinus Torvalds 	printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n",
110*1da177e4SLinus Torvalds 	       VERSION_STR, hangcheck_tick, hangcheck_margin);
111*1da177e4SLinus Torvalds 
112*1da177e4SLinus Torvalds 	hangcheck_tsc_margin = hangcheck_margin + hangcheck_tick;
113*1da177e4SLinus Torvalds 	hangcheck_tsc_margin *= 1000000000;
114*1da177e4SLinus Torvalds 
115*1da177e4SLinus Torvalds 
116*1da177e4SLinus Torvalds 	hangcheck_tsc = monotonic_clock();
117*1da177e4SLinus Torvalds 	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
118*1da177e4SLinus Torvalds 
119*1da177e4SLinus Torvalds 	return 0;
120*1da177e4SLinus Torvalds }
121*1da177e4SLinus Torvalds 
122*1da177e4SLinus Torvalds 
/*
 * hangcheck_exit - module exit point.
 *
 * Deactivates the hangcheck timer with del_timer_sync() before the
 * module text goes away.
 */
static void __exit hangcheck_exit(void)
{
	del_timer_sync(&hangcheck_ticktock);
}
127*1da177e4SLinus Torvalds 
/* Register the module's load and unload entry points. */
module_init(hangcheck_init);
module_exit(hangcheck_exit);
130