1*921a3d4dSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 21da177e4SLinus Torvalds /* 31da177e4SLinus Torvalds * hangcheck-timer.c 41da177e4SLinus Torvalds * 51da177e4SLinus Torvalds * Driver for a little io fencing timer. 61da177e4SLinus Torvalds * 7696f9486SJoel Becker * Copyright (C) 2002, 2003 Oracle. All rights reserved. 81da177e4SLinus Torvalds * 91da177e4SLinus Torvalds * Author: Joel Becker <joel.becker@oracle.com> 101da177e4SLinus Torvalds */ 111da177e4SLinus Torvalds 121da177e4SLinus Torvalds /* 131da177e4SLinus Torvalds * The hangcheck-timer driver uses the TSC to catch delays that 141da177e4SLinus Torvalds * jiffies does not notice. A timer is set. When the timer fires, it 151da177e4SLinus Torvalds * checks whether it was delayed and if that delay exceeds a given 168dfba4d7SJoe Perches * margin of error. The hangcheck_tick module parameter takes the timer 171da177e4SLinus Torvalds * duration in seconds. The hangcheck_margin parameter defines the 181da177e4SLinus Torvalds * margin of error, in seconds. The defaults are 60 seconds for the 191da177e4SLinus Torvalds * timer and 180 seconds for the margin of error. IOW, a timer is set 201da177e4SLinus Torvalds * for 60 seconds. When the timer fires, the callback checks the 211da177e4SLinus Torvalds * actual duration that the timer waited. If the duration exceeds the 228b932edfSShile Zhang * allotted time and margin (here 60 + 180, or 240 seconds), the machine 231da177e4SLinus Torvalds * is restarted. A healthy machine will have the duration match the 241da177e4SLinus Torvalds * expected timeout very closely. 251da177e4SLinus Torvalds */ 261da177e4SLinus Torvalds 271da177e4SLinus Torvalds #include <linux/module.h> 281da177e4SLinus Torvalds #include <linux/moduleparam.h> 291da177e4SLinus Torvalds #include <linux/types.h> 301da177e4SLinus Torvalds #include <linux/kernel.h> 311da177e4SLinus Torvalds #include <linux/fs.h> 321da177e4SLinus Torvalds #include <linux/mm.h> 331da177e4SLinus Torvalds #include <linux/reboot.h> 341da177e4SLinus Torvalds #include <linux/init.h> 35696f9486SJoel Becker #include <linux/delay.h> 367c0f6ba6SLinus Torvalds #include <linux/uaccess.h> 37696f9486SJoel Becker #include <linux/sysrq.h> 38e8edc6e0SAlexey Dobriyan #include <linux/timer.h> 392044fdb0SThomas Gleixner #include <linux/hrtimer.h> 401da177e4SLinus Torvalds 41940370fcSYury Polyanskiy #define VERSION_STR "0.9.1" 421da177e4SLinus Torvalds 431da177e4SLinus Torvalds #define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 441da177e4SLinus Torvalds #define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 451da177e4SLinus Torvalds 461da177e4SLinus Torvalds static int hangcheck_tick = DEFAULT_IOFENCE_TICK; 471da177e4SLinus Torvalds static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 481da177e4SLinus Torvalds static int hangcheck_reboot; /* Defaults to not reboot */ 49696f9486SJoel Becker static int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 501da177e4SLinus Torvalds 51696f9486SJoel Becker /* options - modular */ 521da177e4SLinus Torvalds module_param(hangcheck_tick, int, 0); 531da177e4SLinus Torvalds MODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 541da177e4SLinus Torvalds module_param(hangcheck_margin, int, 0); 551da177e4SLinus Torvalds MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 561da177e4SLinus Torvalds module_param(hangcheck_reboot, int, 0); 571da177e4SLinus Torvalds MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded."); 58696f9486SJoel Becker module_param(hangcheck_dump_tasks, int, 0); 59696f9486SJoel Becker MODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 601da177e4SLinus Torvalds 61696f9486SJoel Becker MODULE_AUTHOR("Oracle"); 621da177e4SLinus Torvalds MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 631da177e4SLinus Torvalds MODULE_LICENSE("GPL"); 64696f9486SJoel Becker MODULE_VERSION(VERSION_STR); 65696f9486SJoel Becker 66696f9486SJoel Becker /* options - nonmodular */ 67696f9486SJoel Becker #ifndef MODULE 68696f9486SJoel Becker 69696f9486SJoel Becker static int __init hangcheck_parse_tick(char *str) 70696f9486SJoel Becker { 71696f9486SJoel Becker int par; 72696f9486SJoel Becker if (get_option(&str,&par)) 73696f9486SJoel Becker hangcheck_tick = par; 74696f9486SJoel Becker return 1; 75696f9486SJoel Becker } 76696f9486SJoel Becker 77696f9486SJoel Becker static int __init hangcheck_parse_margin(char *str) 78696f9486SJoel Becker { 79696f9486SJoel Becker int par; 80696f9486SJoel Becker if (get_option(&str,&par)) 81696f9486SJoel Becker hangcheck_margin = par; 82696f9486SJoel Becker return 1; 83696f9486SJoel Becker } 84696f9486SJoel Becker 85696f9486SJoel Becker static int __init hangcheck_parse_reboot(char *str) 86696f9486SJoel Becker { 87696f9486SJoel Becker int par; 88696f9486SJoel Becker if (get_option(&str,&par)) 89696f9486SJoel Becker hangcheck_reboot = par; 90696f9486SJoel Becker return 1; 91696f9486SJoel Becker } 92696f9486SJoel Becker 93696f9486SJoel Becker static int __init hangcheck_parse_dump_tasks(char *str) 94696f9486SJoel Becker { 95696f9486SJoel Becker int par; 96696f9486SJoel Becker if (get_option(&str,&par)) 97696f9486SJoel Becker hangcheck_dump_tasks = par; 98696f9486SJoel Becker return 1; 99696f9486SJoel Becker } 100696f9486SJoel Becker 101696f9486SJoel Becker __setup("hcheck_tick", hangcheck_parse_tick); 102696f9486SJoel Becker __setup("hcheck_margin", hangcheck_parse_margin); 103696f9486SJoel Becker __setup("hcheck_reboot", hangcheck_parse_reboot); 104696f9486SJoel Becker __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 105696f9486SJoel Becker #endif /* not MODULE */ 106696f9486SJoel Becker 107696f9486SJoel Becker #define TIMER_FREQ 1000000000ULL 1081da177e4SLinus Torvalds 1091da177e4SLinus Torvalds /* Last time scheduled */ 1101da177e4SLinus Torvalds static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 1111da177e4SLinus Torvalds 11224ed960aSKees Cook static void hangcheck_fire(struct timer_list *); 1131da177e4SLinus Torvalds 1141d27e3e2SKees Cook static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 1151da177e4SLinus Torvalds 11624ed960aSKees Cook static void hangcheck_fire(struct timer_list *unused) 1171da177e4SLinus Torvalds { 1181da177e4SLinus Torvalds unsigned long long cur_tsc, tsc_diff; 1191da177e4SLinus Torvalds 1202044fdb0SThomas Gleixner cur_tsc = ktime_get_ns(); 1211da177e4SLinus Torvalds 1221da177e4SLinus Torvalds if (cur_tsc > hangcheck_tsc) 1231da177e4SLinus Torvalds tsc_diff = cur_tsc - hangcheck_tsc; 1241da177e4SLinus Torvalds else 1251da177e4SLinus Torvalds tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 1261da177e4SLinus Torvalds 1271da177e4SLinus Torvalds if (tsc_diff > hangcheck_tsc_margin) { 128696f9486SJoel Becker if (hangcheck_dump_tasks) { 129696f9486SJoel Becker printk(KERN_CRIT "Hangcheck: Task state:\n"); 130696f9486SJoel Becker #ifdef CONFIG_MAGIC_SYSRQ 131f335397dSDmitry Torokhov handle_sysrq('t'); 132696f9486SJoel Becker #endif /* CONFIG_MAGIC_SYSRQ */ 133696f9486SJoel Becker } 1341da177e4SLinus Torvalds if (hangcheck_reboot) { 1351da177e4SLinus Torvalds printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n"); 136970d3244SEric W. Biederman emergency_restart(); 1371da177e4SLinus Torvalds } else { 1381da177e4SLinus Torvalds printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n"); 1391da177e4SLinus Torvalds } 1401da177e4SLinus Torvalds } 141940370fcSYury Polyanskiy #if 0 142940370fcSYury Polyanskiy /* 143940370fcSYury Polyanskiy * Enable to investigate delays in detail 144940370fcSYury Polyanskiy */ 145940370fcSYury Polyanskiy printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n", 146940370fcSYury Polyanskiy tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 147940370fcSYury Polyanskiy #endif 1481da177e4SLinus Torvalds mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 1492044fdb0SThomas Gleixner hangcheck_tsc = ktime_get_ns(); 1501da177e4SLinus Torvalds } 1511da177e4SLinus Torvalds 1521da177e4SLinus Torvalds 1531da177e4SLinus Torvalds static int __init hangcheck_init(void) 1541da177e4SLinus Torvalds { 1551da177e4SLinus Torvalds printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 1561da177e4SLinus Torvalds VERSION_STR, hangcheck_tick, hangcheck_margin); 157696f9486SJoel Becker hangcheck_tsc_margin = 158d0439a54SDan Carpenter (unsigned long long)hangcheck_margin + hangcheck_tick; 159d0439a54SDan Carpenter hangcheck_tsc_margin *= TIMER_FREQ; 1601da177e4SLinus Torvalds 1612044fdb0SThomas Gleixner hangcheck_tsc = ktime_get_ns(); 1621da177e4SLinus Torvalds mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 1631da177e4SLinus Torvalds 1641da177e4SLinus Torvalds return 0; 1651da177e4SLinus Torvalds } 1661da177e4SLinus Torvalds 1671da177e4SLinus Torvalds 1681da177e4SLinus Torvalds static void __exit hangcheck_exit(void) 1691da177e4SLinus Torvalds { 1701da177e4SLinus Torvalds del_timer_sync(&hangcheck_ticktock); 171696f9486SJoel Becker printk("Hangcheck: Stopped hangcheck timer.\n"); 1721da177e4SLinus Torvalds } 1731da177e4SLinus Torvalds 1741da177e4SLinus Torvalds module_init(hangcheck_init); 1751da177e4SLinus Torvalds module_exit(hangcheck_exit); 176