/* Pseudo NMI support on sparc64 systems.
 *
 * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
 *
 * The NMI watchdog support and infrastructure is based almost
 * entirely upon the x86 NMI support code.
 */
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/kernel_stat.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/perf_event.h>
#include <asm/ptrace.h>
#include <asm/pcr.h>

#include "kstack.h"

/* We don't have a real NMI on sparc64, but we can fake one
 * up using profiling counter overflow interrupts and interrupt
 * levels.
 *
 * The profile overflow interrupts at level 15, so we use
 * level 14 as our IRQ off level.
 */

static int panic_on_timeout;

/* nmi_active:
 * >0: the NMI watchdog is active, but can be disabled
 * <0: the NMI watchdog has not been set up, and cannot be enabled
 *  0: the NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);	/* oprofile uses this */
EXPORT_SYMBOL(nmi_active);
static int nmi_init_done;
static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
static int endflag __initdata;

static DEFINE_PER_CPU(unsigned int, last_irq_sum);
static DEFINE_PER_CPU(long, alert_counter);
static DEFINE_PER_CPU(int, nmi_touch);

void arch_touch_nmi_watchdog(void)
{
	if (atomic_read(&nmi_active)) {
		int cpu;

		for_each_present_cpu(cpu) {
			if (per_cpu(nmi_touch, cpu) != 1)
				per_cpu(nmi_touch, cpu) = 1;
		}
	}
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
{
	int this_cpu = smp_processor_id();

	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0,
		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
		return;

	if (do_panic || panic_on_oops)
		panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
	else
		WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
}

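/* Pseudo-NMI handler, invoked at level 15 when the profiling counter
 * overflows.  Each invocation samples this CPU's timer interrupt count
 * (irq0_irqs); if that count fails to advance for 30 * nmi_hz
 * consecutive overflows (roughly 30 seconds) and nobody has touched
 * the watchdog, the CPU is reported as hard locked up via die_nmi().
 * The counter is re-armed for the next period as long as the watchdog
 * is still enabled on this CPU.
 */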
notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
{
	unsigned int sum, touched = 0;
	void *orig_sp;

	clear_softint(1 << irq);

	local_cpu_data().__nmi_count++;

	nmi_enter();

	orig_sp = set_hardirq_stack();

	if (notify_die(DIE_NMI, "nmi", regs, 0,
		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
		touched = 1;
	else
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);

	sum = local_cpu_data().irq0_irqs;
	if (__this_cpu_read(nmi_touch)) {
		__this_cpu_write(nmi_touch, 0);
		touched = 1;
	}
	if (!touched && __this_cpu_read(last_irq_sum) == sum) {
		__this_cpu_inc(alert_counter);
		if (__this_cpu_read(alert_counter) == 30 * nmi_hz)
			die_nmi("BUG: NMI Watchdog detected LOCKUP",
				regs, panic_on_timeout);
	} else {
		__this_cpu_write(last_irq_sum, sum);
		__this_cpu_write(alert_counter, 0);
	}
	if (__this_cpu_read(wd_enabled)) {
		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
	}

	restore_hardirq_stack(orig_sp);

	nmi_exit();
}

static inline unsigned int get_nmi_count(int cpu)
{
	return cpu_data(cpu).__nmi_count;
}

static __init void nmi_cpu_busy(void *data)
{
	while (endflag == 0)
		mb();
}

static void report_broken_nmi(int cpu, int *prev_nmi_count)
{
	printk(KERN_CONT "\n");

	printk(KERN_WARNING
		"WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
			cpu, prev_nmi_count[cpu], get_nmi_count(cpu));

	printk(KERN_WARNING
		"Please report this to bugzilla.kernel.org,\n");
	printk(KERN_WARNING
		"and attach the output of the 'dmesg' command.\n");

	per_cpu(wd_enabled, cpu) = 0;
	atomic_dec(&nmi_active);
}

void stop_nmi_watchdog(void *unused)
{
	if (!__this_cpu_read(wd_enabled))
		return;
	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	__this_cpu_write(wd_enabled, 0);
	atomic_dec(&nmi_active);
}

static int __init check_nmi_watchdog(void)
{
	unsigned int *prev_nmi_count;
	int cpu, err;

	if (!atomic_read(&nmi_active))
		return 0;

	prev_nmi_count = kmalloc_array(nr_cpu_ids, sizeof(unsigned int),
				       GFP_KERNEL);
	if (!prev_nmi_count) {
		err = -ENOMEM;
		goto error;
	}

	printk(KERN_INFO "Testing NMI watchdog ... ");

	smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);

	for_each_possible_cpu(cpu)
		prev_nmi_count[cpu] = get_nmi_count(cpu);
	local_irq_enable();
	mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */

	for_each_online_cpu(cpu) {
		if (!per_cpu(wd_enabled, cpu))
			continue;
		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
			report_broken_nmi(cpu, prev_nmi_count);
	}
	endflag = 1;
	if (!atomic_read(&nmi_active)) {
		kfree(prev_nmi_count);
		atomic_set(&nmi_active, -1);
		err = -ENODEV;
		goto error;
	}
	printk("OK.\n");

	nmi_hz = 1;

	kfree(prev_nmi_count);
	return 0;
error:
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return err;
}

void start_nmi_watchdog(void *unused)
{
	if (__this_cpu_read(wd_enabled))
		return;

	__this_cpu_write(wd_enabled, 1);
	atomic_inc(&nmi_active);

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

static void nmi_adjust_hz_one(void *unused)
{
	if (!__this_cpu_read(wd_enabled))
		return;

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

void nmi_adjust_hz(unsigned int new_hz)
{
	nmi_hz = new_hz;
	on_each_cpu(nmi_adjust_hz_one, NULL, 1);
}
EXPORT_SYMBOL_GPL(nmi_adjust_hz);

static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
{
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return 0;
}

static struct notifier_block nmi_reboot_notifier = {
	.notifier_call = nmi_shutdown,
};

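/* Boot-time setup: arm the watchdog on every CPU, run the
 * check_nmi_watchdog() self-test, and register a reboot notifier so
 * the profiling counters are switched off again at reboot time.
 */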
int __init nmi_init(void)
{
	int err;

	on_each_cpu(start_nmi_watchdog, NULL, 1);

	err = check_nmi_watchdog();
	if (!err) {
		err = register_reboot_notifier(&nmi_reboot_notifier);
		if (err) {
			on_each_cpu(stop_nmi_watchdog, NULL, 1);
			atomic_set(&nmi_active, -1);
		}
	}

	nmi_init_done = 1;

	return err;
}

static int __init setup_nmi_watchdog(char *str)
{
	if (!strncmp(str, "panic", 5))
		panic_on_timeout = 1;

	return 0;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);

/*
 * sparc specific NMI watchdog enable function.
 * Enables watchdog if it is not enabled already.
 */
int watchdog_nmi_enable(unsigned int cpu)
{
	if (atomic_read(&nmi_active) == -1) {
		pr_warn("NMI watchdog cannot be enabled or disabled\n");
		return -1;
	}

	/*
	 * The watchdog thread could start even before nmi_init() has
	 * run.  Just return in that case and let nmi_init() finish the
	 * initialization first.
	 */
	if (!nmi_init_done)
		return 0;

	smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1);

	return 0;
}
/*
 * sparc specific NMI watchdog disable function.
 * Disables watchdog if it is not disabled already.
 */
void watchdog_nmi_disable(unsigned int cpu)
{
	if (atomic_read(&nmi_active) == -1)
		pr_warn_once("NMI watchdog cannot be enabled or disabled\n");
	else
		smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1);
}