/*
 * Machine check handler
 *
 * Copyright IBM Corp. 2000,2009
 * Author(s): Ingo Adlung <adlung@de.ibm.com>,
 *	      Martin Schwidefsky <schwidefsky@de.ibm.com>,
 *	      Cornelia Huck <cornelia.huck@de.ibm.com>,
 *	      Heiko Carstens <heiko.carstens@de.ibm.com>,
 */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/module.h>
#include <asm/lowcore.h>
#include <asm/smp.h>
#include <asm/etr.h>
#include <asm/cpu.h>
#include <asm/nmi.h>
#include <asm/crw.h>

/*
 * Per-cpu record of machine-check work accumulated by the low-level
 * handler (s390_do_machine_check) and later processed in a saner
 * context by s390_handle_mcck.
 */
struct mcck_struct {
	int kill_task;		/* registers were lost in user mode ->
				 * current task must be terminated */
	int channel_report;	/* channel report word (CRW) pending */
	int warning;		/* warning-class machine check pending */
	unsigned long long mcck_code;	/* saved machine-check interruption
					 * code, reported when killing */
};

static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);

/*
 * Unrecoverable machine check: stop all other cpus and put this one
 * into a disabled wait. Never returns; the endless loop backs up the
 * noreturn contract in case disabled_wait() ever came back.
 * Note: msg is not printed here — it only documents the reason at the
 * call sites.
 */
static NORET_TYPE void s390_handle_damage(char *msg)
{
	smp_send_stop();
	disabled_wait((unsigned long) __builtin_return_address(0));
	while (1);
}

/*
 * Main machine check handler function. Will be called with interrupts enabled
 * or disabled and machine checks enabled or disabled.
 */
void s390_handle_mcck(void)
{
	unsigned long flags;
	struct mcck_struct mcck;

	/*
	 * Disable machine checks and get the current state of accumulated
	 * machine checks. Afterwards delete the old state and enable machine
	 * checks again. The snapshot/clear must happen with machine checks
	 * disabled so a new machine check cannot race with the reset.
	 */
	local_irq_save(flags);
	local_mcck_disable();
	mcck = __get_cpu_var(cpu_mcck);
	memset(&__get_cpu_var(cpu_mcck), 0, sizeof(struct mcck_struct));
	clear_thread_flag(TIF_MCCK_PENDING);
	local_mcck_enable();
	local_irq_restore(flags);

	if (mcck.channel_report)
		crw_handle_channel_report();
	/*
	 * A warning may remain for a prolonged period on the bare iron.
	 * (actually until the machine is powered off, or the problem is gone)
	 * So we just stop listening for the WARNING MCH and avoid continuously
	 * being interrupted. One caveat is however, that we must do this per
	 * processor and cannot use the smp version of ctl_clear_bit().
	 * On VM we only get one interrupt per virtually presented machine
	 * check. Though one suffices, we may get one interrupt per
	 * (virtual) cpu.
	 */
	if (mcck.warning) {	/* WARNING pending ? */
		static int mchchk_wng_posted = 0;

		/* Use single cpu clear, as we cannot handle smp here. */
		__ctl_clear_bit(14, 24);	/* Disable WARNING MCH */
		/* xchg makes sure SIGPWR is posted exactly once system-wide */
		if (xchg(&mchchk_wng_posted, 1) == 0)
			kill_cad_pid(SIGPWR, 1);
	}
	if (mcck.kill_task) {
		local_irq_enable();
		printk(KERN_EMERG "mcck: Terminating task because of machine "
		       "malfunction (code 0x%016llx).\n", mcck.mcck_code);
		printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
		       current->comm, current->pid);
		do_exit(SIGSEGV);
	}
}
EXPORT_SYMBOL_GPL(s390_handle_mcck);

/*
 * Revalidate cpu register contents from the machine-check save areas
 * in the lowcore, as far as the validity bits in the machine-check
 * interruption code (mci) allow.
 *
 * returns 0 if all registers could be validated
 * returns 1 otherwise (caller must terminate the current task)
 * Does not return at all if control registers or the old PSW are
 * unrecoverable (s390_handle_damage).
 */
static int notrace s390_revalidate_registers(struct mci *mci)
{
	int kill_task;
	u64 tmpclock;
	u64 zero;
	void *fpt_save_area, *fpt_creg_save_area;

	kill_task = 0;
	zero = 0;

	if (!mci->gr) {
		/*
		 * General purpose registers couldn't be restored and have
		 * unknown contents. Process needs to be terminated.
		 */
		kill_task = 1;
	}
	if (!mci->fp) {
		/*
		 * Floating point registers can't be restored and
		 * therefore the process needs to be terminated.
		 */
		kill_task = 1;
	}
#ifndef CONFIG_64BIT
	/* 31-bit: reload the four basic fp registers from the lowcore */
	asm volatile(
		" ld 0,0(%0)\n"
		" ld 2,8(%0)\n"
		" ld 4,16(%0)\n"
		" ld 6,24(%0)"
		: : "a" (&S390_lowcore.floating_pt_save_area));
#endif

	if (MACHINE_HAS_IEEE) {
#ifdef CONFIG_64BIT
		fpt_save_area = &S390_lowcore.floating_pt_save_area;
		fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area;
#else
		/* 31-bit keeps the IEEE registers in the extended save area */
		fpt_save_area = (void *) S390_lowcore.extended_save_area_addr;
		fpt_creg_save_area = fpt_save_area + 128;
#endif
		if (!mci->fc) {
			/*
			 * Floating point control register can't be restored.
			 * Load a defined value (zero) instead and terminate
			 * the task.
			 */
			asm volatile("lfpc 0(%0)" : : "a" (&zero), "m" (zero));
			kill_task = 1;

		} else
			asm volatile("lfpc 0(%0)" : : "a" (fpt_creg_save_area));

		/* Reload all 16 floating point registers */
		asm volatile(
			" ld 0,0(%0)\n"
			" ld 1,8(%0)\n"
			" ld 2,16(%0)\n"
			" ld 3,24(%0)\n"
			" ld 4,32(%0)\n"
			" ld 5,40(%0)\n"
			" ld 6,48(%0)\n"
			" ld 7,56(%0)\n"
			" ld 8,64(%0)\n"
			" ld 9,72(%0)\n"
			" ld 10,80(%0)\n"
			" ld 11,88(%0)\n"
			" ld 12,96(%0)\n"
			" ld 13,104(%0)\n"
			" ld 14,112(%0)\n"
			" ld 15,120(%0)\n"
			: : "a" (fpt_save_area));
	}
	/* Revalidate access registers */
	asm volatile(
		" lam 0,15,0(%0)"
		: : "a" (&S390_lowcore.access_regs_save_area));
	if (!mci->ar) {
		/*
		 * Access registers have unknown contents.
		 * Terminating task.
		 */
		kill_task = 1;
	}
	/* Revalidate control registers */
	if (!mci->cr) {
		/*
		 * Control registers have unknown contents.
		 * Can't recover and therefore stopping machine.
		 */
		s390_handle_damage("invalid control registers.");
	} else {
#ifdef CONFIG_64BIT
		asm volatile(
			" lctlg 0,15,0(%0)"
			: : "a" (&S390_lowcore.cregs_save_area));
#else
		asm volatile(
			" lctl 0,15,0(%0)"
			: : "a" (&S390_lowcore.cregs_save_area));
#endif
	}
	/*
	 * We don't even try to revalidate the TOD register, since we simply
	 * can't write something sensible into that register.
	 */
#ifdef CONFIG_64BIT
	/*
	 * See if we can revalidate the TOD programmable register with its
	 * old contents (should be zero) otherwise set it to zero.
	 */
	if (!mci->pr)
		asm volatile(
			" sr 0,0\n"
			" sckpf"
			: : : "0", "cc");
	else
		asm volatile(
			" l 0,0(%0)\n"
			" sckpf"
			: : "a" (&S390_lowcore.tod_progreg_save_area)
			: "0", "cc");
#endif
	/*
	 * Revalidate clock comparator register: store the current TOD
	 * clock and load it into the clock comparator, i.e. set the
	 * comparator to "now".
	 */
	asm volatile(
		" stck 0(%1)\n"
		" sckc 0(%1)"
		: "=m" (tmpclock) : "a" (&(tmpclock)) : "cc", "memory");

	/* Check if old PSW is valid */
	if (!mci->wp)
		/*
		 * Can't tell if we come from user or kernel mode
		 * -> stopping machine.
		 */
		s390_handle_damage("old psw invalid.");

	/*
	 * PSW mask/key, program mask/cc or instruction address invalid:
	 * the interrupted context cannot be resumed reliably, terminate
	 * the task.
	 */
	if (!mci->ms || !mci->pm || !mci->ia)
		kill_task = 1;

	return kill_task;
}

/*
 * Rate limit for retried instruction-processing-damage (IPD) machine
 * checks: give up after MAX_IPD_COUNT occurrences within MAX_IPD_TIME.
 */
#define MAX_IPD_COUNT	29
#define MAX_IPD_TIME	(5 * 60 * USEC_PER_SEC) /* 5 minutes */

/* Bit numbers in the external damage code stored in the lowcore */
#define ED_STP_ISLAND	6	/* External damage STP island check */
#define ED_STP_SYNC	7	/* External damage STP sync check */
#define ED_ETR_SYNC	12	/* External damage ETR sync check */
#define ED_ETR_SWITCH	13	/* External damage ETR switch to local */

/*
 * machine check handler.
 */
/*
 * First-level machine check handler, called from the low-level machine
 * check entry code. Decodes the machine-check interruption code from
 * the lowcore, handles/escalates the damage conditions and records
 * deferred work in the per-cpu mcck_struct for s390_handle_mcck().
 * Unrecoverable conditions end up in s390_handle_damage(), which does
 * not return.
 */
void notrace s390_do_machine_check(struct pt_regs *regs)
{
	static int ipd_count;
	static DEFINE_SPINLOCK(ipd_lock);
	static unsigned long long last_ipd;
	struct mcck_struct *mcck;
	unsigned long long tmp;
	struct mci *mci;
	int umode;

	/* lockdep cannot cope with this context; switch it off */
	lockdep_off();
	s390_idle_check();

	mci = (struct mci *) &S390_lowcore.mcck_interruption_code;
	mcck = &__get_cpu_var(cpu_mcck);
	umode = user_mode(regs);

	if (mci->sd) {
		/* System damage -> stopping machine */
		s390_handle_damage("received system damage machine check.");
	}
	if (mci->pd) {
		if (mci->b) {
			/* Processing backup -> verify if we can survive this */
			u64 z_mcic, o_mcic, t_mcic;
			/*
			 * z_mcic: MCIC bits that must be zero,
			 * o_mcic: MCIC bits that must be one
			 * for the machine check to be survivable.
			 */
#ifdef CONFIG_64BIT
			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
			o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
				  1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 |
				  1ULL<<16);
#else
			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<57 | 1ULL<<50 |
				  1ULL<<29);
			o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
				  1ULL<<30 | 1ULL<<20 | 1ULL<<17 | 1ULL<<16);
#endif
			t_mcic = *(u64 *)mci;

			if (((t_mcic & z_mcic) != 0) ||
			    ((t_mcic & o_mcic) != o_mcic)) {
				s390_handle_damage("processing backup machine "
						   "check with damage.");
			}

			/*
			 * Nullifying exigent condition, therefore we might
			 * retry this instruction. Rate-limit the retries:
			 * stop the machine after MAX_IPD_COUNT occurrences
			 * within MAX_IPD_TIME.
			 */
			spin_lock(&ipd_lock);
			tmp = get_clock();
			/* >> 12 converts the TOD clock delta to microseconds */
			if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME)
				ipd_count++;
			else
				ipd_count = 1;
			last_ipd = tmp;
			if (ipd_count == MAX_IPD_COUNT)
				s390_handle_damage("too many ipd retries.");
			spin_unlock(&ipd_lock);
		} else {
			/* Processing damage -> stopping machine */
			s390_handle_damage("received instruction processing "
					   "damage machine check.");
		}
	}
	if (s390_revalidate_registers(mci)) {
		if (umode) {
			/*
			 * Couldn't restore all register contents while in
			 * user mode -> mark task for termination.
			 */
			mcck->kill_task = 1;
			mcck->mcck_code = *(unsigned long long *) mci;
			set_thread_flag(TIF_MCCK_PENDING);
		} else {
			/*
			 * Couldn't restore all register contents while in
			 * kernel mode -> stopping machine.
			 */
			s390_handle_damage("unable to revalidate registers.");
		}
	}
	if (mci->cd) {
		/* Timing facility damage */
		s390_handle_damage("TOD clock damaged");
	}
	if (mci->ed && mci->ec) {
		/* External damage, code in lowcore is valid */
		if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC))
			etr_sync_check();
		if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH))
			etr_switch_to_local();
		if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
			stp_sync_check();
		if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
			stp_island_check();
	}
	if (mci->se)
		/* Storage error uncorrected */
		s390_handle_damage("received storage error uncorrected "
				   "machine check.");
	if (mci->ke)
		/* Storage key-error uncorrected */
		s390_handle_damage("received storage key-error uncorrected "
				   "machine check.");
	if (mci->ds && mci->fa)
		/* Storage degradation, failing storage address is valid */
		s390_handle_damage("received storage degradation machine "
				   "check.");
	if (mci->cp) {
		/* Channel report word pending; defer to s390_handle_mcck */
		mcck->channel_report = 1;
		set_thread_flag(TIF_MCCK_PENDING);
	}
	if (mci->w) {
		/* Warning pending; defer to s390_handle_mcck */
		mcck->warning = 1;
		set_thread_flag(TIF_MCCK_PENDING);
	}
	lockdep_on();
}

/*
 * Enable the machine check subclasses we can handle by setting the
 * corresponding bits in control register 14.
 */
static int __init machine_check_init(void)
{
	ctl_set_bit(14, 25);	/* enable external damage MCH */
	ctl_set_bit(14, 27);	/* enable system recovery MCH */
	ctl_set_bit(14, 24);	/* enable warning MCH */
	return 0;
}
arch_initcall(machine_check_init);