1 /* 2 * Machine check handler 3 * 4 * Copyright IBM Corp. 2000,2009 5 * Author(s): Ingo Adlung <adlung@de.ibm.com>, 6 * Martin Schwidefsky <schwidefsky@de.ibm.com>, 7 * Cornelia Huck <cornelia.huck@de.ibm.com>, 8 * Heiko Carstens <heiko.carstens@de.ibm.com>, 9 */ 10 11 #include <linux/init.h> 12 #include <linux/errno.h> 13 #include <linux/hardirq.h> 14 #include <linux/time.h> 15 #include <linux/module.h> 16 #include <asm/lowcore.h> 17 #include <asm/smp.h> 18 #include <asm/etr.h> 19 #include <asm/cputime.h> 20 #include <asm/nmi.h> 21 #include <asm/crw.h> 22 23 struct mcck_struct { 24 int kill_task; 25 int channel_report; 26 int warning; 27 unsigned long long mcck_code; 28 }; 29 30 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); 31 32 static NORET_TYPE void s390_handle_damage(char *msg) 33 { 34 smp_send_stop(); 35 disabled_wait((unsigned long) __builtin_return_address(0)); 36 while (1); 37 } 38 39 /* 40 * Main machine check handler function. Will be called with interrupts enabled 41 * or disabled and machine checks enabled or disabled. 42 */ 43 void s390_handle_mcck(void) 44 { 45 unsigned long flags; 46 struct mcck_struct mcck; 47 48 /* 49 * Disable machine checks and get the current state of accumulated 50 * machine checks. Afterwards delete the old state and enable machine 51 * checks again. 52 */ 53 local_irq_save(flags); 54 local_mcck_disable(); 55 mcck = __get_cpu_var(cpu_mcck); 56 memset(&__get_cpu_var(cpu_mcck), 0, sizeof(struct mcck_struct)); 57 clear_thread_flag(TIF_MCCK_PENDING); 58 local_mcck_enable(); 59 local_irq_restore(flags); 60 61 if (mcck.channel_report) 62 crw_handle_channel_report(); 63 /* 64 * A warning may remain for a prolonged period on the bare iron. 65 * (actually until the machine is powered off, or the problem is gone) 66 * So we just stop listening for the WARNING MCH and avoid continuously 67 * being interrupted. One caveat is however, that we must do this per 68 * processor and cannot use the smp version of ctl_clear_bit(). 69 * On VM we only get one interrupt per virtally presented machinecheck. 70 * Though one suffices, we may get one interrupt per (virtual) cpu. 71 */ 72 if (mcck.warning) { /* WARNING pending ? */ 73 static int mchchk_wng_posted = 0; 74 75 /* Use single cpu clear, as we cannot handle smp here. */ 76 __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ 77 if (xchg(&mchchk_wng_posted, 1) == 0) 78 kill_cad_pid(SIGPWR, 1); 79 } 80 if (mcck.kill_task) { 81 local_irq_enable(); 82 printk(KERN_EMERG "mcck: Terminating task because of machine " 83 "malfunction (code 0x%016llx).\n", mcck.mcck_code); 84 printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", 85 current->comm, current->pid); 86 do_exit(SIGSEGV); 87 } 88 } 89 EXPORT_SYMBOL_GPL(s390_handle_mcck); 90 91 /* 92 * returns 0 if all registers could be validated 93 * returns 1 otherwise 94 */ 95 static int notrace s390_revalidate_registers(struct mci *mci) 96 { 97 int kill_task; 98 u64 zero; 99 void *fpt_save_area, *fpt_creg_save_area; 100 101 kill_task = 0; 102 zero = 0; 103 104 if (!mci->gr) { 105 /* 106 * General purpose registers couldn't be restored and have 107 * unknown contents. Process needs to be terminated. 108 */ 109 kill_task = 1; 110 } 111 if (!mci->fp) { 112 /* 113 * Floating point registers can't be restored and 114 * therefore the process needs to be terminated. 115 */ 116 kill_task = 1; 117 } 118 #ifndef CONFIG_64BIT 119 asm volatile( 120 " ld 0,0(%0)\n" 121 " ld 2,8(%0)\n" 122 " ld 4,16(%0)\n" 123 " ld 6,24(%0)" 124 : : "a" (&S390_lowcore.floating_pt_save_area)); 125 #endif 126 127 if (MACHINE_HAS_IEEE) { 128 #ifdef CONFIG_64BIT 129 fpt_save_area = &S390_lowcore.floating_pt_save_area; 130 fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area; 131 #else 132 fpt_save_area = (void *) S390_lowcore.extended_save_area_addr; 133 fpt_creg_save_area = fpt_save_area + 128; 134 #endif 135 if (!mci->fc) { 136 /* 137 * Floating point control register can't be restored. 138 * Task will be terminated. 139 */ 140 asm volatile("lfpc 0(%0)" : : "a" (&zero), "m" (zero)); 141 kill_task = 1; 142 143 } else 144 asm volatile("lfpc 0(%0)" : : "a" (fpt_creg_save_area)); 145 146 asm volatile( 147 " ld 0,0(%0)\n" 148 " ld 1,8(%0)\n" 149 " ld 2,16(%0)\n" 150 " ld 3,24(%0)\n" 151 " ld 4,32(%0)\n" 152 " ld 5,40(%0)\n" 153 " ld 6,48(%0)\n" 154 " ld 7,56(%0)\n" 155 " ld 8,64(%0)\n" 156 " ld 9,72(%0)\n" 157 " ld 10,80(%0)\n" 158 " ld 11,88(%0)\n" 159 " ld 12,96(%0)\n" 160 " ld 13,104(%0)\n" 161 " ld 14,112(%0)\n" 162 " ld 15,120(%0)\n" 163 : : "a" (fpt_save_area)); 164 } 165 /* Revalidate access registers */ 166 asm volatile( 167 " lam 0,15,0(%0)" 168 : : "a" (&S390_lowcore.access_regs_save_area)); 169 if (!mci->ar) { 170 /* 171 * Access registers have unknown contents. 172 * Terminating task. 173 */ 174 kill_task = 1; 175 } 176 /* Revalidate control registers */ 177 if (!mci->cr) { 178 /* 179 * Control registers have unknown contents. 180 * Can't recover and therefore stopping machine. 181 */ 182 s390_handle_damage("invalid control registers."); 183 } else { 184 #ifdef CONFIG_64BIT 185 asm volatile( 186 " lctlg 0,15,0(%0)" 187 : : "a" (&S390_lowcore.cregs_save_area)); 188 #else 189 asm volatile( 190 " lctl 0,15,0(%0)" 191 : : "a" (&S390_lowcore.cregs_save_area)); 192 #endif 193 } 194 /* 195 * We don't even try to revalidate the TOD register, since we simply 196 * can't write something sensible into that register. 197 */ 198 #ifdef CONFIG_64BIT 199 /* 200 * See if we can revalidate the TOD programmable register with its 201 * old contents (should be zero) otherwise set it to zero. 202 */ 203 if (!mci->pr) 204 asm volatile( 205 " sr 0,0\n" 206 " sckpf" 207 : : : "0", "cc"); 208 else 209 asm volatile( 210 " l 0,0(%0)\n" 211 " sckpf" 212 : : "a" (&S390_lowcore.tod_progreg_save_area) 213 : "0", "cc"); 214 #endif 215 /* Revalidate clock comparator register */ 216 if (S390_lowcore.clock_comparator == -1) 217 set_clock_comparator(S390_lowcore.mcck_clock); 218 else 219 set_clock_comparator(S390_lowcore.clock_comparator); 220 /* Check if old PSW is valid */ 221 if (!mci->wp) 222 /* 223 * Can't tell if we come from user or kernel mode 224 * -> stopping machine. 225 */ 226 s390_handle_damage("old psw invalid."); 227 228 if (!mci->ms || !mci->pm || !mci->ia) 229 kill_task = 1; 230 231 return kill_task; 232 } 233 234 #define MAX_IPD_COUNT 29 235 #define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ 236 237 #define ED_STP_ISLAND 6 /* External damage STP island check */ 238 #define ED_STP_SYNC 7 /* External damage STP sync check */ 239 #define ED_ETR_SYNC 12 /* External damage ETR sync check */ 240 #define ED_ETR_SWITCH 13 /* External damage ETR switch to local */ 241 242 /* 243 * machine check handler. 244 */ 245 void notrace s390_do_machine_check(struct pt_regs *regs) 246 { 247 static int ipd_count; 248 static DEFINE_SPINLOCK(ipd_lock); 249 static unsigned long long last_ipd; 250 struct mcck_struct *mcck; 251 unsigned long long tmp; 252 struct mci *mci; 253 int umode; 254 255 nmi_enter(); 256 s390_idle_check(regs, S390_lowcore.mcck_clock, 257 S390_lowcore.mcck_enter_timer); 258 259 mci = (struct mci *) &S390_lowcore.mcck_interruption_code; 260 mcck = &__get_cpu_var(cpu_mcck); 261 umode = user_mode(regs); 262 263 if (mci->sd) { 264 /* System damage -> stopping machine */ 265 s390_handle_damage("received system damage machine check."); 266 } 267 if (mci->pd) { 268 if (mci->b) { 269 /* Processing backup -> verify if we can survive this */ 270 u64 z_mcic, o_mcic, t_mcic; 271 #ifdef CONFIG_64BIT 272 z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29); 273 o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | 274 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | 275 1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 | 276 1ULL<<16); 277 #else 278 z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<57 | 1ULL<<50 | 279 1ULL<<29); 280 o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | 281 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | 282 1ULL<<30 | 1ULL<<20 | 1ULL<<17 | 1ULL<<16); 283 #endif 284 t_mcic = *(u64 *)mci; 285 286 if (((t_mcic & z_mcic) != 0) || 287 ((t_mcic & o_mcic) != o_mcic)) { 288 s390_handle_damage("processing backup machine " 289 "check with damage."); 290 } 291 292 /* 293 * Nullifying exigent condition, therefore we might 294 * retry this instruction. 295 */ 296 spin_lock(&ipd_lock); 297 tmp = get_clock(); 298 if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME) 299 ipd_count++; 300 else 301 ipd_count = 1; 302 last_ipd = tmp; 303 if (ipd_count == MAX_IPD_COUNT) 304 s390_handle_damage("too many ipd retries."); 305 spin_unlock(&ipd_lock); 306 } else { 307 /* Processing damage -> stopping machine */ 308 s390_handle_damage("received instruction processing " 309 "damage machine check."); 310 } 311 } 312 if (s390_revalidate_registers(mci)) { 313 if (umode) { 314 /* 315 * Couldn't restore all register contents while in 316 * user mode -> mark task for termination. 317 */ 318 mcck->kill_task = 1; 319 mcck->mcck_code = *(unsigned long long *) mci; 320 set_thread_flag(TIF_MCCK_PENDING); 321 } else { 322 /* 323 * Couldn't restore all register contents while in 324 * kernel mode -> stopping machine. 325 */ 326 s390_handle_damage("unable to revalidate registers."); 327 } 328 } 329 if (mci->cd) { 330 /* Timing facility damage */ 331 s390_handle_damage("TOD clock damaged"); 332 } 333 if (mci->ed && mci->ec) { 334 /* External damage */ 335 if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC)) 336 etr_sync_check(); 337 if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH)) 338 etr_switch_to_local(); 339 if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) 340 stp_sync_check(); 341 if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) 342 stp_island_check(); 343 } 344 if (mci->se) 345 /* Storage error uncorrected */ 346 s390_handle_damage("received storage error uncorrected " 347 "machine check."); 348 if (mci->ke) 349 /* Storage key-error uncorrected */ 350 s390_handle_damage("received storage key-error uncorrected " 351 "machine check."); 352 if (mci->ds && mci->fa) 353 /* Storage degradation */ 354 s390_handle_damage("received storage degradation machine " 355 "check."); 356 if (mci->cp) { 357 /* Channel report word pending */ 358 mcck->channel_report = 1; 359 set_thread_flag(TIF_MCCK_PENDING); 360 } 361 if (mci->w) { 362 /* Warning pending */ 363 mcck->warning = 1; 364 set_thread_flag(TIF_MCCK_PENDING); 365 } 366 nmi_exit(); 367 } 368 369 static int __init machine_check_init(void) 370 { 371 ctl_set_bit(14, 25); /* enable external damage MCH */ 372 ctl_set_bit(14, 27); /* enable system recovery MCH */ 373 ctl_set_bit(14, 24); /* enable warning MCH */ 374 return 0; 375 } 376 arch_initcall(machine_check_init); 377