1 /* 2 * APEI Generic Hardware Error Source support 3 * 4 * Generic Hardware Error Source provides a way to report platform 5 * hardware errors (such as that from chipset). It works in so called 6 * "Firmware First" mode, that is, hardware errors are reported to 7 * firmware firstly, then reported to Linux by firmware. This way, 8 * some non-standard hardware error registers or non-standard hardware 9 * link can be checked by firmware to produce more hardware error 10 * information for Linux. 11 * 12 * For more information about Generic Hardware Error Source, please 13 * refer to ACPI Specification version 4.0, section 17.3.2.6 14 * 15 * Now, only SCI notification type and memory errors are 16 * supported. More notification type and hardware error type will be 17 * added later. 18 * 19 * Copyright 2010 Intel Corp. 20 * Author: Huang Ying <ying.huang@intel.com> 21 * 22 * This program is free software; you can redistribute it and/or 23 * modify it under the terms of the GNU General Public License version 24 * 2 as published by the Free Software Foundation; 25 * 26 * This program is distributed in the hope that it will be useful, 27 * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 * GNU General Public License for more details. 30 * 31 * You should have received a copy of the GNU General Public License 32 * along with this program; if not, write to the Free Software 33 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 34 */ 35 36 #include <linux/kernel.h> 37 #include <linux/module.h> 38 #include <linux/init.h> 39 #include <linux/acpi.h> 40 #include <linux/io.h> 41 #include <linux/interrupt.h> 42 #include <linux/cper.h> 43 #include <linux/kdebug.h> 44 #include <linux/platform_device.h> 45 #include <linux/mutex.h> 46 #include <acpi/apei.h> 47 #include <acpi/atomicio.h> 48 #include <acpi/hed.h> 49 #include <asm/mce.h> 50 51 #include "apei-internal.h" 52 53 #define GHES_PFX "GHES: " 54 55 #define GHES_ESTATUS_MAX_SIZE 65536 56 57 /* 58 * One struct ghes is created for each generic hardware error 59 * source. 60 * 61 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI 62 * handler. Handler for one generic hardware error source is only 63 * triggered after the previous one is done. So handler can uses 64 * struct ghes without locking. 65 * 66 * estatus: memory buffer for error status block, allocated during 67 * HEST parsing. 68 */ 69 #define GHES_TO_CLEAR 0x0001 70 71 struct ghes { 72 struct acpi_hest_generic *generic; 73 struct acpi_hest_generic_status *estatus; 74 struct list_head list; 75 u64 buffer_paddr; 76 unsigned long flags; 77 }; 78 79 /* 80 * Error source lists, one list for each notification method. The 81 * members in lists are struct ghes. 82 * 83 * The list members are only added in HEST parsing and deleted during 84 * module_exit, that is, single-threaded. So no lock is needed for 85 * that. 86 * 87 * But the mutual exclusion is needed between members adding/deleting 88 * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is 89 * used for that. 90 */ 91 static LIST_HEAD(ghes_sci); 92 static DEFINE_MUTEX(ghes_list_mutex); 93 94 static struct ghes *ghes_new(struct acpi_hest_generic *generic) 95 { 96 struct ghes *ghes; 97 unsigned int error_block_length; 98 int rc; 99 100 ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); 101 if (!ghes) 102 return ERR_PTR(-ENOMEM); 103 ghes->generic = generic; 104 INIT_LIST_HEAD(&ghes->list); 105 rc = acpi_pre_map_gar(&generic->error_status_address); 106 if (rc) 107 goto err_free; 108 error_block_length = generic->error_block_length; 109 if (error_block_length > GHES_ESTATUS_MAX_SIZE) { 110 pr_warning(FW_WARN GHES_PFX 111 "Error status block length is too long: %u for " 112 "generic hardware error source: %d.\n", 113 error_block_length, generic->header.source_id); 114 error_block_length = GHES_ESTATUS_MAX_SIZE; 115 } 116 ghes->estatus = kmalloc(error_block_length, GFP_KERNEL); 117 if (!ghes->estatus) { 118 rc = -ENOMEM; 119 goto err_unmap; 120 } 121 122 return ghes; 123 124 err_unmap: 125 acpi_post_unmap_gar(&generic->error_status_address); 126 err_free: 127 kfree(ghes); 128 return ERR_PTR(rc); 129 } 130 131 static void ghes_fini(struct ghes *ghes) 132 { 133 kfree(ghes->estatus); 134 acpi_post_unmap_gar(&ghes->generic->error_status_address); 135 } 136 137 enum { 138 GHES_SEV_NO = 0x0, 139 GHES_SEV_CORRECTED = 0x1, 140 GHES_SEV_RECOVERABLE = 0x2, 141 GHES_SEV_PANIC = 0x3, 142 }; 143 144 static inline int ghes_severity(int severity) 145 { 146 switch (severity) { 147 case CPER_SEV_INFORMATIONAL: 148 return GHES_SEV_NO; 149 case CPER_SEV_CORRECTED: 150 return GHES_SEV_CORRECTED; 151 case CPER_SEV_RECOVERABLE: 152 return GHES_SEV_RECOVERABLE; 153 case CPER_SEV_FATAL: 154 return GHES_SEV_PANIC; 155 default: 156 /* Unkown, go panic */ 157 return GHES_SEV_PANIC; 158 } 159 } 160 161 /* SCI handler run in work queue, so ioremap can be used here */ 162 static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, 163 int from_phys) 164 { 165 void *vaddr; 166 167 vaddr = ioremap_cache(paddr, len); 168 if (!vaddr) 169 return -ENOMEM; 170 if (from_phys) 171 memcpy(buffer, vaddr, len); 172 else 173 memcpy(vaddr, buffer, len); 174 iounmap(vaddr); 175 176 return 0; 177 } 178 179 static int ghes_read_estatus(struct ghes *ghes, int silent) 180 { 181 struct acpi_hest_generic *g = ghes->generic; 182 u64 buf_paddr; 183 u32 len; 184 int rc; 185 186 rc = acpi_atomic_read(&buf_paddr, &g->error_status_address); 187 if (rc) { 188 if (!silent && printk_ratelimit()) 189 pr_warning(FW_WARN GHES_PFX 190 "Failed to read error status block address for hardware error source: %d.\n", 191 g->header.source_id); 192 return -EIO; 193 } 194 if (!buf_paddr) 195 return -ENOENT; 196 197 rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, 198 sizeof(*ghes->estatus), 1); 199 if (rc) 200 return rc; 201 if (!ghes->estatus->block_status) 202 return -ENOENT; 203 204 ghes->buffer_paddr = buf_paddr; 205 ghes->flags |= GHES_TO_CLEAR; 206 207 rc = -EIO; 208 len = apei_estatus_len(ghes->estatus); 209 if (len < sizeof(*ghes->estatus)) 210 goto err_read_block; 211 if (len > ghes->generic->error_block_length) 212 goto err_read_block; 213 if (apei_estatus_check_header(ghes->estatus)) 214 goto err_read_block; 215 rc = ghes_copy_tofrom_phys(ghes->estatus + 1, 216 buf_paddr + sizeof(*ghes->estatus), 217 len - sizeof(*ghes->estatus), 1); 218 if (rc) 219 return rc; 220 if (apei_estatus_check(ghes->estatus)) 221 goto err_read_block; 222 rc = 0; 223 224 err_read_block: 225 if (rc && !silent) 226 pr_warning(FW_WARN GHES_PFX 227 "Failed to read error status block!\n"); 228 return rc; 229 } 230 231 static void ghes_clear_estatus(struct ghes *ghes) 232 { 233 ghes->estatus->block_status = 0; 234 if (!(ghes->flags & GHES_TO_CLEAR)) 235 return; 236 ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr, 237 sizeof(ghes->estatus->block_status), 0); 238 ghes->flags &= ~GHES_TO_CLEAR; 239 } 240 241 static void ghes_do_proc(struct ghes *ghes) 242 { 243 int sev, processed = 0; 244 struct acpi_hest_generic_data *gdata; 245 246 sev = ghes_severity(ghes->estatus->error_severity); 247 apei_estatus_for_each_section(ghes->estatus, gdata) { 248 #ifdef CONFIG_X86_MCE 249 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, 250 CPER_SEC_PLATFORM_MEM)) { 251 apei_mce_report_mem_error( 252 sev == GHES_SEV_CORRECTED, 253 (struct cper_sec_mem_err *)(gdata+1)); 254 processed = 1; 255 } 256 #endif 257 } 258 259 if (!processed && printk_ratelimit()) 260 pr_warning(GHES_PFX 261 "Unknown error record from generic hardware error source: %d\n", 262 ghes->generic->header.source_id); 263 } 264 265 static int ghes_proc(struct ghes *ghes) 266 { 267 int rc; 268 269 rc = ghes_read_estatus(ghes, 0); 270 if (rc) 271 goto out; 272 ghes_do_proc(ghes); 273 274 out: 275 ghes_clear_estatus(ghes); 276 return 0; 277 } 278 279 static int ghes_notify_sci(struct notifier_block *this, 280 unsigned long event, void *data) 281 { 282 struct ghes *ghes; 283 int ret = NOTIFY_DONE; 284 285 rcu_read_lock(); 286 list_for_each_entry_rcu(ghes, &ghes_sci, list) { 287 if (!ghes_proc(ghes)) 288 ret = NOTIFY_OK; 289 } 290 rcu_read_unlock(); 291 292 return ret; 293 } 294 295 static struct notifier_block ghes_notifier_sci = { 296 .notifier_call = ghes_notify_sci, 297 }; 298 299 static int __devinit ghes_probe(struct platform_device *ghes_dev) 300 { 301 struct acpi_hest_generic *generic; 302 struct ghes *ghes = NULL; 303 int rc = -EINVAL; 304 305 generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; 306 if (!generic->enabled) 307 return -ENODEV; 308 309 if (generic->error_block_length < 310 sizeof(struct acpi_hest_generic_status)) { 311 pr_warning(FW_BUG GHES_PFX 312 "Invalid error block length: %u for generic hardware error source: %d\n", 313 generic->error_block_length, 314 generic->header.source_id); 315 goto err; 316 } 317 if (generic->records_to_preallocate == 0) { 318 pr_warning(FW_BUG GHES_PFX 319 "Invalid records to preallocate: %u for generic hardware error source: %d\n", 320 generic->records_to_preallocate, 321 generic->header.source_id); 322 goto err; 323 } 324 ghes = ghes_new(generic); 325 if (IS_ERR(ghes)) { 326 rc = PTR_ERR(ghes); 327 ghes = NULL; 328 goto err; 329 } 330 if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) { 331 mutex_lock(&ghes_list_mutex); 332 if (list_empty(&ghes_sci)) 333 register_acpi_hed_notifier(&ghes_notifier_sci); 334 list_add_rcu(&ghes->list, &ghes_sci); 335 mutex_unlock(&ghes_list_mutex); 336 } else { 337 unsigned char *notify = NULL; 338 339 switch (generic->notify.type) { 340 case ACPI_HEST_NOTIFY_POLLED: 341 notify = "POLL"; 342 break; 343 case ACPI_HEST_NOTIFY_EXTERNAL: 344 case ACPI_HEST_NOTIFY_LOCAL: 345 notify = "IRQ"; 346 break; 347 case ACPI_HEST_NOTIFY_NMI: 348 notify = "NMI"; 349 break; 350 } 351 if (notify) { 352 pr_warning(GHES_PFX 353 "Generic hardware error source: %d notified via %s is not supported!\n", 354 generic->header.source_id, notify); 355 } else { 356 pr_warning(FW_WARN GHES_PFX 357 "Unknown notification type: %u for generic hardware error source: %d\n", 358 generic->notify.type, generic->header.source_id); 359 } 360 rc = -ENODEV; 361 goto err; 362 } 363 platform_set_drvdata(ghes_dev, ghes); 364 365 return 0; 366 err: 367 if (ghes) { 368 ghes_fini(ghes); 369 kfree(ghes); 370 } 371 return rc; 372 } 373 374 static int __devexit ghes_remove(struct platform_device *ghes_dev) 375 { 376 struct ghes *ghes; 377 struct acpi_hest_generic *generic; 378 379 ghes = platform_get_drvdata(ghes_dev); 380 generic = ghes->generic; 381 382 switch (generic->notify.type) { 383 case ACPI_HEST_NOTIFY_SCI: 384 mutex_lock(&ghes_list_mutex); 385 list_del_rcu(&ghes->list); 386 if (list_empty(&ghes_sci)) 387 unregister_acpi_hed_notifier(&ghes_notifier_sci); 388 mutex_unlock(&ghes_list_mutex); 389 break; 390 default: 391 BUG(); 392 break; 393 } 394 395 synchronize_rcu(); 396 ghes_fini(ghes); 397 kfree(ghes); 398 399 platform_set_drvdata(ghes_dev, NULL); 400 401 return 0; 402 } 403 404 static struct platform_driver ghes_platform_driver = { 405 .driver = { 406 .name = "GHES", 407 .owner = THIS_MODULE, 408 }, 409 .probe = ghes_probe, 410 .remove = ghes_remove, 411 }; 412 413 static int __init ghes_init(void) 414 { 415 if (acpi_disabled) 416 return -ENODEV; 417 418 if (hest_disable) { 419 pr_info(GHES_PFX "HEST is not enabled!\n"); 420 return -EINVAL; 421 } 422 423 return platform_driver_register(&ghes_platform_driver); 424 } 425 426 static void __exit ghes_exit(void) 427 { 428 platform_driver_unregister(&ghes_platform_driver); 429 } 430 431 module_init(ghes_init); 432 module_exit(ghes_exit); 433 434 MODULE_AUTHOR("Huang Ying"); 435 MODULE_DESCRIPTION("APEI Generic Hardware Error Source support"); 436 MODULE_LICENSE("GPL"); 437 MODULE_ALIAS("platform:GHES"); 438