/*
 * APEI Generic Hardware Error Source support
 *
 * Generic Hardware Error Source provides a way to report platform
 * hardware errors (such as those from the chipset). It works in the
 * so-called "Firmware First" mode: hardware errors are reported to
 * the firmware first, and the firmware then reports them to Linux.
 * This way, some non-standard hardware error registers or non-standard
 * hardware links can be checked by the firmware to produce more
 * hardware error information for Linux.
 *
 * For more information about Generic Hardware Error Source, please
 * refer to ACPI Specification version 4.0, section 17.3.2.6
 *
 * Copyright 2010 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/timer.h>
#include <linux/cper.h>
#include <linux/kdebug.h>
#include <linux/platform_device.h>
#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <acpi/apei.h>
#include <acpi/atomicio.h>
#include <acpi/hed.h>
#include <asm/mce.h>
#include <asm/tlbflush.h>

#include "apei-internal.h"

#define GHES_PFX                "GHES: "

#define GHES_ESTATUS_MAX_SIZE   65536

/*
 * One struct ghes is created for each generic hardware error source.
 * It provides the context for the APEI hardware error timer/IRQ/SCI/NMI
 * handlers.
 *
 * estatus: memory buffer for the error status block, allocated during
 * HEST parsing.
 */
#define GHES_TO_CLEAR           0x0001
#define GHES_EXITING            0x0002

struct ghes {
        struct acpi_hest_generic *generic;
        struct acpi_hest_generic_status *estatus;
        u64 buffer_paddr;
        unsigned long flags;
        union {
                struct list_head list;
                struct timer_list timer;
                unsigned int irq;
        };
};

static int ghes_panic_timeout __read_mostly = 30;

/*
 * All error sources notified via SCI share one notifier function, so
 * they need to be linked together and checked one by one.  The same
 * applies to NMI.
 *
 * RCU is used for these lists, so ghes_list_mutex is only needed for
 * list modification, not for traversal.
 */
static LIST_HEAD(ghes_sci);
static LIST_HEAD(ghes_nmi);
static DEFINE_MUTEX(ghes_list_mutex);

/*
 * An NMI may be triggered on any CPU, so ghes_nmi_lock is used for
 * mutual exclusion.
 */
static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);

/*
 * The memory area used to transfer hardware error information from the
 * BIOS to Linux can be determined only in an NMI, IRQ or timer handler,
 * but the generic ioremap can not be used in atomic context, so a
 * special version of atomic ioremap is implemented here.
 */
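
/*
 * An NMI can interrupt the IRQ/process path while that path still holds
 * its mapping, so the two contexts must not share a virtual page: each
 * context gets a dedicated page and a dedicated lock below.
 */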

/*
 * Two virtual pages are used, one for the NMI context, the other for
 * the IRQ/PROCESS context.
 */
#define GHES_IOREMAP_PAGES              2
#define GHES_IOREMAP_NMI_PAGE(base)     (base)
#define GHES_IOREMAP_IRQ_PAGE(base)     ((base) + PAGE_SIZE)

/* virtual memory area for atomic ioremap */
static struct vm_struct *ghes_ioremap_area;
/*
 * These two spinlocks are used to prevent the atomic ioremap virtual
 * memory pages from being mapped simultaneously.
 */
static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);

static int ghes_ioremap_init(void)
{
        ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
                                          VM_IOREMAP, VMALLOC_START, VMALLOC_END);
        if (!ghes_ioremap_area) {
                pr_err(GHES_PFX
                       "Failed to allocate virtual memory area for atomic ioremap.\n");
                return -ENOMEM;
        }

        return 0;
}

static void ghes_ioremap_exit(void)
{
        free_vm_area(ghes_ioremap_area);
}

static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
{
        unsigned long vaddr;

        vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
        ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
                           pfn << PAGE_SHIFT, PAGE_KERNEL);

        return (void __iomem *)vaddr;
}

static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
{
        unsigned long vaddr;

        vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
        ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
                           pfn << PAGE_SHIFT, PAGE_KERNEL);

        return (void __iomem *)vaddr;
}

static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
{
        unsigned long vaddr = (unsigned long __force)vaddr_ptr;
        void *base = ghes_ioremap_area->addr;

        BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
        unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
        __flush_tlb_one(vaddr);
}

static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
{
        unsigned long vaddr = (unsigned long __force)vaddr_ptr;
        void *base = ghes_ioremap_area->addr;

        BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
        unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
        __flush_tlb_one(vaddr);
}

static struct ghes *ghes_new(struct acpi_hest_generic *generic)
{
        struct ghes *ghes;
        unsigned int error_block_length;
        int rc;

        ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
        if (!ghes)
                return ERR_PTR(-ENOMEM);
        ghes->generic = generic;
        rc = acpi_pre_map_gar(&generic->error_status_address);
        if (rc)
                goto err_free;
        error_block_length = generic->error_block_length;
        if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
                pr_warning(FW_WARN GHES_PFX
                           "Error status block length is too long: %u for "
                           "generic hardware error source: %d.\n",
                           error_block_length, generic->header.source_id);
                error_block_length = GHES_ESTATUS_MAX_SIZE;
        }
        ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
        if (!ghes->estatus) {
                rc = -ENOMEM;
                goto err_unmap;
        }

        return ghes;

err_unmap:
        acpi_post_unmap_gar(&generic->error_status_address);
err_free:
        kfree(ghes);
        return ERR_PTR(rc);
}

static void ghes_fini(struct ghes *ghes)
{
        kfree(ghes->estatus);
        acpi_post_unmap_gar(&ghes->generic->error_status_address);
}

enum {
        GHES_SEV_NO = 0x0,
        GHES_SEV_CORRECTED = 0x1,
        GHES_SEV_RECOVERABLE = 0x2,
        GHES_SEV_PANIC = 0x3,
};

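/*
 * ghes_severity() maps the CPER severity of an error record onto the
 * GHES_SEV_* values above, which are ordered so that a plain numeric
 * comparison picks the most severe report (see ghes_notify_nmi()).
 */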
static inline int ghes_severity(int severity)
{
        switch (severity) {
        case CPER_SEV_INFORMATIONAL:
                return GHES_SEV_NO;
        case CPER_SEV_CORRECTED:
                return GHES_SEV_CORRECTED;
        case CPER_SEV_RECOVERABLE:
                return GHES_SEV_RECOVERABLE;
        case CPER_SEV_FATAL:
                return GHES_SEV_PANIC;
        default:
                /* Unknown severity, go panic */
                return GHES_SEV_PANIC;
        }
}

static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
                                  int from_phys)
{
        void __iomem *vaddr;
        unsigned long flags = 0;
        int in_nmi = in_nmi();
        u64 offset;
        u32 trunk;

        while (len > 0) {
                offset = paddr - (paddr & PAGE_MASK);
                if (in_nmi) {
                        raw_spin_lock(&ghes_ioremap_lock_nmi);
                        vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
                } else {
                        spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
                        vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
                }
                trunk = PAGE_SIZE - offset;
                trunk = min(trunk, len);
                if (from_phys)
                        memcpy_fromio(buffer, vaddr + offset, trunk);
                else
                        memcpy_toio(vaddr + offset, buffer, trunk);
                len -= trunk;
                paddr += trunk;
                buffer += trunk;
                if (in_nmi) {
                        ghes_iounmap_nmi(vaddr);
                        raw_spin_unlock(&ghes_ioremap_lock_nmi);
                } else {
                        ghes_iounmap_irq(vaddr);
                        spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
                }
        }
}

static int ghes_read_estatus(struct ghes *ghes, int silent)
{
        struct acpi_hest_generic *g = ghes->generic;
        u64 buf_paddr;
        u32 len;
        int rc;

        rc = acpi_atomic_read(&buf_paddr, &g->error_status_address);
        if (rc) {
                if (!silent && printk_ratelimit())
                        pr_warning(FW_WARN GHES_PFX
                                   "Failed to read error status block address for hardware error source: %d.\n",
                                   g->header.source_id);
                return -EIO;
        }
        if (!buf_paddr)
                return -ENOENT;

        ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
                              sizeof(*ghes->estatus), 1);
        if (!ghes->estatus->block_status)
                return -ENOENT;

        ghes->buffer_paddr = buf_paddr;
        ghes->flags |= GHES_TO_CLEAR;

        rc = -EIO;
        len = apei_estatus_len(ghes->estatus);
        if (len < sizeof(*ghes->estatus))
                goto err_read_block;
        if (len > ghes->generic->error_block_length)
                goto err_read_block;
        if (apei_estatus_check_header(ghes->estatus))
                goto err_read_block;
        ghes_copy_tofrom_phys(ghes->estatus + 1,
                              buf_paddr + sizeof(*ghes->estatus),
                              len - sizeof(*ghes->estatus), 1);
        if (apei_estatus_check(ghes->estatus))
                goto err_read_block;
        rc = 0;

err_read_block:
        if (rc && !silent && printk_ratelimit())
                pr_warning(FW_WARN GHES_PFX
                           "Failed to read error status block!\n");
        return rc;
}

static void ghes_clear_estatus(struct ghes *ghes)
{
        ghes->estatus->block_status = 0;
        if (!(ghes->flags & GHES_TO_CLEAR))
                return;
        ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
                              sizeof(ghes->estatus->block_status), 0);
        ghes->flags &= ~GHES_TO_CLEAR;
}

static void ghes_do_proc(struct ghes *ghes)
{
        int sev, processed = 0;
        struct acpi_hest_generic_data *gdata;

        sev = ghes_severity(ghes->estatus->error_severity);
        apei_estatus_for_each_section(ghes->estatus, gdata) {
#ifdef CONFIG_X86_MCE
                if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
                                 CPER_SEC_PLATFORM_MEM)) {
                        apei_mce_report_mem_error(
                                sev == GHES_SEV_CORRECTED,
                                (struct cper_sec_mem_err *)(gdata + 1));
                        processed = 1;
                }
#endif
        }
}

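/*
 * Print the error status block.  A NULL pfx lets the severity choose the
 * log level (warning for corrected or lower, error otherwise); the NMI
 * path passes KERN_EMERG explicitly before panicking.
 */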
static void ghes_print_estatus(const char *pfx, struct ghes *ghes)
{
        /* Not more than 2 messages every 5 seconds */
        static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2);

        if (pfx == NULL) {
                if (ghes_severity(ghes->estatus->error_severity) <=
                    GHES_SEV_CORRECTED)
                        pfx = KERN_WARNING HW_ERR;
                else
                        pfx = KERN_ERR HW_ERR;
        }
        if (__ratelimit(&ratelimit)) {
                printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
                       pfx, ghes->generic->header.source_id);
                apei_estatus_print(pfx, ghes->estatus);
        }
}

static int ghes_proc(struct ghes *ghes)
{
        int rc;

        rc = ghes_read_estatus(ghes, 0);
        if (rc)
                goto out;
        ghes_print_estatus(NULL, ghes);
        ghes_do_proc(ghes);

out:
        ghes_clear_estatus(ghes);
        return rc;
}

static void ghes_add_timer(struct ghes *ghes)
{
        struct acpi_hest_generic *g = ghes->generic;
        unsigned long expire;

        if (!g->notify.poll_interval) {
                pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n",
                           g->header.source_id);
                return;
        }
        expire = jiffies + msecs_to_jiffies(g->notify.poll_interval);
        ghes->timer.expires = round_jiffies_relative(expire);
        add_timer(&ghes->timer);
}

static void ghes_poll_func(unsigned long data)
{
        struct ghes *ghes = (void *)data;

        ghes_proc(ghes);
        if (!(ghes->flags & GHES_EXITING))
                ghes_add_timer(ghes);
}

static irqreturn_t ghes_irq_func(int irq, void *data)
{
        struct ghes *ghes = data;
        int rc;

        rc = ghes_proc(ghes);
        if (rc)
                return IRQ_NONE;

        return IRQ_HANDLED;
}

static int ghes_notify_sci(struct notifier_block *this,
                           unsigned long event, void *data)
{
        struct ghes *ghes;
        int ret = NOTIFY_DONE;

        rcu_read_lock();
        list_for_each_entry_rcu(ghes, &ghes_sci, list) {
                if (!ghes_proc(ghes))
                        ret = NOTIFY_OK;
        }
        rcu_read_unlock();

        return ret;
}

static int ghes_notify_nmi(struct notifier_block *this,
                           unsigned long cmd, void *data)
{
        struct ghes *ghes, *ghes_global = NULL;
        int sev, sev_global = -1;
        int ret = NOTIFY_DONE;

        if (cmd != DIE_NMI)
                return ret;

        raw_spin_lock(&ghes_nmi_lock);
        list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
                if (ghes_read_estatus(ghes, 1)) {
                        ghes_clear_estatus(ghes);
                        continue;
                }
                sev = ghes_severity(ghes->estatus->error_severity);
                if (sev > sev_global) {
                        sev_global = sev;
                        ghes_global = ghes;
                }
                ret = NOTIFY_STOP;
        }

        if (ret == NOTIFY_DONE)
                goto out;

        if (sev_global >= GHES_SEV_PANIC) {
                oops_begin();
                ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global);
                /* reboot to log the error! */
                if (panic_timeout == 0)
                        panic_timeout = ghes_panic_timeout;
                panic("Fatal hardware error!");
        }

        list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
                if (!(ghes->flags & GHES_TO_CLEAR))
                        continue;
                /* Do not print estatus because printk is not NMI safe */
                ghes_do_proc(ghes);
                ghes_clear_estatus(ghes);
        }

out:
        raw_spin_unlock(&ghes_nmi_lock);
        return ret;
}

static struct notifier_block ghes_notifier_sci = {
        .notifier_call = ghes_notify_sci,
};

static struct notifier_block ghes_notifier_nmi = {
        .notifier_call = ghes_notify_nmi,
};

static int __devinit ghes_probe(struct platform_device *ghes_dev)
{
        struct acpi_hest_generic *generic;
        struct ghes *ghes = NULL;
        int rc = -EINVAL;

        generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
        if (!generic->enabled)
                return -ENODEV;

        switch (generic->notify.type) {
        case ACPI_HEST_NOTIFY_POLLED:
        case ACPI_HEST_NOTIFY_EXTERNAL:
        case ACPI_HEST_NOTIFY_SCI:
        case ACPI_HEST_NOTIFY_NMI:
                break;
        case ACPI_HEST_NOTIFY_LOCAL:
                pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
                           generic->header.source_id);
                goto err;
        default:
                pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n",
                           generic->notify.type, generic->header.source_id);
                goto err;
        }

        rc = -EIO;
        if (generic->error_block_length <
            sizeof(struct acpi_hest_generic_status)) {
                pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n",
                           generic->error_block_length,
                           generic->header.source_id);
                goto err;
        }
        ghes = ghes_new(generic);
        if (IS_ERR(ghes)) {
                rc = PTR_ERR(ghes);
                ghes = NULL;
                goto err;
        }
        switch (generic->notify.type) {
        case ACPI_HEST_NOTIFY_POLLED:
                ghes->timer.function = ghes_poll_func;
                ghes->timer.data = (unsigned long)ghes;
                init_timer_deferrable(&ghes->timer);
                ghes_add_timer(ghes);
                break;
        case ACPI_HEST_NOTIFY_EXTERNAL:
                /* External interrupt vector is GSI */
                if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
                        pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
                               generic->header.source_id);
                        goto err;
                }
                if (request_irq(ghes->irq, ghes_irq_func,
                                0, "GHES IRQ", ghes)) {
                        pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
                               generic->header.source_id);
                        goto err;
                }
                break;
        case ACPI_HEST_NOTIFY_SCI:
                mutex_lock(&ghes_list_mutex);
                if (list_empty(&ghes_sci))
                        register_acpi_hed_notifier(&ghes_notifier_sci);
                list_add_rcu(&ghes->list, &ghes_sci);
                mutex_unlock(&ghes_list_mutex);
                break;
        case ACPI_HEST_NOTIFY_NMI:
                mutex_lock(&ghes_list_mutex);
                if (list_empty(&ghes_nmi))
                        register_die_notifier(&ghes_notifier_nmi);
                list_add_rcu(&ghes->list, &ghes_nmi);
                mutex_unlock(&ghes_list_mutex);
                break;
        default:
                BUG();
        }
        platform_set_drvdata(ghes_dev, ghes);

        return 0;
err:
        if (ghes) {
                ghes_fini(ghes);
                kfree(ghes);
        }
        return rc;
}

static int __devexit ghes_remove(struct platform_device *ghes_dev)
{
        struct ghes *ghes;
        struct acpi_hest_generic *generic;

        ghes = platform_get_drvdata(ghes_dev);
        generic = ghes->generic;

        ghes->flags |= GHES_EXITING;
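        /*
         * GHES_EXITING keeps ghes_poll_func() from re-arming the poll
         * timer while this error source is being torn down.
         */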
        switch (generic->notify.type) {
        case ACPI_HEST_NOTIFY_POLLED:
                del_timer_sync(&ghes->timer);
                break;
        case ACPI_HEST_NOTIFY_EXTERNAL:
                free_irq(ghes->irq, ghes);
                break;
        case ACPI_HEST_NOTIFY_SCI:
                mutex_lock(&ghes_list_mutex);
                list_del_rcu(&ghes->list);
                if (list_empty(&ghes_sci))
                        unregister_acpi_hed_notifier(&ghes_notifier_sci);
                mutex_unlock(&ghes_list_mutex);
                break;
        case ACPI_HEST_NOTIFY_NMI:
                mutex_lock(&ghes_list_mutex);
                list_del_rcu(&ghes->list);
                if (list_empty(&ghes_nmi))
                        unregister_die_notifier(&ghes_notifier_nmi);
                mutex_unlock(&ghes_list_mutex);
                /*
                 * To synchronize with NMI handler, ghes can only be
                 * freed after NMI handler finishes.
                 */
                synchronize_rcu();
                break;
        default:
                BUG();
                break;
        }

        ghes_fini(ghes);
        kfree(ghes);

        platform_set_drvdata(ghes_dev, NULL);

        return 0;
}

static struct platform_driver ghes_platform_driver = {
        .driver         = {
                .name   = "GHES",
                .owner  = THIS_MODULE,
        },
        .probe          = ghes_probe,
        .remove         = ghes_remove,
};

static int __init ghes_init(void)
{
        int rc;

        if (acpi_disabled)
                return -ENODEV;

        if (hest_disable) {
                pr_info(GHES_PFX "HEST is not enabled!\n");
                return -EINVAL;
        }

        rc = ghes_ioremap_init();
        if (rc)
                goto err;

        rc = platform_driver_register(&ghes_platform_driver);
        if (rc)
                goto err_ioremap_exit;

        return 0;
err_ioremap_exit:
        ghes_ioremap_exit();
err:
        return rc;
}

static void __exit ghes_exit(void)
{
        platform_driver_unregister(&ghes_platform_driver);
        ghes_ioremap_exit();
}

module_init(ghes_init);
module_exit(ghes_exit);

MODULE_AUTHOR("Huang Ying");
MODULE_DESCRIPTION("APEI Generic Hardware Error Source support");
MODULE_LICENSE("GPL");
MODULE_ALIAS("platform:GHES");