1 #include "oem_event_manager.hpp" 2 3 #include "libcper/Cper.h" 4 5 #include "cper.hpp" 6 #include "requester/handler.hpp" 7 #include "requester/request.hpp" 8 9 #include <config.h> 10 #include <libpldm/pldm.h> 11 #include <libpldm/utils.h> 12 #include <systemd/sd-journal.h> 13 14 #include <phosphor-logging/lg2.hpp> 15 #include <xyz/openbmc_project/Logging/Entry/server.hpp> 16 17 #include <algorithm> 18 #include <map> 19 #include <set> 20 #include <sstream> 21 #include <string> 22 #include <unordered_map> 23 24 namespace pldm 25 { 26 namespace oem_ampere 27 { 28 namespace boot_stage = boot::stage; 29 namespace ddr_status = ddr::status; 30 namespace dimm_status = dimm::status; 31 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome; 32 namespace phy_syndrome = dimm::training_failure::phy_syndrome; 33 namespace training_failure = dimm::training_failure; 34 35 constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent.OK"; 36 constexpr const char* ampereWarningRegistry = 37 "OpenBMC.0.1.AmpereWarning.Warning"; 38 constexpr const char* ampereCriticalRegistry = 39 "OpenBMC.0.1.AmpereCritical.Critical"; 40 constexpr const char* BIOSFWPanicRegistry = 41 "OpenBMC.0.1.BIOSFirmwarePanicReason.Warning"; 42 constexpr auto maxDIMMIdxBitNum = 24; 43 constexpr auto maxDIMMInstantNum = 24; 44 45 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE}; 46 47 /* 48 An array of possible boot status of a boot stage. 49 The index maps with byte 0 of boot code. 50 */ 51 std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"}; 52 53 /* 54 An array of possible boot status of DDR training stage. 55 The index maps with byte 0 of boot code. 56 */ 57 std::array<std::string, 3> ddrTrainingMsg = { 58 " progress started", " in-progress", " progress completed"}; 59 60 /* 61 A map between PMIC status and logging strings. 62 */ 63 std::array<std::string, 8> pmicTempAlertMsg = { 64 "Below 85°C", "85°C", "95°C", "105°C", 65 "115°C", "125°C", "135°C", "Equal or greater than 140°C"}; 66 67 /* 68 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC 69 EPs through SMBus and PCIe. When host boots up, SMBUS interface 70 comes up first. In this interface, BMC is bus owner. 71 72 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available). 73 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available). 74 */ 75 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}}; 76 77 /* 78 A map between sensor IDs and their names in string. 79 Using pldm::oem::sensor_ids 80 */ 81 EventToMsgMap_t sensorIdToStrMap = { 82 {DDR_STATUS, "DDR_STATUS"}, 83 {PCP_VR_STATE, "PCP_VR_STATE"}, 84 {SOC_VR_STATE, "SOC_VR_STATE"}, 85 {DPHY_VR1_STATE, "DPHY_VR1_STATE"}, 86 {DPHY_VR2_STATE, "DPHY_VR2_STATE"}, 87 {D2D_VR_STATE, "D2D_VR_STATE"}, 88 {IOC_VR1_STATE, "IOC_VR1_STATE"}, 89 {IOC_VR2_STATE, "IOC_VR2_STATE"}, 90 {PCI_D_VR_STATE, "PCI_D_VR_STATE"}, 91 {PCI_A_VR_STATE, "PCI_A_VR_STATE"}, 92 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"}, 93 {BOOT_OVERALL, "BOOT_OVERALL"}, 94 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"}, 95 {WATCH_DOG, "WATCH_DOG"}}; 96 97 /* 98 A map between the boot stages and logging strings. 99 Using pldm::oem::boot::stage::boot_stage 100 */ 101 EventToMsgMap_t bootStageToMsgMap = { 102 {boot_stage::SECPRO, "SECpro"}, 103 {boot_stage::MPRO, "Mpro"}, 104 {boot_stage::ATF_BL1, "ATF BL1"}, 105 {boot_stage::ATF_BL2, "ATF BL2"}, 106 {boot_stage::DDR_INITIALIZATION, "DDR initialization"}, 107 {boot_stage::DDR_TRAINING, "DDR training"}, 108 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"}, 109 {boot_stage::ATF_BL31, "ATF BL31"}, 110 {boot_stage::ATF_BL32, "ATF BL32"}, 111 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"}, 112 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN, 113 "ATF BL33 (UEFI) booting status = "}}; 114 115 /* 116 A map between DDR status and logging strings. 117 Using pldm::oem::ddr::status::ddr_status 118 */ 119 EventToMsgMap_t ddrStatusToMsgMap = { 120 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"}, 121 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"}, 122 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"}, 123 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"}, 124 {ddr_status::OTHER_FAILURE, "has other failure"}, 125 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG, 126 "has boot failure due to no configuration"}, 127 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS, 128 "failsafe activated but boot success with the next valid configuration"}}; 129 130 /* 131 A map between DIMM status and logging strings. 132 Using pldm::oem::dimm::status::dimm_status 133 */ 134 EventToMsgMap_t dimmStatusToMsgMap = { 135 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"}, 136 {dimm_status::NOT_INSTALLED, "is not installed"}, 137 {dimm_status::OTHER_FAILURE, "has other failure"}, 138 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"}, 139 {dimm_status::TRAINING_FAILURE, "has training failure; "}, 140 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}}; 141 142 /* 143 A map between PHY training failure syndrome and logging strings. 144 Using 145 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome 146 */ 147 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = { 148 {phy_syndrome::NA, "(N/A)"}, 149 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"}, 150 {phy_syndrome::CA_LEVELING, "(CA leveling)"}, 151 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE, 152 "(PHY write level failure - see syndrome 1)"}, 153 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE, 154 "(PHY read gate leveling failure)"}, 155 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"}, 156 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"}, 157 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}}; 158 159 /* 160 A map between DIMM training failure syndrome and logging strings. 161 Using 162 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome 163 */ 164 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = { 165 {dimm_syndrome::NA, "(N/A)"}, 166 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE, 167 "(DRAM VREFDQ training failure)"}, 168 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"}, 169 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE, 170 "(LRDRIMM DB SW training failure)"}}; 171 172 /* 173 A map between DIMM training failure type and a pair of <logging strings - 174 syndrome map>. Using 175 pldm::oem::dimm::training_faillure::dimm_training_failure_type 176 */ 177 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>> 178 dimmTrainingFailureTypeMap = { 179 {training_failure::PHY_TRAINING_FAILURE_TYPE, 180 std::make_pair("PHY training failure", 181 phyTrainingFailureSyndromeToMsgMap)}, 182 {training_failure::DIMM_TRAINING_FAILURE_TYPE, 183 std::make_pair("DIMM training failure", 184 dimmTrainingFailureSyndromeToMsgMap)}}; 185 186 /* 187 A map between log level and the registry used for Redfish SEL log 188 Using pldm::oem::log_level 189 */ 190 std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = { 191 {log_level::OK, ampereEventRegistry}, 192 {log_level::WARNING, ampereWarningRegistry}, 193 {log_level::CRITICAL, ampereCriticalRegistry}, 194 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}}; 195 196 std::unordered_map< 197 uint16_t, 198 std::vector<std::pair< 199 std::string, 200 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>> 201 stateSensorToMsgMap = { 202 {SOC_HEALTH_AVAILABILITY, 203 {{"SoC Health", 204 {{1, {log_level::OK, "Normal"}}, 205 {2, {log_level::WARNING, "Non-Critical"}}, 206 {3, {log_level::CRITICAL, "Critical"}}, 207 {4, {log_level::CRITICAL, "Fatal"}}}}, 208 {"SoC Availability", 209 {{1, {log_level::OK, "Enabled"}}, 210 {2, {log_level::WARNING, "Disabled"}}, 211 {3, {log_level::CRITICAL, "Shutdown"}}}}}}, 212 {WATCH_DOG, 213 {{"Global Watch Dog", 214 {{1, {log_level::OK, "Normal"}}, 215 {2, {log_level::CRITICAL, "Timer Expired"}}}}, 216 {"Secure Watch Dog", 217 {{1, {log_level::OK, "Normal"}}, 218 {2, {log_level::CRITICAL, "Timer Expired"}}}}, 219 {"Non-secure Watch Dog", 220 {{1, {log_level::OK, "Normal"}}, 221 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}}; 222 223 std::string 224 OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId) 225 { 226 std::string description; 227 if (!tidToSocketNameMap.contains(tid)) 228 { 229 description += "TID " + std::to_string(tid) + ": "; 230 } 231 else 232 { 233 description += tidToSocketNameMap[tid] + ": "; 234 } 235 236 if (!sensorIdToStrMap.contains(sensorId)) 237 { 238 description += "Sensor ID " + std::to_string(sensorId) + ": "; 239 } 240 else 241 { 242 description += sensorIdToStrMap[sensorId] + ": "; 243 } 244 245 return description; 246 } 247 248 void OemEventManager::sendJournalRedfish(const std::string& description, 249 log_level& logLevel) 250 { 251 if (description.empty()) 252 { 253 return; 254 } 255 256 if (!logLevelToRedfishMsgIdMap.contains(logLevel)) 257 { 258 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel, 259 "DES", description); 260 return; 261 } 262 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel]; 263 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID", 264 redfishMsgId, "REDFISH_MESSAGE_ARGS", description); 265 } 266 267 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs) 268 { 269 std::string description; 270 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum)) 271 { 272 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx)) 273 { 274 description += " #" + std::to_string(bitIdx); 275 } 276 } 277 return description; 278 } 279 280 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId) 281 { 282 uint8_t dimmIdx = maxDIMMInstantNum; 283 int sensorId_Off = sensorId - 4; 284 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) && 285 ((sensorId_Off / 2) < maxDIMMInstantNum)) 286 { 287 dimmIdx = sensorId_Off / 2; 288 } 289 return dimmIdx; 290 } 291 292 void OemEventManager::handleBootOverallEvent( 293 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading) 294 { 295 log_level logLevel{log_level::OK}; 296 std::string description; 297 std::stringstream strStream; 298 299 uint8_t byte0 = (presentReading & 0x000000ff); 300 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8; 301 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16; 302 uint8_t byte3 = (presentReading & 0xff000000) >> 24; 303 /* 304 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31, 305 * ATF BL32 and DDR initialization 306 */ 307 if (bootStageToMsgMap.contains(byte3)) 308 { 309 // Boot stage adding 310 description += bootStageToMsgMap[byte3]; 311 312 switch (byte3) 313 { 314 case boot_stage::DDR_TRAINING: 315 if (byte0 >= ddrTrainingMsg.size()) 316 { 317 logLevel = log_level::BIOSFWPANIC; 318 description += " unknown status"; 319 } 320 else 321 { 322 description += ddrTrainingMsg[byte0]; 323 } 324 if (0x01 == byte0) 325 { 326 // Add complete percentage 327 description += " at " + std::to_string(byte1) + "%"; 328 } 329 break; 330 case boot_stage::S0_DDR_TRAINING_FAILURE: 331 case boot_stage::S1_DDR_TRAINING_FAILURE: 332 // ddr_training_status_msg() 333 logLevel = log_level::BIOSFWPANIC; 334 description += " at DIMMs:"; 335 // dimmIdxs = presentReading & 0x00ffffff; 336 description += dimmIdxsToString(presentReading & 0x00ffffff); 337 description += " of socket "; 338 description += 339 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1"; 340 break; 341 default: 342 if (byte0 >= bootStatMsg.size()) 343 { 344 logLevel = log_level::BIOSFWPANIC; 345 description += " unknown status"; 346 } 347 else 348 { 349 description += bootStatMsg[byte0]; 350 } 351 break; 352 } 353 354 // Sensor report action is fail 355 if (boot::status::BOOT_STATUS_FAILURE == byte2) 356 { 357 logLevel = log_level::BIOSFWPANIC; 358 } 359 } 360 else 361 { 362 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX) 363 { 364 description += 365 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN]; 366 367 strStream 368 << "Segment (0x" << std::setfill('0') << std::hex 369 << std::setw(8) << static_cast<uint32_t>(presentReading) 370 << "); Status Class (0x" << std::setw(2) 371 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x" 372 << std::setw(2) << static_cast<uint32_t>(byte2) 373 << "); Operation Code (0x" << std::setw(4) 374 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16) 375 << ")" << std::dec; 376 377 description += strStream.str(); 378 } 379 } 380 381 // Log to Redfish event 382 sendJournalRedfish(description, logLevel); 383 } 384 385 int OemEventManager::processNumericSensorEvent( 386 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData, 387 size_t sensorDataLength) 388 { 389 uint8_t eventState = 0; 390 uint8_t previousEventState = 0; 391 uint8_t sensorDataSize = 0; 392 uint32_t presentReading; 393 auto rc = decode_numeric_sensor_data( 394 sensorData, sensorDataLength, &eventState, &previousEventState, 395 &sensorDataSize, &presentReading); 396 if (rc) 397 { 398 lg2::error( 399 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ", 400 "TID", tid, "RC", rc); 401 return rc; 402 } 403 404 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1) 405 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum) 406 { 407 handleDIMMStatusEvent(tid, sensorId, presentReading); 408 return PLDM_SUCCESS; 409 } 410 411 switch (sensorId) 412 { 413 case BOOT_OVERALL: 414 handleBootOverallEvent(tid, sensorId, presentReading); 415 break; 416 case PCIE_HOT_PLUG: 417 handlePCIeHotPlugEvent(tid, sensorId, presentReading); 418 break; 419 case DDR_STATUS: 420 handleDDRStatusEvent(tid, sensorId, presentReading); 421 break; 422 case PCP_VR_STATE: 423 case SOC_VR_STATE: 424 case DPHY_VR1_STATE: 425 case DPHY_VR2_STATE: 426 case D2D_VR_STATE: 427 case IOC_VR1_STATE: 428 case IOC_VR2_STATE: 429 case PCI_D_VR_STATE: 430 case PCI_A_VR_STATE: 431 handleVRDStatusEvent(tid, sensorId, presentReading); 432 break; 433 case WATCH_DOG: 434 handleNumericWatchdogEvent(tid, sensorId, presentReading); 435 break; 436 default: 437 std::string description; 438 std::stringstream strStream; 439 log_level logLevel = log_level::OK; 440 441 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: "; 442 description += prefixMsgStrCreation(tid, sensorId); 443 strStream << std::setfill('0') << std::hex << "eventState 0x" 444 << std::setw(2) << static_cast<uint32_t>(eventState) 445 << " previousEventState 0x" << std::setw(2) 446 << static_cast<uint32_t>(previousEventState) 447 << " sensorDataSize 0x" << std::setw(2) 448 << static_cast<uint32_t>(sensorDataSize) 449 << " presentReading 0x" << std::setw(8) 450 << static_cast<uint32_t>(presentReading) << std::dec; 451 description += strStream.str(); 452 453 sendJournalRedfish(description, logLevel); 454 break; 455 } 456 return PLDM_SUCCESS; 457 } 458 459 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId, 460 const uint8_t* sensorData, 461 size_t sensorDataLength) 462 { 463 uint8_t sensorOffset = 0; 464 uint8_t eventState = 0; 465 uint8_t previousEventState = 0; 466 467 auto rc = 468 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset, 469 &eventState, &previousEventState); 470 if (rc) 471 { 472 lg2::error( 473 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}", 474 "TID", tid, "RC", rc); 475 return rc; 476 } 477 478 std::string description; 479 log_level logLevel = log_level::OK; 480 481 if (stateSensorToMsgMap.contains(sensorId)) 482 { 483 description += prefixMsgStrCreation(tid, sensorId); 484 auto componentMap = stateSensorToMsgMap[sensorId]; 485 if (sensorOffset < componentMap.size()) 486 { 487 description += std::get<0>(componentMap[sensorOffset]); 488 auto stateMap = std::get<1>(componentMap[sensorOffset]); 489 if (stateMap.contains(eventState)) 490 { 491 logLevel = std::get<0>(stateMap[eventState]); 492 description += " state : " + std::get<1>(stateMap[eventState]); 493 if (stateMap.contains(previousEventState)) 494 { 495 description += "; previous state: " + 496 std::get<1>(stateMap[previousEventState]); 497 } 498 } 499 else 500 { 501 description += " sends unsupported event state: " + 502 std::to_string(eventState); 503 if (stateMap.contains(previousEventState)) 504 { 505 description += "; previous state: " + 506 std::get<1>(stateMap[previousEventState]); 507 } 508 } 509 } 510 else 511 { 512 description += "sends unsupported component sensor offset " + 513 std::to_string(sensorOffset); 514 } 515 } 516 else 517 { 518 std::stringstream strStream; 519 description += "SENSOR_EVENT : STATE_SENSOR_STATE: "; 520 description += prefixMsgStrCreation(tid, sensorId); 521 strStream << std::setfill('0') << std::hex << "sensorOffset 0x" 522 << std::setw(2) << static_cast<uint32_t>(sensorOffset) 523 << "eventState 0x" << std::setw(2) 524 << static_cast<uint32_t>(eventState) 525 << " previousEventState 0x" << std::setw(2) 526 << static_cast<uint32_t>(previousEventState) << std::dec; 527 description += strStream.str(); 528 } 529 530 sendJournalRedfish(description, logLevel); 531 532 return PLDM_SUCCESS; 533 } 534 535 int OemEventManager::processSensorOpStateEvent( 536 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData, 537 size_t sensorDataLength) 538 { 539 uint8_t present_op_state = 0; 540 uint8_t previous_op_state = 0; 541 542 auto rc = decode_sensor_op_data(sensorData, sensorDataLength, 543 &present_op_state, &previous_op_state); 544 if (rc) 545 { 546 lg2::error( 547 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}", 548 "TID", tid, "RC", rc); 549 return rc; 550 } 551 552 std::string description; 553 std::stringstream strStream; 554 log_level logLevel = log_level::OK; 555 556 description += "SENSOR_EVENT : SENSOR_OP_STATE: "; 557 description += prefixMsgStrCreation(tid, sensorId); 558 strStream << std::setfill('0') << std::hex << "present_op_state 0x" 559 << std::setw(2) << static_cast<uint32_t>(present_op_state) 560 << "previous_op_state 0x" << std::setw(2) 561 << static_cast<uint32_t>(previous_op_state) << std::dec; 562 description += strStream.str(); 563 564 sendJournalRedfish(description, logLevel); 565 566 return PLDM_SUCCESS; 567 } 568 569 int OemEventManager::handleSensorEvent( 570 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */, 571 pldm_tid_t tid, size_t eventDataOffset) 572 { 573 /* This OEM event handler is only used for SoC terminus*/ 574 if (!tidToSocketNameMap.contains(tid)) 575 { 576 return PLDM_SUCCESS; 577 } 578 auto eventData = 579 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset; 580 auto eventDataSize = payloadLength - eventDataOffset; 581 582 uint16_t sensorId = 0; 583 uint8_t sensorEventClassType = 0; 584 size_t eventClassDataOffset = 0; 585 auto rc = 586 decode_sensor_event_data(eventData, eventDataSize, &sensorId, 587 &sensorEventClassType, &eventClassDataOffset); 588 if (rc) 589 { 590 lg2::error("Failed to decode sensor event data return code {RC}.", "RC", 591 rc); 592 return rc; 593 } 594 const uint8_t* sensorData = eventData + eventClassDataOffset; 595 size_t sensorDataLength = eventDataSize - eventClassDataOffset; 596 597 switch (sensorEventClassType) 598 { 599 case PLDM_NUMERIC_SENSOR_STATE: 600 { 601 return processNumericSensorEvent(tid, sensorId, sensorData, 602 sensorDataLength); 603 } 604 case PLDM_STATE_SENSOR_STATE: 605 { 606 return processStateSensorEvent(tid, sensorId, sensorData, 607 sensorDataLength); 608 } 609 case PLDM_SENSOR_OP_STATE: 610 { 611 return processSensorOpStateEvent(tid, sensorId, sensorData, 612 sensorDataLength); 613 } 614 default: 615 std::string description; 616 std::stringstream strStream; 617 log_level logLevel = log_level::OK; 618 619 description += "SENSOR_EVENT : Unsupported Sensor Class " + 620 std::to_string(sensorEventClassType) + ": "; 621 description += prefixMsgStrCreation(tid, sensorId); 622 strStream << std::setfill('0') << std::hex 623 << std::setw(sizeof(sensorData) * 2) << "Sensor data: "; 624 625 auto dataPtr = sensorData; 626 for ([[maybe_unused]] const auto& i : 627 std::views::iota(0, (int)sensorDataLength)) 628 { 629 strStream << "0x" << static_cast<uint32_t>(*dataPtr); 630 dataPtr += sizeof(sensorData); 631 } 632 633 description += strStream.str(); 634 635 sendJournalRedfish(description, logLevel); 636 } 637 lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE", 638 sensorEventClassType); 639 return PLDM_ERROR; 640 } 641 642 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId, 643 uint32_t presentReading) 644 { 645 std::string description; 646 std::stringstream strStream; 647 PCIeHotPlugEventRecord_t record{presentReading}; 648 649 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal"; 650 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed"; 651 log_level logLevel = 652 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING; 653 654 description += prefixMsgStrCreation(tid, sensorId); 655 656 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2) 657 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x" 658 << std::setw(2) << static_cast<uint32_t>(record.bits.bus) 659 << "); Device (0x" << std::setw(2) 660 << static_cast<uint32_t>(record.bits.device) << "); Function (0x" 661 << std::setw(2) << static_cast<uint32_t>(record.bits.function) 662 << "); Action (" << sAction << "); Operation status (" 663 << sOpStatus << "); Media slot number (" << std::dec 664 << static_cast<uint32_t>(record.bits.mediaSlot) << ")"; 665 666 description += strStream.str(); 667 668 // Log to Redfish event 669 sendJournalRedfish(description, logLevel); 670 } 671 672 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo) 673 { 674 std::string description; 675 DIMMTrainingFailure_t failure{failureInfo}; 676 677 if (dimmTrainingFailureTypeMap.contains(failure.bits.type)) 678 { 679 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type]; 680 681 description += std::get<0>(failureInfoMap); 682 683 description += "; MCU rank index " + 684 std::to_string(failure.bits.mcuRankIdx); 685 686 description += "; Slice number " + 687 std::to_string(failure.bits.sliceNum); 688 689 description += "; Upper nibble error status: "; 690 description += (!failure.bits.upperNibbStatErr) 691 ? "No error" 692 : "Found no rising edge"; 693 694 description += "; Lower nibble error status: "; 695 description += (!failure.bits.lowerNibbStatErr) 696 ? "No error" 697 : "Found no rising edge"; 698 699 description += "; Failure syndrome 0: "; 700 701 auto& syndromeMap = std::get<1>(failureInfoMap); 702 if (syndromeMap.contains(failure.bits.syndrome)) 703 { 704 description += syndromeMap[failure.bits.syndrome]; 705 } 706 else 707 { 708 description += "(Unknown syndrome)"; 709 } 710 } 711 else 712 { 713 description += "Unknown training failure type " + 714 std::to_string(failure.bits.type); 715 } 716 717 return description; 718 } 719 720 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId, 721 uint32_t presentReading) 722 { 723 log_level logLevel{log_level::WARNING}; 724 std::string description; 725 uint8_t byte3 = (presentReading & 0xff000000) >> 24; 726 uint32_t byte012 = presentReading & 0xffffff; 727 728 description += prefixMsgStrCreation(tid, sensorId); 729 730 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1) 731 auto dimmIdx = sensorIdToDIMMIdx(sensorId); 732 if (dimmIdx >= maxDIMMIdxBitNum) 733 { 734 return; 735 } 736 737 description += "DIMM " + std::to_string(dimmIdx) + " "; 738 739 if (dimmStatusToMsgMap.contains(byte3)) 740 { 741 if (byte3 == dimm_status::INSTALLED_NO_ERROR || 742 byte3 == dimm_status::INSTALLED_BUT_DISABLED) 743 { 744 logLevel = log_level::OK; 745 } 746 747 description += dimmStatusToMsgMap[byte3]; 748 749 if (byte3 == dimm_status::TRAINING_FAILURE) 750 { 751 description += "; " + dimmTrainingFailureToMsg(byte012); 752 } 753 else if (byte3 == dimm_status::PMIC_TEMP_ALERT) 754 { 755 uint8_t byte0 = (byte012 & 0xff); 756 if (byte0 < pmicTempAlertMsg.size()) 757 { 758 description += ": " + pmicTempAlertMsg[byte0]; 759 } 760 } 761 } 762 else 763 { 764 switch (byte3) 765 { 766 case dimm_status::PMIC_HIGH_TEMP: 767 if (byte012 == 0x01) 768 { 769 description += "has PMIC high temp condition"; 770 } 771 break; 772 case dimm_status::TSx_HIGH_TEMP: 773 switch (byte012) 774 { 775 case 0x01: 776 description += "has TS0"; 777 break; 778 case 0x02: 779 description += "has TS1"; 780 break; 781 case 0x03: 782 description += "has TS0 and TS1"; 783 break; 784 } 785 description += " exceeding their high temperature threshold"; 786 break; 787 case dimm_status::SPD_HUB_HIGH_TEMP: 788 if (byte012 == 0x01) 789 { 790 description += "has SPD/HUB high temp condition"; 791 } 792 break; 793 default: 794 description += "has unsupported status " + 795 std::to_string(byte3); 796 break; 797 } 798 } 799 800 // Log to Redfish event 801 sendJournalRedfish(description, logLevel); 802 } 803 804 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId, 805 uint32_t presentReading) 806 { 807 log_level logLevel{log_level::WARNING}; 808 std::string description; 809 uint8_t byte3 = (presentReading & 0xff000000) >> 24; 810 uint32_t byte012 = presentReading & 0xffffff; 811 812 description += prefixMsgStrCreation(tid, sensorId); 813 814 description += "DDR "; 815 if (ddrStatusToMsgMap.contains(byte3)) 816 { 817 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR) 818 { 819 logLevel = log_level::OK; 820 } 821 822 description += ddrStatusToMsgMap[byte3]; 823 824 if (byte3 == ddr_status::CONFIGURATION_FAILURE || 825 byte3 == ddr_status::TRAINING_FAILURE) 826 { 827 // List out failed DIMMs 828 description += dimmIdxsToString(byte012); 829 } 830 } 831 else 832 { 833 description += "has unsupported status " + std::to_string(byte3); 834 } 835 836 // Log to Redfish event 837 sendJournalRedfish(description, logLevel); 838 } 839 840 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId, 841 uint32_t presentReading) 842 { 843 log_level logLevel{log_level::WARNING}; 844 std::string description; 845 std::stringstream strStream; 846 847 description += prefixMsgStrCreation(tid, sensorId); 848 849 VRDStatus_t status{presentReading}; 850 851 if (status.bits.warning && status.bits.critical) 852 { 853 description += "A VR warning and a VR critical"; 854 logLevel = log_level::CRITICAL; 855 } 856 else 857 { 858 if (status.bits.warning) 859 { 860 description += "A VR warning"; 861 } 862 else if (status.bits.critical) 863 { 864 description += "A VR critical"; 865 logLevel = log_level::CRITICAL; 866 } 867 else 868 { 869 description += "No VR warning or critical"; 870 logLevel = log_level::OK; 871 } 872 } 873 description += " condition observed"; 874 875 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex 876 << std::setw(2) 877 << static_cast<uint32_t>(status.bits.vr_status_byte_high) 878 << "; VR status byte low is 0x" << std::setw(2) 879 << static_cast<uint32_t>(status.bits.vr_status_byte_low) 880 << "; Reading is 0x" << std::setw(2) 881 << static_cast<uint32_t>(presentReading) << ";"; 882 883 description += strStream.str(); 884 885 // Log to Redfish event 886 sendJournalRedfish(description, logLevel); 887 } 888 889 void OemEventManager::handleNumericWatchdogEvent( 890 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading) 891 { 892 std::string description; 893 log_level logLevel = log_level::CRITICAL; 894 895 description += prefixMsgStrCreation(tid, sensorId); 896 897 if (presentReading & 0x01) 898 { 899 description += "Global watchdog expired;"; 900 } 901 if (presentReading & 0x02) 902 { 903 description += "Secure watchdog expired;"; 904 } 905 if (presentReading & 0x04) 906 { 907 description += "Non-secure watchdog expired;"; 908 } 909 910 // Log to Redfish event 911 sendJournalRedfish(description, logLevel); 912 } 913 914 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId, 915 const uint8_t* eventData, 916 size_t eventDataSize) 917 { 918 EFI_AMPERE_ERROR_DATA ampHdr; 919 920 decodeCperRecord(eventData, eventDataSize, &Hdr); 921 922 addCperSELLog(tid, eventId, &Hdr); 923 924 /* isBert at bit 12 of TypeId */ 925 if (ampHdr.TypeId & 0x0800) 926 { 927 lg2::info("Ampere SoC BERT is triggered."); 928 std::variant<std::string> value( 929 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert"); 930 try 931 { 932 auto& bus = pldm::utils::DBusHandler::getBus(); 933 auto method = 934 bus.new_method_call("com.ampere.CrashCapture.Trigger", 935 "/com/ampere/crashcapture/trigger", 936 pldm::utils::dbusProperties, "Set"); 937 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions", 938 value); 939 bus.call_noreply(method); 940 } 941 catch (const std::exception& e) 942 { 943 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e); 944 } 945 } 946 947 return PLDM_SUCCESS; 948 } 949 950 int OemEventManager::handlepldmMessagePollEvent( 951 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */, 952 pldm_tid_t tid, size_t eventDataOffset) 953 { 954 /* This OEM event handler is only used for SoC terminus*/ 955 if (!tidToSocketNameMap.contains(tid)) 956 { 957 return PLDM_SUCCESS; 958 } 959 960 auto eventData = 961 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset; 962 auto eventDataSize = payloadLength - eventDataOffset; 963 964 pldm_message_poll_event poll_event{}; 965 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize, 966 &poll_event); 967 if (rc) 968 { 969 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ", 970 "RC", rc); 971 return rc; 972 } 973 974 auto sensorID = poll_event.event_id; 975 /* The UE errors */ 976 if (rasUESensorIDs.contains(sensorID)) 977 { 978 pldm::utils::DBusMapping dbusMapping{ 979 "/xyz/openbmc_project/led/groups/ras_ue_fault", 980 "xyz.openbmc_project.Led.Group", "Asserted", "bool"}; 981 try 982 { 983 pldm::utils::DBusHandler().setDbusProperty( 984 dbusMapping, pldm::utils::PropertyValue{bool(true)}); 985 } 986 catch (const std::exception& e) 987 { 988 lg2::error( 989 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}", 990 "TID", tid, "SENSORID", sensorID, "ERROR", e); 991 } 992 } 993 994 return PLDM_SUCCESS; 995 } 996 997 } // namespace oem_ampere 998 } // namespace pldm 999