1 #include "oem_event_manager.hpp" 2 3 #include "libcper/Cper.h" 4 5 #include "cper.hpp" 6 #include "requester/handler.hpp" 7 #include "requester/request.hpp" 8 9 #include <config.h> 10 #include <libpldm/pldm.h> 11 #include <libpldm/utils.h> 12 #include <systemd/sd-journal.h> 13 14 #include <phosphor-logging/lg2.hpp> 15 #include <xyz/openbmc_project/Logging/Entry/server.hpp> 16 17 #include <algorithm> 18 #include <map> 19 #include <set> 20 #include <sstream> 21 #include <string> 22 #include <unordered_map> 23 24 namespace pldm 25 { 26 namespace oem_ampere 27 { 28 namespace fs = std::filesystem; 29 using namespace std::chrono; 30 31 namespace boot_stage = boot::stage; 32 namespace ddr_status = ddr::status; 33 namespace dimm_status = dimm::status; 34 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome; 35 namespace phy_syndrome = dimm::training_failure::phy_syndrome; 36 namespace training_failure = dimm::training_failure; 37 38 constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent"; 39 constexpr const char* ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning"; 40 constexpr const char* ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical"; 41 constexpr const char* BIOSFWPanicRegistry = 42 "OpenBMC.0.1.BIOSFirmwarePanicReason"; 43 constexpr auto maxDIMMIdxBitNum = 24; 44 constexpr auto maxDIMMInstantNum = 24; 45 46 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE}; 47 48 /* 49 An array of possible boot status of a boot stage. 50 The index maps with byte 0 of boot code. 51 */ 52 std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"}; 53 54 /* 55 An array of possible boot status of DDR training stage. 56 The index maps with byte 0 of boot code. 57 */ 58 std::array<std::string, 3> ddrTrainingMsg = { 59 " progress started", " in-progress", " progress completed"}; 60 61 /* 62 A map between PMIC status and logging strings. 63 */ 64 std::array<std::string, 8> pmicTempAlertMsg = { 65 "Below 85°C", "85°C", "95°C", "105°C", 66 "115°C", "125°C", "135°C", "Equal or greater than 140°C"}; 67 68 /* 69 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC 70 EPs through SMBus and PCIe. When host boots up, SMBUS interface 71 comes up first. In this interface, BMC is bus owner. 72 73 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available). 74 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available). 75 */ 76 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}}; 77 78 /* 79 A map between sensor IDs and their names in string. 80 Using pldm::oem::sensor_ids 81 */ 82 EventToMsgMap_t sensorIdToStrMap = { 83 {DDR_STATUS, "DDR_STATUS"}, 84 {PCP_VR_STATE, "PCP_VR_STATE"}, 85 {SOC_VR_STATE, "SOC_VR_STATE"}, 86 {DPHY_VR1_STATE, "DPHY_VR1_STATE"}, 87 {DPHY_VR2_STATE, "DPHY_VR2_STATE"}, 88 {D2D_VR_STATE, "D2D_VR_STATE"}, 89 {IOC_VR1_STATE, "IOC_VR1_STATE"}, 90 {IOC_VR2_STATE, "IOC_VR2_STATE"}, 91 {PCI_D_VR_STATE, "PCI_D_VR_STATE"}, 92 {PCI_A_VR_STATE, "PCI_A_VR_STATE"}, 93 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"}, 94 {BOOT_OVERALL, "BOOT_OVERALL"}, 95 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"}, 96 {WATCH_DOG, "WATCH_DOG"}}; 97 98 /* 99 A map between the boot stages and logging strings. 100 Using pldm::oem::boot::stage::boot_stage 101 */ 102 EventToMsgMap_t bootStageToMsgMap = { 103 {boot_stage::SECPRO, "SECpro"}, 104 {boot_stage::MPRO, "Mpro"}, 105 {boot_stage::ATF_BL1, "ATF BL1"}, 106 {boot_stage::ATF_BL2, "ATF BL2"}, 107 {boot_stage::DDR_INITIALIZATION, "DDR initialization"}, 108 {boot_stage::DDR_TRAINING, "DDR training"}, 109 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"}, 110 {boot_stage::ATF_BL31, "ATF BL31"}, 111 {boot_stage::ATF_BL32, "ATF BL32"}, 112 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"}, 113 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN, 114 "ATF BL33 (UEFI) booting status = "}}; 115 116 /* 117 A map between DDR status and logging strings. 118 Using pldm::oem::ddr::status::ddr_status 119 */ 120 EventToMsgMap_t ddrStatusToMsgMap = { 121 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"}, 122 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"}, 123 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"}, 124 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"}, 125 {ddr_status::OTHER_FAILURE, "has other failure"}, 126 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG, 127 "has boot failure due to no configuration"}, 128 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS, 129 "failsafe activated but boot success with the next valid configuration"}}; 130 131 /* 132 A map between DIMM status and logging strings. 133 Using pldm::oem::dimm::status::dimm_status 134 */ 135 EventToMsgMap_t dimmStatusToMsgMap = { 136 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"}, 137 {dimm_status::NOT_INSTALLED, "is not installed"}, 138 {dimm_status::OTHER_FAILURE, "has other failure"}, 139 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"}, 140 {dimm_status::TRAINING_FAILURE, "has training failure; "}, 141 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}}; 142 143 /* 144 A map between PHY training failure syndrome and logging strings. 145 Using 146 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome 147 */ 148 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = { 149 {phy_syndrome::NA, "(N/A)"}, 150 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"}, 151 {phy_syndrome::CA_LEVELING, "(CA leveling)"}, 152 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE, 153 "(PHY write level failure - see syndrome 1)"}, 154 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE, 155 "(PHY read gate leveling failure)"}, 156 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"}, 157 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"}, 158 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}}; 159 160 /* 161 A map between DIMM training failure syndrome and logging strings. 162 Using 163 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome 164 */ 165 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = { 166 {dimm_syndrome::NA, "(N/A)"}, 167 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE, 168 "(DRAM VREFDQ training failure)"}, 169 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"}, 170 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE, 171 "(LRDRIMM DB SW training failure)"}}; 172 173 /* 174 A map between DIMM training failure type and a pair of <logging strings - 175 syndrome map>. Using 176 pldm::oem::dimm::training_faillure::dimm_training_failure_type 177 */ 178 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>> 179 dimmTrainingFailureTypeMap = { 180 {training_failure::PHY_TRAINING_FAILURE_TYPE, 181 std::make_pair("PHY training failure", 182 phyTrainingFailureSyndromeToMsgMap)}, 183 {training_failure::DIMM_TRAINING_FAILURE_TYPE, 184 std::make_pair("DIMM training failure", 185 dimmTrainingFailureSyndromeToMsgMap)}}; 186 187 /* 188 A map between log level and the registry used for Redfish SEL log 189 Using pldm::oem::log_level 190 */ 191 std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = { 192 {log_level::OK, ampereEventRegistry}, 193 {log_level::WARNING, ampereWarningRegistry}, 194 {log_level::CRITICAL, ampereCriticalRegistry}, 195 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}}; 196 197 std::unordered_map< 198 uint16_t, 199 std::vector<std::pair< 200 std::string, 201 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>> 202 stateSensorToMsgMap = { 203 {SOC_HEALTH_AVAILABILITY, 204 {{"SoC Health", 205 {{1, {log_level::OK, "Normal"}}, 206 {2, {log_level::WARNING, "Non-Critical"}}, 207 {3, {log_level::CRITICAL, "Critical"}}, 208 {4, {log_level::CRITICAL, "Fatal"}}}}, 209 {"SoC Availability", 210 {{1, {log_level::OK, "Enabled"}}, 211 {2, {log_level::WARNING, "Disabled"}}, 212 {3, {log_level::CRITICAL, "Shutdown"}}}}}}, 213 {WATCH_DOG, 214 {{"Global Watch Dog", 215 {{1, {log_level::OK, "Normal"}}, 216 {2, {log_level::CRITICAL, "Timer Expired"}}}}, 217 {"Secure Watch Dog", 218 {{1, {log_level::OK, "Normal"}}, 219 {2, {log_level::CRITICAL, "Timer Expired"}}}}, 220 {"Non-secure Watch Dog", 221 {{1, {log_level::OK, "Normal"}}, 222 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}}; 223 prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)224 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, 225 uint16_t sensorId) 226 { 227 std::string description; 228 if (!tidToSocketNameMap.contains(tid)) 229 { 230 description += "TID " + std::to_string(tid) + ": "; 231 } 232 else 233 { 234 description += tidToSocketNameMap[tid] + ": "; 235 } 236 237 if (!sensorIdToStrMap.contains(sensorId)) 238 { 239 description += "Sensor ID " + std::to_string(sensorId) + ": "; 240 } 241 else 242 { 243 description += sensorIdToStrMap[sensorId] + ": "; 244 } 245 246 return description; 247 } 248 sendJournalRedfish(const std::string & description,log_level & logLevel)249 void OemEventManager::sendJournalRedfish(const std::string& description, 250 log_level& logLevel) 251 { 252 if (description.empty()) 253 { 254 return; 255 } 256 257 if (!logLevelToRedfishMsgIdMap.contains(logLevel)) 258 { 259 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel, 260 "DES", description); 261 return; 262 } 263 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel]; 264 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID", 265 redfishMsgId, "REDFISH_MESSAGE_ARGS", description); 266 } 267 dimmIdxsToString(uint32_t dimmIdxs)268 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs) 269 { 270 std::string description; 271 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum)) 272 { 273 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx)) 274 { 275 description += " #" + std::to_string(bitIdx); 276 } 277 } 278 return description; 279 } 280 sensorIdToDIMMIdx(const uint16_t & sensorId)281 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId) 282 { 283 uint8_t dimmIdx = maxDIMMInstantNum; 284 int sensorId_Off = sensorId - 4; 285 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) && 286 ((sensorId_Off / 2) < maxDIMMInstantNum)) 287 { 288 dimmIdx = sensorId_Off / 2; 289 } 290 return dimmIdx; 291 } 292 handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)293 void OemEventManager::handleBootOverallEvent( 294 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading) 295 { 296 log_level logLevel{log_level::OK}; 297 std::string description; 298 std::stringstream strStream; 299 300 uint8_t byte0 = (presentReading & 0x000000ff); 301 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8; 302 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16; 303 uint8_t byte3 = (presentReading & 0xff000000) >> 24; 304 /* 305 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31, 306 * ATF BL32 and DDR initialization 307 */ 308 if (bootStageToMsgMap.contains(byte3)) 309 { 310 // Boot stage adding 311 description += bootStageToMsgMap[byte3]; 312 313 switch (byte3) 314 { 315 case boot_stage::DDR_TRAINING: 316 if (byte0 >= ddrTrainingMsg.size()) 317 { 318 logLevel = log_level::BIOSFWPANIC; 319 description += " unknown status"; 320 } 321 else 322 { 323 description += ddrTrainingMsg[byte0]; 324 } 325 if (0x01 == byte0) 326 { 327 // Add complete percentage 328 description += " at " + std::to_string(byte1) + "%"; 329 } 330 break; 331 case boot_stage::S0_DDR_TRAINING_FAILURE: 332 case boot_stage::S1_DDR_TRAINING_FAILURE: 333 // ddr_training_status_msg() 334 logLevel = log_level::BIOSFWPANIC; 335 description += " at DIMMs:"; 336 // dimmIdxs = presentReading & 0x00ffffff; 337 description += dimmIdxsToString(presentReading & 0x00ffffff); 338 description += " of socket "; 339 description += 340 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1"; 341 break; 342 default: 343 if (byte0 >= bootStatMsg.size()) 344 { 345 logLevel = log_level::BIOSFWPANIC; 346 description += " unknown status"; 347 } 348 else 349 { 350 description += bootStatMsg[byte0]; 351 } 352 break; 353 } 354 355 // Sensor report action is fail 356 if (boot::status::BOOT_STATUS_FAILURE == byte2) 357 { 358 logLevel = log_level::BIOSFWPANIC; 359 } 360 } 361 else 362 { 363 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX) 364 { 365 description += 366 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN]; 367 368 strStream 369 << "Segment (0x" << std::setfill('0') << std::hex 370 << std::setw(8) << static_cast<uint32_t>(presentReading) 371 << "); Status Class (0x" << std::setw(2) 372 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x" 373 << std::setw(2) << static_cast<uint32_t>(byte2) 374 << "); Operation Code (0x" << std::setw(4) 375 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16) 376 << ")" << std::dec; 377 378 description += strStream.str(); 379 } 380 } 381 382 // Log to Redfish event 383 sendJournalRedfish(description, logLevel); 384 } 385 processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)386 int OemEventManager::processNumericSensorEvent( 387 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData, 388 size_t sensorDataLength) 389 { 390 uint8_t eventState = 0; 391 uint8_t previousEventState = 0; 392 uint8_t sensorDataSize = 0; 393 uint32_t presentReading; 394 auto rc = decode_numeric_sensor_data( 395 sensorData, sensorDataLength, &eventState, &previousEventState, 396 &sensorDataSize, &presentReading); 397 if (rc) 398 { 399 lg2::error( 400 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ", 401 "TID", tid, "RC", rc); 402 return rc; 403 } 404 405 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1) 406 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum) 407 { 408 handleDIMMStatusEvent(tid, sensorId, presentReading); 409 return PLDM_SUCCESS; 410 } 411 412 switch (sensorId) 413 { 414 case BOOT_OVERALL: 415 handleBootOverallEvent(tid, sensorId, presentReading); 416 break; 417 case PCIE_HOT_PLUG: 418 handlePCIeHotPlugEvent(tid, sensorId, presentReading); 419 break; 420 case DDR_STATUS: 421 handleDDRStatusEvent(tid, sensorId, presentReading); 422 break; 423 case PCP_VR_STATE: 424 case SOC_VR_STATE: 425 case DPHY_VR1_STATE: 426 case DPHY_VR2_STATE: 427 case D2D_VR_STATE: 428 case IOC_VR1_STATE: 429 case IOC_VR2_STATE: 430 case PCI_D_VR_STATE: 431 case PCI_A_VR_STATE: 432 handleVRDStatusEvent(tid, sensorId, presentReading); 433 break; 434 case WATCH_DOG: 435 handleNumericWatchdogEvent(tid, sensorId, presentReading); 436 break; 437 default: 438 std::string description; 439 std::stringstream strStream; 440 441 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: "; 442 description += prefixMsgStrCreation(tid, sensorId); 443 strStream << std::setfill('0') << std::hex << "eventState 0x" 444 << std::setw(2) << static_cast<uint32_t>(eventState) 445 << " previousEventState 0x" << std::setw(2) 446 << static_cast<uint32_t>(previousEventState) 447 << " sensorDataSize 0x" << std::setw(2) 448 << static_cast<uint32_t>(sensorDataSize) 449 << " presentReading 0x" << std::setw(8) 450 << static_cast<uint32_t>(presentReading) << std::dec; 451 description += strStream.str(); 452 std::cout << description << "\n"; 453 } 454 return PLDM_SUCCESS; 455 } 456 processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)457 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId, 458 const uint8_t* sensorData, 459 size_t sensorDataLength) 460 { 461 uint8_t sensorOffset = 0; 462 uint8_t eventState = 0; 463 uint8_t previousEventState = 0; 464 465 auto rc = 466 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset, 467 &eventState, &previousEventState); 468 if (rc) 469 { 470 lg2::error( 471 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}", 472 "TID", tid, "RC", rc); 473 return rc; 474 } 475 476 std::string description; 477 478 if (stateSensorToMsgMap.contains(sensorId)) 479 { 480 log_level logLevel = log_level::OK; 481 482 description += prefixMsgStrCreation(tid, sensorId); 483 auto componentMap = stateSensorToMsgMap[sensorId]; 484 if (sensorOffset < componentMap.size()) 485 { 486 description += std::get<0>(componentMap[sensorOffset]); 487 auto stateMap = std::get<1>(componentMap[sensorOffset]); 488 if (stateMap.contains(eventState)) 489 { 490 logLevel = std::get<0>(stateMap[eventState]); 491 description += " state : " + std::get<1>(stateMap[eventState]); 492 if (stateMap.contains(previousEventState)) 493 { 494 description += "; previous state: " + 495 std::get<1>(stateMap[previousEventState]); 496 } 497 } 498 else 499 { 500 description += " sends unsupported event state: " + 501 std::to_string(eventState); 502 if (stateMap.contains(previousEventState)) 503 { 504 description += "; previous state: " + 505 std::get<1>(stateMap[previousEventState]); 506 } 507 } 508 } 509 else 510 { 511 description += "sends unsupported component sensor offset " + 512 std::to_string(sensorOffset); 513 } 514 515 sendJournalRedfish(description, logLevel); 516 } 517 else 518 { 519 std::stringstream strStream; 520 description += "SENSOR_EVENT : STATE_SENSOR_STATE: "; 521 description += prefixMsgStrCreation(tid, sensorId); 522 strStream << std::setfill('0') << std::hex << "sensorOffset 0x" 523 << std::setw(2) << static_cast<uint32_t>(sensorOffset) 524 << "eventState 0x" << std::setw(2) 525 << static_cast<uint32_t>(eventState) 526 << " previousEventState 0x" << std::setw(2) 527 << static_cast<uint32_t>(previousEventState) << std::dec; 528 description += strStream.str(); 529 std::cout << description << "\n"; 530 } 531 532 return PLDM_SUCCESS; 533 } 534 processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)535 int OemEventManager::processSensorOpStateEvent( 536 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData, 537 size_t sensorDataLength) 538 { 539 uint8_t present_op_state = 0; 540 uint8_t previous_op_state = 0; 541 542 auto rc = decode_sensor_op_data(sensorData, sensorDataLength, 543 &present_op_state, &previous_op_state); 544 if (rc) 545 { 546 lg2::error( 547 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}", 548 "TID", tid, "RC", rc); 549 return rc; 550 } 551 552 std::string description; 553 std::stringstream strStream; 554 555 description += "SENSOR_EVENT : SENSOR_OP_STATE: "; 556 description += prefixMsgStrCreation(tid, sensorId); 557 strStream << std::setfill('0') << std::hex << "present_op_state 0x" 558 << std::setw(2) << static_cast<uint32_t>(present_op_state) 559 << "previous_op_state 0x" << std::setw(2) 560 << static_cast<uint32_t>(previous_op_state) << std::dec; 561 description += strStream.str(); 562 std::cout << description << "\n"; 563 564 return PLDM_SUCCESS; 565 } 566 handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)567 int OemEventManager::handleSensorEvent( 568 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */, 569 pldm_tid_t tid, size_t eventDataOffset) 570 { 571 /* This OEM event handler is only used for SoC terminus*/ 572 if (!tidToSocketNameMap.contains(tid)) 573 { 574 return PLDM_SUCCESS; 575 } 576 auto eventData = 577 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset; 578 auto eventDataSize = payloadLength - eventDataOffset; 579 580 uint16_t sensorId = 0; 581 uint8_t sensorEventClassType = 0; 582 size_t eventClassDataOffset = 0; 583 auto rc = 584 decode_sensor_event_data(eventData, eventDataSize, &sensorId, 585 &sensorEventClassType, &eventClassDataOffset); 586 if (rc) 587 { 588 lg2::error("Failed to decode sensor event data return code {RC}.", "RC", 589 rc); 590 return rc; 591 } 592 const uint8_t* sensorData = eventData + eventClassDataOffset; 593 size_t sensorDataLength = eventDataSize - eventClassDataOffset; 594 595 switch (sensorEventClassType) 596 { 597 case PLDM_NUMERIC_SENSOR_STATE: 598 { 599 return processNumericSensorEvent(tid, sensorId, sensorData, 600 sensorDataLength); 601 } 602 case PLDM_STATE_SENSOR_STATE: 603 { 604 return processStateSensorEvent(tid, sensorId, sensorData, 605 sensorDataLength); 606 } 607 case PLDM_SENSOR_OP_STATE: 608 { 609 return processSensorOpStateEvent(tid, sensorId, sensorData, 610 sensorDataLength); 611 } 612 default: 613 std::string description; 614 std::stringstream strStream; 615 616 description += "SENSOR_EVENT : Unsupported Sensor Class " + 617 std::to_string(sensorEventClassType) + ": "; 618 description += prefixMsgStrCreation(tid, sensorId); 619 strStream << std::setfill('0') << std::hex 620 << std::setw(sizeof(sensorData) * 2) << "Sensor data: "; 621 622 auto dataPtr = sensorData; 623 for ([[maybe_unused]] const auto& i : 624 std::views::iota(0, (int)sensorDataLength)) 625 { 626 strStream << "0x" << static_cast<uint32_t>(*dataPtr); 627 dataPtr += sizeof(sensorData); 628 } 629 630 description += strStream.str(); 631 std::cout << description << "\n"; 632 } 633 634 return PLDM_ERROR; 635 } 636 handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)637 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId, 638 uint32_t presentReading) 639 { 640 std::string description; 641 std::stringstream strStream; 642 PCIeHotPlugEventRecord_t record{presentReading}; 643 644 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal"; 645 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed"; 646 log_level logLevel = 647 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING; 648 649 description += prefixMsgStrCreation(tid, sensorId); 650 651 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2) 652 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x" 653 << std::setw(2) << static_cast<uint32_t>(record.bits.bus) 654 << "); Device (0x" << std::setw(2) 655 << static_cast<uint32_t>(record.bits.device) << "); Function (0x" 656 << std::setw(2) << static_cast<uint32_t>(record.bits.function) 657 << "); Action (" << sAction << "); Operation status (" 658 << sOpStatus << "); Media slot number (" << std::dec 659 << static_cast<uint32_t>(record.bits.mediaSlot) << ")"; 660 661 description += strStream.str(); 662 663 // Log to Redfish event 664 sendJournalRedfish(description, logLevel); 665 } 666 dimmTrainingFailureToMsg(uint32_t failureInfo)667 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo) 668 { 669 std::string description; 670 DIMMTrainingFailure_t failure{failureInfo}; 671 672 if (dimmTrainingFailureTypeMap.contains(failure.bits.type)) 673 { 674 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type]; 675 676 description += std::get<0>(failureInfoMap); 677 678 description += "; MCU rank index " + 679 std::to_string(failure.bits.mcuRankIdx); 680 681 description += "; Slice number " + 682 std::to_string(failure.bits.sliceNum); 683 684 description += "; Upper nibble error status: "; 685 description += (!failure.bits.upperNibbStatErr) 686 ? "No error" 687 : "Found no rising edge"; 688 689 description += "; Lower nibble error status: "; 690 description += (!failure.bits.lowerNibbStatErr) 691 ? "No error" 692 : "Found no rising edge"; 693 694 description += "; Failure syndrome 0: "; 695 696 auto& syndromeMap = std::get<1>(failureInfoMap); 697 if (syndromeMap.contains(failure.bits.syndrome)) 698 { 699 description += syndromeMap[failure.bits.syndrome]; 700 } 701 else 702 { 703 description += "(Unknown syndrome)"; 704 } 705 } 706 else 707 { 708 description += "Unknown training failure type " + 709 std::to_string(failure.bits.type); 710 } 711 712 return description; 713 } 714 handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)715 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId, 716 uint32_t presentReading) 717 { 718 log_level logLevel{log_level::WARNING}; 719 std::string description; 720 uint8_t byte3 = (presentReading & 0xff000000) >> 24; 721 uint32_t byte012 = presentReading & 0xffffff; 722 723 description += prefixMsgStrCreation(tid, sensorId); 724 725 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1) 726 auto dimmIdx = sensorIdToDIMMIdx(sensorId); 727 if (dimmIdx >= maxDIMMIdxBitNum) 728 { 729 return; 730 } 731 732 description += "DIMM " + std::to_string(dimmIdx) + " "; 733 734 if (dimmStatusToMsgMap.contains(byte3)) 735 { 736 if (byte3 == dimm_status::INSTALLED_NO_ERROR || 737 byte3 == dimm_status::INSTALLED_BUT_DISABLED) 738 { 739 logLevel = log_level::OK; 740 } 741 742 description += dimmStatusToMsgMap[byte3]; 743 744 if (byte3 == dimm_status::TRAINING_FAILURE) 745 { 746 description += "; " + dimmTrainingFailureToMsg(byte012); 747 } 748 else if (byte3 == dimm_status::PMIC_TEMP_ALERT) 749 { 750 uint8_t byte0 = (byte012 & 0xff); 751 if (byte0 < pmicTempAlertMsg.size()) 752 { 753 description += ": " + pmicTempAlertMsg[byte0]; 754 } 755 } 756 } 757 else 758 { 759 switch (byte3) 760 { 761 case dimm_status::PMIC_HIGH_TEMP: 762 if (byte012 == 0x01) 763 { 764 description += "has PMIC high temp condition"; 765 } 766 break; 767 case dimm_status::TSx_HIGH_TEMP: 768 switch (byte012) 769 { 770 case 0x01: 771 description += "has TS0"; 772 break; 773 case 0x02: 774 description += "has TS1"; 775 break; 776 case 0x03: 777 description += "has TS0 and TS1"; 778 break; 779 } 780 description += " exceeding their high temperature threshold"; 781 break; 782 case dimm_status::SPD_HUB_HIGH_TEMP: 783 if (byte012 == 0x01) 784 { 785 description += "has SPD/HUB high temp condition"; 786 } 787 break; 788 default: 789 description += "has unsupported status " + 790 std::to_string(byte3); 791 break; 792 } 793 } 794 795 // Log to Redfish event 796 sendJournalRedfish(description, logLevel); 797 } 798 handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)799 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId, 800 uint32_t presentReading) 801 { 802 log_level logLevel{log_level::WARNING}; 803 std::string description; 804 uint8_t byte3 = (presentReading & 0xff000000) >> 24; 805 uint32_t byte012 = presentReading & 0xffffff; 806 807 description += prefixMsgStrCreation(tid, sensorId); 808 809 description += "DDR "; 810 if (ddrStatusToMsgMap.contains(byte3)) 811 { 812 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR) 813 { 814 logLevel = log_level::OK; 815 } 816 817 description += ddrStatusToMsgMap[byte3]; 818 819 if (byte3 == ddr_status::CONFIGURATION_FAILURE || 820 byte3 == ddr_status::TRAINING_FAILURE) 821 { 822 // List out failed DIMMs 823 description += dimmIdxsToString(byte012); 824 } 825 } 826 else 827 { 828 description += "has unsupported status " + std::to_string(byte3); 829 } 830 831 // Log to Redfish event 832 sendJournalRedfish(description, logLevel); 833 } 834 handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)835 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId, 836 uint32_t presentReading) 837 { 838 log_level logLevel{log_level::WARNING}; 839 std::string description; 840 std::stringstream strStream; 841 842 description += prefixMsgStrCreation(tid, sensorId); 843 844 VRDStatus_t status{presentReading}; 845 846 if (status.bits.warning && status.bits.critical) 847 { 848 description += "A VR warning and a VR critical"; 849 logLevel = log_level::CRITICAL; 850 } 851 else 852 { 853 if (status.bits.warning) 854 { 855 description += "A VR warning"; 856 } 857 else if (status.bits.critical) 858 { 859 description += "A VR critical"; 860 logLevel = log_level::CRITICAL; 861 } 862 else 863 { 864 description += "No VR warning or critical"; 865 logLevel = log_level::OK; 866 } 867 } 868 description += " condition observed"; 869 870 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex 871 << std::setw(2) 872 << static_cast<uint32_t>(status.bits.vr_status_byte_high) 873 << "; VR status byte low is 0x" << std::setw(2) 874 << static_cast<uint32_t>(status.bits.vr_status_byte_low) 875 << "; Reading is 0x" << std::setw(2) 876 << static_cast<uint32_t>(presentReading) << ";"; 877 878 description += strStream.str(); 879 880 // Log to Redfish event 881 sendJournalRedfish(description, logLevel); 882 } 883 handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)884 void OemEventManager::handleNumericWatchdogEvent( 885 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading) 886 { 887 std::string description; 888 log_level logLevel = log_level::CRITICAL; 889 890 description += prefixMsgStrCreation(tid, sensorId); 891 892 if (presentReading & 0x01) 893 { 894 description += "Global watchdog expired;"; 895 } 896 if (presentReading & 0x02) 897 { 898 description += "Secure watchdog expired;"; 899 } 900 if (presentReading & 0x04) 901 { 902 description += "Non-secure watchdog expired;"; 903 } 904 905 // Log to Redfish event 906 sendJournalRedfish(description, logLevel); 907 } 908 processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)909 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId, 910 const uint8_t* eventData, 911 size_t eventDataSize) 912 { 913 EFI_AMPERE_ERROR_DATA ampHdr; 914 915 decodeCperRecord(eventData, eventDataSize, &Hdr); 916 917 addCperSELLog(tid, eventId, &Hdr); 918 919 /* isBert at bit 12 of TypeId */ 920 if (ampHdr.TypeId & 0x0800) 921 { 922 lg2::info("Ampere SoC BERT is triggered."); 923 std::variant<std::string> value( 924 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert"); 925 try 926 { 927 auto& bus = pldm::utils::DBusHandler::getBus(); 928 auto method = 929 bus.new_method_call("com.ampere.CrashCapture.Trigger", 930 "/com/ampere/crashcapture/trigger", 931 pldm::utils::dbusProperties, "Set"); 932 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions", 933 value); 934 bus.call_noreply(method); 935 } 936 catch (const std::exception& e) 937 { 938 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e); 939 } 940 } 941 942 return PLDM_SUCCESS; 943 } 944 handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)945 int OemEventManager::handlepldmMessagePollEvent( 946 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */, 947 pldm_tid_t tid, size_t eventDataOffset) 948 { 949 /* This OEM event handler is only used for SoC terminus*/ 950 if (!tidToSocketNameMap.contains(tid)) 951 { 952 return PLDM_SUCCESS; 953 } 954 955 auto eventData = 956 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset; 957 auto eventDataSize = payloadLength - eventDataOffset; 958 959 pldm_message_poll_event poll_event{}; 960 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize, 961 &poll_event); 962 if (rc) 963 { 964 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ", 965 "RC", rc); 966 return rc; 967 } 968 969 auto sensorID = poll_event.event_id; 970 /* The UE errors */ 971 if (rasUESensorIDs.contains(sensorID)) 972 { 973 pldm::utils::DBusMapping dbusMapping{ 974 "/xyz/openbmc_project/led/groups/ras_ue_fault", 975 "xyz.openbmc_project.Led.Group", "Asserted", "bool"}; 976 try 977 { 978 pldm::utils::DBusHandler().setDbusProperty( 979 dbusMapping, pldm::utils::PropertyValue{bool(true)}); 980 } 981 catch (const std::exception& e) 982 { 983 lg2::error( 984 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}", 985 "TID", tid, "SENSORID", sensorID, "ERROR", e); 986 } 987 } 988 989 return PLDM_SUCCESS; 990 } 991 oemPollForPlatformEvent(pldm_tid_t tid)992 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid) 993 { 994 uint64_t t0 = 0; 995 996 /* This OEM event handler is only used for SoC terminus */ 997 if (!tidToSocketNameMap.contains(tid)) 998 { 999 co_return PLDM_SUCCESS; 1000 } 1001 1002 if (!timeStampMap.contains(tid)) 1003 { 1004 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0); 1005 timeStampMap.emplace(std::make_pair(tid, t0)); 1006 } 1007 else 1008 { 1009 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0); 1010 uint64_t elapsed = t0 - timeStampMap[tid]; 1011 if (elapsed >= NORMAL_EVENT_POLLING_TIME) 1012 { 1013 co_await manager->pollForPlatformEvent(tid, 0, 0); 1014 timeStampMap[tid] = t0; 1015 } 1016 } 1017 1018 co_return PLDM_SUCCESS; 1019 } 1020 } // namespace oem_ampere 1021 } // namespace pldm 1022