xref: /openbmc/pldm/oem/ampere/event/oem_event_manager.cpp (revision 198084bb50931b64a3134e159d5ca3361f084a5d)
1 #include "oem_event_manager.hpp"
2 
3 #include "libcper/Cper.h"
4 
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8 
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13 
14 #include <phosphor-logging/lg2.hpp>
15 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
16 
17 #include <algorithm>
18 #include <map>
19 #include <set>
20 #include <sstream>
21 #include <string>
22 #include <unordered_map>
23 
24 namespace pldm
25 {
26 namespace oem_ampere
27 {
28 namespace boot_stage = boot::stage;
29 namespace ddr_status = ddr::status;
30 namespace dimm_status = dimm::status;
31 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
32 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
33 namespace training_failure = dimm::training_failure;
34 
/*
    Redfish message registry identifiers. They are the values of
    logLevelToRedfishMsgIdMap and end up in the REDFISH_MESSAGE_ID field
    written by sendJournalRedfish().
*/
constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent";
constexpr const char* ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning";
constexpr const char* ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical";
constexpr const char* BIOSFWPanicRegistry =
    "OpenBMC.0.1.BIOSFirmwarePanicReason";
/* Number of low-order bits scanned by dimmIdxsToString() (one bit per DIMM). */
constexpr auto maxDIMMIdxBitNum = 24;
/*
    Number of DIMM status sensors recognised by sensorIdToDIMMIdx(); the same
    value is returned by that function to mean "not a DIMM status sensor".
*/
constexpr auto maxDIMMInstantNum = 24;

/*
    Sensor IDs CORE_UE, MCU_UE, PCIE_UE and SOC_UE.
    NOTE(review): presumably the RAS uncorrectable-error sensors; the set is
    not referenced in the part of the file reviewed here - confirm its users.
*/
const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
44 
/*
    Possible statuses of a boot stage.
    Indexed by byte 0 of the BOOT_OVERALL reading; handleBootOverallEvent()
    reports " unknown status" when byte 0 is outside this array.
*/
std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"};

/*
    Possible statuses of the DDR training stage.
    Indexed by byte 0 of the BOOT_OVERALL reading when the boot stage is
    DDR_TRAINING.
*/
std::array<std::string, 3> ddrTrainingMsg = {
    " progress started", " in-progress", " progress completed"};

/*
    PMIC temperature alert levels.
    Indexed by byte 0 of a DIMM status reading whose status is
    PMIC_TEMP_ALERT (see handleDIMMStatusEvent()).
*/
std::array<std::string, 8> pmicTempAlertMsg = {
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
64 
/*
    Map from PLDM terminus ID (TID) to the socket name used in log messages.

    In Ampere systems, the BMC only communicates directly with the MCTP/PLDM
    SoC endpoints through SMBus and PCIe. When the host boots up, the SMBus
    interface comes up first. On this interface the BMC is the bus owner.

    mctpd will set EID 0x14 for S0 and 0x16 for S1 (if available).
    pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).

    handleSensorEvent() ignores events from any TID that is not a key here.
*/
EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
74 
/*
    Map from sensor ID to the sensor name printed in log messages.
    Keys come from pldm::oem::sensor_ids.
    prefixMsgStrCreation() falls back to "Sensor ID <n>" for IDs not listed.
*/
EventToMsgMap_t sensorIdToStrMap = {
    {DDR_STATUS, "DDR_STATUS"},
    {PCP_VR_STATE, "PCP_VR_STATE"},
    {SOC_VR_STATE, "SOC_VR_STATE"},
    {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
    {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
    {D2D_VR_STATE, "D2D_VR_STATE"},
    {IOC_VR1_STATE, "IOC_VR1_STATE"},
    {IOC_VR2_STATE, "IOC_VR2_STATE"},
    {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
    {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
    {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
    {BOOT_OVERALL, "BOOT_OVERALL"},
    {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
    {WATCH_DOG, "WATCH_DOG"}};
94 
/*
    Map from boot stage (byte 3 of the BOOT_OVERALL reading) to the text that
    starts the log message.
    Keys come from pldm::oem::boot::stage::boot_stage.

    The UEFI_STATUS_CLASS_CODE_MIN entry is not a stage name of its own:
    handleBootOverallEvent() uses its text as the prefix for every reading
    whose byte 3 is at most UEFI_STATUS_CLASS_CODE_MAX and is not otherwise
    listed here.
*/
EventToMsgMap_t bootStageToMsgMap = {
    {boot_stage::SECPRO, "SECpro"},
    {boot_stage::MPRO, "Mpro"},
    {boot_stage::ATF_BL1, "ATF BL1"},
    {boot_stage::ATF_BL2, "ATF BL2"},
    {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
    {boot_stage::DDR_TRAINING, "DDR training"},
    {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::ATF_BL31, "ATF BL31"},
    {boot_stage::ATF_BL32, "ATF BL32"},
    {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
     "ATF BL33 (UEFI) booting status = "}};
112 
/*
    Map from DDR status (byte 3 of the DDR_STATUS reading) to the text
    appended after "DDR " in the log message.
    Keys come from pldm::oem::ddr::status::ddr_status.

    The two entries ending in "at DIMMs:" are followed by the DIMM list that
    handleDDRStatusEvent() builds from the low 24 bits of the reading.
*/
EventToMsgMap_t ddrStatusToMsgMap = {
    {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
    {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
    {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
    {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
    {ddr_status::OTHER_FAILURE, "has other failure"},
    {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
     "has boot failure due to no configuration"},
    {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
     "failsafe activated but boot success with the next valid configuration"}};
127 
/*
    Map from DIMM status (byte 3 of a DIMM status reading) to the text
    appended after "DIMM <index> " in the log message.
    Keys come from pldm::oem::dimm::status::dimm_status.

    Statuses that are not keys here (PMIC_HIGH_TEMP, TSx_HIGH_TEMP,
    SPD_HUB_HIGH_TEMP, ...) are formatted directly by handleDIMMStatusEvent().
    Note that the TRAINING_FAILURE text already ends with "; " because the
    training failure details are appended after it.
*/
EventToMsgMap_t dimmStatusToMsgMap = {
    {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
    {dimm_status::NOT_INSTALLED, "is not installed"},
    {dimm_status::OTHER_FAILURE, "has other failure"},
    {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
    {dimm_status::TRAINING_FAILURE, "has training failure; "},
    {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
139 
/*
    Map from PHY training failure syndrome to its log text.
    Keys come from
    pldm::oem::dimm::training_failure::phy_syndrome::phy_training_failure_syndrome.
    Used (through dimmTrainingFailureTypeMap) when the failure type is
    PHY_TRAINING_FAILURE_TYPE.
*/
EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
    {phy_syndrome::NA, "(N/A)"},
    {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
    {phy_syndrome::CA_LEVELING, "(CA leveling)"},
    {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
     "(PHY write level failure - see syndrome 1)"},
    {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
     "(PHY read gate leveling failure)"},
    {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
    {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
    {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
156 
/*
    Map from DIMM training failure syndrome to its log text.
    Keys come from
    pldm::oem::dimm::training_failure::dimm_syndrome::dimm_training_failure_syndrome.
    Used (through dimmTrainingFailureTypeMap) when the failure type is
    DIMM_TRAINING_FAILURE_TYPE.
*/
EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
    {dimm_syndrome::NA, "(N/A)"},
    {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
     "(DRAM VREFDQ training failure)"},
    {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
    {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
     "(LRDRIMM DB SW training failure)"}};
169 
/*
    Map from DIMM training failure type to a pair of
    <failure type text, syndrome map for that type>.
    Keys come from
    pldm::oem::dimm::training_failure::dimm_training_failure_type.

    The pair stores its syndrome map by value, so each entry holds a copy of
    the corresponding table defined above, taken when this map is
    initialized.
*/
std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
    dimmTrainingFailureTypeMap = {
        {training_failure::PHY_TRAINING_FAILURE_TYPE,
         std::make_pair("PHY training failure",
                        phyTrainingFailureSyndromeToMsgMap)},
        {training_failure::DIMM_TRAINING_FAILURE_TYPE,
         std::make_pair("DIMM training failure",
                        dimmTrainingFailureSyndromeToMsgMap)}};
183 
/*
    Map from log level to the registry identifier that sendJournalRedfish()
    writes as REDFISH_MESSAGE_ID.
    Keys come from pldm::oem::log_level; a level that is not listed here makes
    sendJournalRedfish() log an error and drop the message.
*/
std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
    {log_level::OK, ampereEventRegistry},
    {log_level::WARNING, ampereWarningRegistry},
    {log_level::CRITICAL, ampereCriticalRegistry},
    {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
193 
/*
    Decoding table for state sensor events, used by processStateSensorEvent().

    Layout:
      sensor ID -> vector indexed by the event's sensor offset, whose elements
                   are pairs of
                     <component name,
                      map from event state to <log level, state text>>.

    An offset beyond the vector, or an event state missing from the inner
    map, is reported as unsupported by processStateSensorEvent().
*/
std::unordered_map<
    uint16_t,
    std::vector<std::pair<
        std::string,
        std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
    stateSensorToMsgMap = {
        {SOC_HEALTH_AVAILABILITY,
         {{"SoC Health",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::WARNING, "Non-Critical"}},
            {3, {log_level::CRITICAL, "Critical"}},
            {4, {log_level::CRITICAL, "Fatal"}}}},
          {"SoC Availability",
           {{1, {log_level::OK, "Enabled"}},
            {2, {log_level::WARNING, "Disabled"}},
            {3, {log_level::CRITICAL, "Shutdown"}}}}}},
        {WATCH_DOG,
         {{"Global Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Non-secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
220 
221 std::string
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)222     OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId)
223 {
224     std::string description;
225     if (!tidToSocketNameMap.contains(tid))
226     {
227         description += "TID " + std::to_string(tid) + ": ";
228     }
229     else
230     {
231         description += tidToSocketNameMap[tid] + ": ";
232     }
233 
234     if (!sensorIdToStrMap.contains(sensorId))
235     {
236         description += "Sensor ID " + std::to_string(sensorId) + ": ";
237     }
238     else
239     {
240         description += sensorIdToStrMap[sensorId] + ": ";
241     }
242 
243     return description;
244 }
245 
sendJournalRedfish(const std::string & description,log_level & logLevel)246 void OemEventManager::sendJournalRedfish(const std::string& description,
247                                          log_level& logLevel)
248 {
249     if (description.empty())
250     {
251         return;
252     }
253 
254     if (!logLevelToRedfishMsgIdMap.contains(logLevel))
255     {
256         lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
257                    "DES", description);
258         return;
259     }
260     auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
261     lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
262               redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
263 }
264 
dimmIdxsToString(uint32_t dimmIdxs)265 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
266 {
267     std::string description;
268     for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
269     {
270         if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
271         {
272             description += " #" + std::to_string(bitIdx);
273         }
274     }
275     return description;
276 }
277 
sensorIdToDIMMIdx(const uint16_t & sensorId)278 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
279 {
280     uint8_t dimmIdx = maxDIMMInstantNum;
281     int sensorId_Off = sensorId - 4;
282     if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
283         ((sensorId_Off / 2) < maxDIMMInstantNum))
284     {
285         dimmIdx = sensorId_Off / 2;
286     }
287     return dimmIdx;
288 }
289 
handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)290 void OemEventManager::handleBootOverallEvent(
291     pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
292 {
293     log_level logLevel{log_level::OK};
294     std::string description;
295     std::stringstream strStream;
296 
297     uint8_t byte0 = (presentReading & 0x000000ff);
298     uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
299     uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
300     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
301     /*
302      * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
303      * ATF BL32 and DDR initialization
304      */
305     if (bootStageToMsgMap.contains(byte3))
306     {
307         // Boot stage adding
308         description += bootStageToMsgMap[byte3];
309 
310         switch (byte3)
311         {
312             case boot_stage::DDR_TRAINING:
313                 if (byte0 >= ddrTrainingMsg.size())
314                 {
315                     logLevel = log_level::BIOSFWPANIC;
316                     description += " unknown status";
317                 }
318                 else
319                 {
320                     description += ddrTrainingMsg[byte0];
321                 }
322                 if (0x01 == byte0)
323                 {
324                     // Add complete percentage
325                     description += " at " + std::to_string(byte1) + "%";
326                 }
327                 break;
328             case boot_stage::S0_DDR_TRAINING_FAILURE:
329             case boot_stage::S1_DDR_TRAINING_FAILURE:
330                 // ddr_training_status_msg()
331                 logLevel = log_level::BIOSFWPANIC;
332                 description += " at DIMMs:";
333                 // dimmIdxs = presentReading & 0x00ffffff;
334                 description += dimmIdxsToString(presentReading & 0x00ffffff);
335                 description += " of socket ";
336                 description +=
337                     (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
338                 break;
339             default:
340                 if (byte0 >= bootStatMsg.size())
341                 {
342                     logLevel = log_level::BIOSFWPANIC;
343                     description += " unknown status";
344                 }
345                 else
346                 {
347                     description += bootStatMsg[byte0];
348                 }
349                 break;
350         }
351 
352         // Sensor report action is fail
353         if (boot::status::BOOT_STATUS_FAILURE == byte2)
354         {
355             logLevel = log_level::BIOSFWPANIC;
356         }
357     }
358     else
359     {
360         if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
361         {
362             description +=
363                 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
364 
365             strStream
366                 << "Segment (0x" << std::setfill('0') << std::hex
367                 << std::setw(8) << static_cast<uint32_t>(presentReading)
368                 << "); Status Class (0x" << std::setw(2)
369                 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
370                 << std::setw(2) << static_cast<uint32_t>(byte2)
371                 << "); Operation Code (0x" << std::setw(4)
372                 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
373                 << ")" << std::dec;
374 
375             description += strStream.str();
376         }
377     }
378 
379     // Log to Redfish event
380     sendJournalRedfish(description, logLevel);
381 }
382 
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)383 int OemEventManager::processNumericSensorEvent(
384     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
385     size_t sensorDataLength)
386 {
387     uint8_t eventState = 0;
388     uint8_t previousEventState = 0;
389     uint8_t sensorDataSize = 0;
390     uint32_t presentReading;
391     auto rc = decode_numeric_sensor_data(
392         sensorData, sensorDataLength, &eventState, &previousEventState,
393         &sensorDataSize, &presentReading);
394     if (rc)
395     {
396         lg2::error(
397             "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
398             "TID", tid, "RC", rc);
399         return rc;
400     }
401 
402     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
403     if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
404     {
405         handleDIMMStatusEvent(tid, sensorId, presentReading);
406         return PLDM_SUCCESS;
407     }
408 
409     switch (sensorId)
410     {
411         case BOOT_OVERALL:
412             handleBootOverallEvent(tid, sensorId, presentReading);
413             break;
414         case PCIE_HOT_PLUG:
415             handlePCIeHotPlugEvent(tid, sensorId, presentReading);
416             break;
417         case DDR_STATUS:
418             handleDDRStatusEvent(tid, sensorId, presentReading);
419             break;
420         case PCP_VR_STATE:
421         case SOC_VR_STATE:
422         case DPHY_VR1_STATE:
423         case DPHY_VR2_STATE:
424         case D2D_VR_STATE:
425         case IOC_VR1_STATE:
426         case IOC_VR2_STATE:
427         case PCI_D_VR_STATE:
428         case PCI_A_VR_STATE:
429             handleVRDStatusEvent(tid, sensorId, presentReading);
430             break;
431         case WATCH_DOG:
432             handleNumericWatchdogEvent(tid, sensorId, presentReading);
433             break;
434         default:
435             std::string description;
436             std::stringstream strStream;
437             log_level logLevel = log_level::OK;
438 
439             description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
440             description += prefixMsgStrCreation(tid, sensorId);
441             strStream << std::setfill('0') << std::hex << "eventState 0x"
442                       << std::setw(2) << static_cast<uint32_t>(eventState)
443                       << " previousEventState 0x" << std::setw(2)
444                       << static_cast<uint32_t>(previousEventState)
445                       << " sensorDataSize 0x" << std::setw(2)
446                       << static_cast<uint32_t>(sensorDataSize)
447                       << " presentReading 0x" << std::setw(8)
448                       << static_cast<uint32_t>(presentReading) << std::dec;
449             description += strStream.str();
450 
451             sendJournalRedfish(description, logLevel);
452             break;
453     }
454     return PLDM_SUCCESS;
455 }
456 
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)457 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
458                                              const uint8_t* sensorData,
459                                              size_t sensorDataLength)
460 {
461     uint8_t sensorOffset = 0;
462     uint8_t eventState = 0;
463     uint8_t previousEventState = 0;
464 
465     auto rc =
466         decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
467                                  &eventState, &previousEventState);
468     if (rc)
469     {
470         lg2::error(
471             "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
472             "TID", tid, "RC", rc);
473         return rc;
474     }
475 
476     std::string description;
477     log_level logLevel = log_level::OK;
478 
479     if (stateSensorToMsgMap.contains(sensorId))
480     {
481         description += prefixMsgStrCreation(tid, sensorId);
482         auto componentMap = stateSensorToMsgMap[sensorId];
483         if (sensorOffset < componentMap.size())
484         {
485             description += std::get<0>(componentMap[sensorOffset]);
486             auto stateMap = std::get<1>(componentMap[sensorOffset]);
487             if (stateMap.contains(eventState))
488             {
489                 logLevel = std::get<0>(stateMap[eventState]);
490                 description += " state : " + std::get<1>(stateMap[eventState]);
491                 if (stateMap.contains(previousEventState))
492                 {
493                     description += "; previous state: " +
494                                    std::get<1>(stateMap[previousEventState]);
495                 }
496             }
497             else
498             {
499                 description += " sends unsupported event state: " +
500                                std::to_string(eventState);
501                 if (stateMap.contains(previousEventState))
502                 {
503                     description += "; previous state: " +
504                                    std::get<1>(stateMap[previousEventState]);
505                 }
506             }
507         }
508         else
509         {
510             description += "sends unsupported component sensor offset " +
511                            std::to_string(sensorOffset);
512         }
513     }
514     else
515     {
516         std::stringstream strStream;
517         description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
518         description += prefixMsgStrCreation(tid, sensorId);
519         strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
520                   << std::setw(2) << static_cast<uint32_t>(sensorOffset)
521                   << "eventState 0x" << std::setw(2)
522                   << static_cast<uint32_t>(eventState)
523                   << " previousEventState 0x" << std::setw(2)
524                   << static_cast<uint32_t>(previousEventState) << std::dec;
525         description += strStream.str();
526     }
527 
528     sendJournalRedfish(description, logLevel);
529 
530     return PLDM_SUCCESS;
531 }
532 
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)533 int OemEventManager::processSensorOpStateEvent(
534     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
535     size_t sensorDataLength)
536 {
537     uint8_t present_op_state = 0;
538     uint8_t previous_op_state = 0;
539 
540     auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
541                                     &present_op_state, &previous_op_state);
542     if (rc)
543     {
544         lg2::error(
545             "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
546             "TID", tid, "RC", rc);
547         return rc;
548     }
549 
550     std::string description;
551     std::stringstream strStream;
552     log_level logLevel = log_level::OK;
553 
554     description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
555     description += prefixMsgStrCreation(tid, sensorId);
556     strStream << std::setfill('0') << std::hex << "present_op_state 0x"
557               << std::setw(2) << static_cast<uint32_t>(present_op_state)
558               << "previous_op_state 0x" << std::setw(2)
559               << static_cast<uint32_t>(previous_op_state) << std::dec;
560     description += strStream.str();
561 
562     sendJournalRedfish(description, logLevel);
563 
564     return PLDM_SUCCESS;
565 }
566 
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)567 int OemEventManager::handleSensorEvent(
568     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
569     pldm_tid_t tid, size_t eventDataOffset)
570 {
571     /* This OEM event handler is only used for SoC terminus*/
572     if (!tidToSocketNameMap.contains(tid))
573     {
574         return PLDM_SUCCESS;
575     }
576     auto eventData =
577         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
578     auto eventDataSize = payloadLength - eventDataOffset;
579 
580     uint16_t sensorId = 0;
581     uint8_t sensorEventClassType = 0;
582     size_t eventClassDataOffset = 0;
583     auto rc =
584         decode_sensor_event_data(eventData, eventDataSize, &sensorId,
585                                  &sensorEventClassType, &eventClassDataOffset);
586     if (rc)
587     {
588         lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
589                    rc);
590         return rc;
591     }
592     const uint8_t* sensorData = eventData + eventClassDataOffset;
593     size_t sensorDataLength = eventDataSize - eventClassDataOffset;
594 
595     switch (sensorEventClassType)
596     {
597         case PLDM_NUMERIC_SENSOR_STATE:
598         {
599             return processNumericSensorEvent(tid, sensorId, sensorData,
600                                              sensorDataLength);
601         }
602         case PLDM_STATE_SENSOR_STATE:
603         {
604             return processStateSensorEvent(tid, sensorId, sensorData,
605                                            sensorDataLength);
606         }
607         case PLDM_SENSOR_OP_STATE:
608         {
609             return processSensorOpStateEvent(tid, sensorId, sensorData,
610                                              sensorDataLength);
611         }
612         default:
613             std::string description;
614             std::stringstream strStream;
615             log_level logLevel = log_level::OK;
616 
617             description += "SENSOR_EVENT : Unsupported Sensor Class " +
618                            std::to_string(sensorEventClassType) + ": ";
619             description += prefixMsgStrCreation(tid, sensorId);
620             strStream << std::setfill('0') << std::hex
621                       << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
622 
623             auto dataPtr = sensorData;
624             for ([[maybe_unused]] const auto& i :
625                  std::views::iota(0, (int)sensorDataLength))
626             {
627                 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
628                 dataPtr += sizeof(sensorData);
629             }
630 
631             description += strStream.str();
632 
633             sendJournalRedfish(description, logLevel);
634     }
635     lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
636               sensorEventClassType);
637     return PLDM_ERROR;
638 }
639 
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)640 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
641                                              uint32_t presentReading)
642 {
643     std::string description;
644     std::stringstream strStream;
645     PCIeHotPlugEventRecord_t record{presentReading};
646 
647     std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
648     std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
649     log_level logLevel =
650         (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
651 
652     description += prefixMsgStrCreation(tid, sensorId);
653 
654     strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
655               << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
656               << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
657               << "); Device (0x" << std::setw(2)
658               << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
659               << std::setw(2) << static_cast<uint32_t>(record.bits.function)
660               << "); Action (" << sAction << "); Operation status ("
661               << sOpStatus << "); Media slot number (" << std::dec
662               << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
663 
664     description += strStream.str();
665 
666     // Log to Redfish event
667     sendJournalRedfish(description, logLevel);
668 }
669 
dimmTrainingFailureToMsg(uint32_t failureInfo)670 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
671 {
672     std::string description;
673     DIMMTrainingFailure_t failure{failureInfo};
674 
675     if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
676     {
677         auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
678 
679         description += std::get<0>(failureInfoMap);
680 
681         description += "; MCU rank index " +
682                        std::to_string(failure.bits.mcuRankIdx);
683 
684         description += "; Slice number " +
685                        std::to_string(failure.bits.sliceNum);
686 
687         description += "; Upper nibble error status: ";
688         description += (!failure.bits.upperNibbStatErr)
689                            ? "No error"
690                            : "Found no rising edge";
691 
692         description += "; Lower nibble error status: ";
693         description += (!failure.bits.lowerNibbStatErr)
694                            ? "No error"
695                            : "Found no rising edge";
696 
697         description += "; Failure syndrome 0: ";
698 
699         auto& syndromeMap = std::get<1>(failureInfoMap);
700         if (syndromeMap.contains(failure.bits.syndrome))
701         {
702             description += syndromeMap[failure.bits.syndrome];
703         }
704         else
705         {
706             description += "(Unknown syndrome)";
707         }
708     }
709     else
710     {
711         description += "Unknown training failure type " +
712                        std::to_string(failure.bits.type);
713     }
714 
715     return description;
716 }
717 
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)718 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
719                                             uint32_t presentReading)
720 {
721     log_level logLevel{log_level::WARNING};
722     std::string description;
723     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
724     uint32_t byte012 = presentReading & 0xffffff;
725 
726     description += prefixMsgStrCreation(tid, sensorId);
727 
728     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
729     auto dimmIdx = sensorIdToDIMMIdx(sensorId);
730     if (dimmIdx >= maxDIMMIdxBitNum)
731     {
732         return;
733     }
734 
735     description += "DIMM " + std::to_string(dimmIdx) + " ";
736 
737     if (dimmStatusToMsgMap.contains(byte3))
738     {
739         if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
740             byte3 == dimm_status::INSTALLED_BUT_DISABLED)
741         {
742             logLevel = log_level::OK;
743         }
744 
745         description += dimmStatusToMsgMap[byte3];
746 
747         if (byte3 == dimm_status::TRAINING_FAILURE)
748         {
749             description += "; " + dimmTrainingFailureToMsg(byte012);
750         }
751         else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
752         {
753             uint8_t byte0 = (byte012 & 0xff);
754             if (byte0 < pmicTempAlertMsg.size())
755             {
756                 description += ": " + pmicTempAlertMsg[byte0];
757             }
758         }
759     }
760     else
761     {
762         switch (byte3)
763         {
764             case dimm_status::PMIC_HIGH_TEMP:
765                 if (byte012 == 0x01)
766                 {
767                     description += "has PMIC high temp condition";
768                 }
769                 break;
770             case dimm_status::TSx_HIGH_TEMP:
771                 switch (byte012)
772                 {
773                     case 0x01:
774                         description += "has TS0";
775                         break;
776                     case 0x02:
777                         description += "has TS1";
778                         break;
779                     case 0x03:
780                         description += "has TS0 and TS1";
781                         break;
782                 }
783                 description += " exceeding their high temperature threshold";
784                 break;
785             case dimm_status::SPD_HUB_HIGH_TEMP:
786                 if (byte012 == 0x01)
787                 {
788                     description += "has SPD/HUB high temp condition";
789                 }
790                 break;
791             default:
792                 description += "has unsupported status " +
793                                std::to_string(byte3);
794                 break;
795         }
796     }
797 
798     // Log to Redfish event
799     sendJournalRedfish(description, logLevel);
800 }
801 
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)802 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
803                                            uint32_t presentReading)
804 {
805     log_level logLevel{log_level::WARNING};
806     std::string description;
807     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
808     uint32_t byte012 = presentReading & 0xffffff;
809 
810     description += prefixMsgStrCreation(tid, sensorId);
811 
812     description += "DDR ";
813     if (ddrStatusToMsgMap.contains(byte3))
814     {
815         if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
816         {
817             logLevel = log_level::OK;
818         }
819 
820         description += ddrStatusToMsgMap[byte3];
821 
822         if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
823             byte3 == ddr_status::TRAINING_FAILURE)
824         {
825             // List out failed DIMMs
826             description += dimmIdxsToString(byte012);
827         }
828     }
829     else
830     {
831         description += "has unsupported status " + std::to_string(byte3);
832     }
833 
834     // Log to Redfish event
835     sendJournalRedfish(description, logLevel);
836 }
837 
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)838 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
839                                            uint32_t presentReading)
840 {
841     log_level logLevel{log_level::WARNING};
842     std::string description;
843     std::stringstream strStream;
844 
845     description += prefixMsgStrCreation(tid, sensorId);
846 
847     VRDStatus_t status{presentReading};
848 
849     if (status.bits.warning && status.bits.critical)
850     {
851         description += "A VR warning and a VR critical";
852         logLevel = log_level::CRITICAL;
853     }
854     else
855     {
856         if (status.bits.warning)
857         {
858             description += "A VR warning";
859         }
860         else if (status.bits.critical)
861         {
862             description += "A VR critical";
863             logLevel = log_level::CRITICAL;
864         }
865         else
866         {
867             description += "No VR warning or critical";
868             logLevel = log_level::OK;
869         }
870     }
871     description += " condition observed";
872 
873     strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
874               << std::setw(2)
875               << static_cast<uint32_t>(status.bits.vr_status_byte_high)
876               << "; VR status byte low is 0x" << std::setw(2)
877               << static_cast<uint32_t>(status.bits.vr_status_byte_low)
878               << "; Reading is 0x" << std::setw(2)
879               << static_cast<uint32_t>(presentReading) << ";";
880 
881     description += strStream.str();
882 
883     // Log to Redfish event
884     sendJournalRedfish(description, logLevel);
885 }
886 
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)887 void OemEventManager::handleNumericWatchdogEvent(
888     pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
889 {
890     std::string description;
891     log_level logLevel = log_level::CRITICAL;
892 
893     description += prefixMsgStrCreation(tid, sensorId);
894 
895     if (presentReading & 0x01)
896     {
897         description += "Global watchdog expired;";
898     }
899     if (presentReading & 0x02)
900     {
901         description += "Secure watchdog expired;";
902     }
903     if (presentReading & 0x04)
904     {
905         description += "Non-secure watchdog expired;";
906     }
907 
908     // Log to Redfish event
909     sendJournalRedfish(description, logLevel);
910 }
911 
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)912 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
913                                             const uint8_t* eventData,
914                                             size_t eventDataSize)
915 {
916     EFI_AMPERE_ERROR_DATA ampHdr;
917 
918     decodeCperRecord(eventData, eventDataSize, &ampHdr);
919 
920     addCperSELLog(tid, eventId, &ampHdr);
921 
922     /* isBert at bit 12 of TypeId */
923     if (ampHdr.TypeId & 0x0800)
924     {
925         lg2::info("Ampere SoC BERT is triggered.");
926         std::variant<std::string> value(
927             "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
928         try
929         {
930             auto& bus = pldm::utils::DBusHandler::getBus();
931             auto method =
932                 bus.new_method_call("com.ampere.CrashCapture.Trigger",
933                                     "/com/ampere/crashcapture/trigger",
934                                     pldm::utils::dbusProperties, "Set");
935             method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
936                           value);
937             bus.call_noreply(method);
938         }
939         catch (const std::exception& e)
940         {
941             lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
942         }
943     }
944 
945     return PLDM_SUCCESS;
946 }
947 
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)948 int OemEventManager::handlepldmMessagePollEvent(
949     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
950     pldm_tid_t tid, size_t eventDataOffset)
951 {
952     /* This OEM event handler is only used for SoC terminus*/
953     if (!tidToSocketNameMap.contains(tid))
954     {
955         return PLDM_SUCCESS;
956     }
957 
958     auto eventData =
959         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
960     auto eventDataSize = payloadLength - eventDataOffset;
961 
962     pldm_message_poll_event poll_event{};
963     auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
964                                                   &poll_event);
965     if (rc)
966     {
967         lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
968                    "RC", rc);
969         return rc;
970     }
971 
972     auto sensorID = poll_event.event_id;
973     /* The UE errors */
974     if (rasUESensorIDs.contains(sensorID))
975     {
976         pldm::utils::DBusMapping dbusMapping{
977             "/xyz/openbmc_project/led/groups/ras_ue_fault",
978             "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
979         try
980         {
981             pldm::utils::DBusHandler().setDbusProperty(
982                 dbusMapping, pldm::utils::PropertyValue{bool(true)});
983         }
984         catch (const std::exception& e)
985         {
986             lg2::error(
987                 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
988                 "TID", tid, "SENSORID", sensorID, "ERROR", e);
989         }
990     }
991 
992     return PLDM_SUCCESS;
993 }
994 
995 } // namespace oem_ampere
996 } // namespace pldm
997