xref: /openbmc/pldm/oem/ampere/event/oem_event_manager.cpp (revision 218f49920093e9a60a60ac9b2d84396844fdb9d3)
1 #include "oem_event_manager.hpp"
2 
3 #include "libcper/Cper.h"
4 
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8 
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13 
14 #include <phosphor-logging/lg2.hpp>
15 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
16 
17 #include <algorithm>
18 #include <map>
19 #include <set>
20 #include <sstream>
21 #include <string>
22 #include <unordered_map>
23 
24 namespace pldm
25 {
26 namespace oem_ampere
27 {
28 namespace fs = std::filesystem;
29 using namespace std::chrono;
30 
31 namespace boot_stage = boot::stage;
32 namespace ddr_status = ddr::status;
33 namespace dimm_status = dimm::status;
34 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
35 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
36 namespace training_failure = dimm::training_failure;
37 
/* Redfish message registry IDs; logLevelToRedfishMsgIdMap picks one of them
 * for each log level. */
constexpr const char* ampereEventRegistry{"OpenBMC.0.1.AmpereEvent"};
constexpr const char* ampereWarningRegistry{"OpenBMC.0.1.AmpereWarning"};
constexpr const char* ampereCriticalRegistry{"OpenBMC.0.1.AmpereCritical"};
constexpr const char* BIOSFWPanicRegistry{
    "OpenBMC.0.1.BIOSFirmwarePanicReason"};

/* Number of low-order bits of a DIMM bitmask scanned by dimmIdxsToString(). */
constexpr int maxDIMMIdxBitNum{24};

/* Number of DIMM indexes accepted by sensorIdToDIMMIdx(). That function also
 * returns this value to flag a sensor ID that is not a DIMM status sensor. */
constexpr int maxDIMMInstantNum{24};
45 
46 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47 
/*
    Status suffixes appended to a boot stage name.
    handleBootOverallEvent() indexes this array with byte 0 of the boot code.
*/
std::array<std::string, 3> bootStatMsg{
    " booting",
    " completed",
    " failed",
};

/*
    Status suffixes used for the DDR training boot stage.
    handleBootOverallEvent() indexes this array with byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started",
    " in-progress",
    " progress completed",
};

/*
    PMIC temperature alert levels.
    handleDIMMStatusEvent() indexes this array with byte 0 of the DIMM status
    reading when the status is a PMIC temperature alert.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C",
    "85°C",
    "95°C",
    "105°C",
    "115°C",
    "125°C",
    "135°C",
    "Equal or greater than 140°C",
};
67 
68 /*
69     In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
70     EPs through SMBus and PCIe. When host boots up, SMBUS interface
71     comes up first. In this interface, BMC is bus owner.
72 
73     mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
74     pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
75 */
76 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77 
78 /*
79     A map between sensor IDs and their names in string.
80     Using pldm::oem::sensor_ids
81 */
82 EventToMsgMap_t sensorIdToStrMap = {
83     {DDR_STATUS, "DDR_STATUS"},
84     {PCP_VR_STATE, "PCP_VR_STATE"},
85     {SOC_VR_STATE, "SOC_VR_STATE"},
86     {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
87     {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
88     {D2D_VR_STATE, "D2D_VR_STATE"},
89     {IOC_VR1_STATE, "IOC_VR1_STATE"},
90     {IOC_VR2_STATE, "IOC_VR2_STATE"},
91     {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
92     {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
93     {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
94     {BOOT_OVERALL, "BOOT_OVERALL"},
95     {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
96     {WATCH_DOG, "WATCH_DOG"}};
97 
98 /*
99     A map between the boot stages and logging strings.
100     Using pldm::oem::boot::stage::boot_stage
101 */
102 EventToMsgMap_t bootStageToMsgMap = {
103     {boot_stage::SECPRO, "SECpro"},
104     {boot_stage::MPRO, "Mpro"},
105     {boot_stage::ATF_BL1, "ATF BL1"},
106     {boot_stage::ATF_BL2, "ATF BL2"},
107     {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
108     {boot_stage::DDR_TRAINING, "DDR training"},
109     {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
110     {boot_stage::ATF_BL31, "ATF BL31"},
111     {boot_stage::ATF_BL32, "ATF BL32"},
112     {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
113     {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
114      "ATF BL33 (UEFI) booting status = "}};
115 
116 /*
117     A map between DDR status and logging strings.
118     Using pldm::oem::ddr::status::ddr_status
119 */
120 EventToMsgMap_t ddrStatusToMsgMap = {
121     {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
122     {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
123     {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
124     {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
125     {ddr_status::OTHER_FAILURE, "has other failure"},
126     {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
127      "has boot failure due to no configuration"},
128     {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
129      "failsafe activated but boot success with the next valid configuration"}};
130 
131 /*
132     A map between DIMM status and logging strings.
133     Using pldm::oem::dimm::status::dimm_status
134 */
135 EventToMsgMap_t dimmStatusToMsgMap = {
136     {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
137     {dimm_status::NOT_INSTALLED, "is not installed"},
138     {dimm_status::OTHER_FAILURE, "has other failure"},
139     {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
140     {dimm_status::TRAINING_FAILURE, "has training failure; "},
141     {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142 
143 /*
144     A map between PHY training failure syndrome and logging strings.
145     Using
146    pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
147 */
148 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
149     {phy_syndrome::NA, "(N/A)"},
150     {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
151     {phy_syndrome::CA_LEVELING, "(CA leveling)"},
152     {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
153      "(PHY write level failure - see syndrome 1)"},
154     {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
155      "(PHY read gate leveling failure)"},
156     {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
157     {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
158     {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159 
160 /*
161     A map between DIMM training failure syndrome and logging strings.
162     Using
163    pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
164 */
165 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
166     {dimm_syndrome::NA, "(N/A)"},
167     {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
168      "(DRAM VREFDQ training failure)"},
169     {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
170     {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
171      "(LRDRIMM DB SW training failure)"}};
172 
173 /*
174     A map between DIMM training failure type and a pair of <logging strings -
175    syndrome map>. Using
176    pldm::oem::dimm::training_faillure::dimm_training_failure_type
177 */
178 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
179     dimmTrainingFailureTypeMap = {
180         {training_failure::PHY_TRAINING_FAILURE_TYPE,
181          std::make_pair("PHY training failure",
182                         phyTrainingFailureSyndromeToMsgMap)},
183         {training_failure::DIMM_TRAINING_FAILURE_TYPE,
184          std::make_pair("DIMM training failure",
185                         dimmTrainingFailureSyndromeToMsgMap)}};
186 
187 /*
188     A map between log level and the registry used for Redfish SEL log
189     Using pldm::oem::log_level
190 */
191 std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
192     {log_level::OK, ampereEventRegistry},
193     {log_level::WARNING, ampereWarningRegistry},
194     {log_level::CRITICAL, ampereCriticalRegistry},
195     {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
196 
197 std::unordered_map<
198     uint16_t,
199     std::vector<std::pair<
200         std::string,
201         std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
202     stateSensorToMsgMap = {
203         {SOC_HEALTH_AVAILABILITY,
204          {{"SoC Health",
205            {{1, {log_level::OK, "Normal"}},
206             {2, {log_level::WARNING, "Non-Critical"}},
207             {3, {log_level::CRITICAL, "Critical"}},
208             {4, {log_level::CRITICAL, "Fatal"}}}},
209           {"SoC Availability",
210            {{1, {log_level::OK, "Enabled"}},
211             {2, {log_level::WARNING, "Disabled"}},
212             {3, {log_level::CRITICAL, "Shutdown"}}}}}},
213         {WATCH_DOG,
214          {{"Global Watch Dog",
215            {{1, {log_level::OK, "Normal"}},
216             {2, {log_level::CRITICAL, "Timer Expired"}}}},
217           {"Secure Watch Dog",
218            {{1, {log_level::OK, "Normal"}},
219             {2, {log_level::CRITICAL, "Timer Expired"}}}},
220           {"Non-secure Watch Dog",
221            {{1, {log_level::OK, "Normal"}},
222             {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
223 
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)224 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
225                                                   uint16_t sensorId)
226 {
227     std::string description;
228     if (!tidToSocketNameMap.contains(tid))
229     {
230         description += "TID " + std::to_string(tid) + ": ";
231     }
232     else
233     {
234         description += tidToSocketNameMap[tid] + ": ";
235     }
236 
237     if (!sensorIdToStrMap.contains(sensorId))
238     {
239         description += "Sensor ID " + std::to_string(sensorId) + ": ";
240     }
241     else
242     {
243         description += sensorIdToStrMap[sensorId] + ": ";
244     }
245 
246     return description;
247 }
248 
sendJournalRedfish(const std::string & description,log_level & logLevel)249 void OemEventManager::sendJournalRedfish(const std::string& description,
250                                          log_level& logLevel)
251 {
252     if (description.empty())
253     {
254         return;
255     }
256 
257     if (!logLevelToRedfishMsgIdMap.contains(logLevel))
258     {
259         lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
260                    "DES", description);
261         return;
262     }
263     auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
264     lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265               redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
266 }
267 
dimmIdxsToString(uint32_t dimmIdxs)268 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
269 {
270     std::string description;
271     for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
272     {
273         if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
274         {
275             description += " #" + std::to_string(bitIdx);
276         }
277     }
278     return description;
279 }
280 
sensorIdToDIMMIdx(const uint16_t & sensorId)281 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
282 {
283     uint8_t dimmIdx = maxDIMMInstantNum;
284     int sensorId_Off = sensorId - 4;
285     if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
286         ((sensorId_Off / 2) < maxDIMMInstantNum))
287     {
288         dimmIdx = sensorId_Off / 2;
289     }
290     return dimmIdx;
291 }
292 
handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)293 void OemEventManager::handleBootOverallEvent(
294     pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
295 {
296     log_level logLevel{log_level::OK};
297     std::string description;
298     std::stringstream strStream;
299 
300     uint8_t byte0 = (presentReading & 0x000000ff);
301     uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
302     uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
303     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
304     /*
305      * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
306      * ATF BL32 and DDR initialization
307      */
308     if (bootStageToMsgMap.contains(byte3))
309     {
310         // Boot stage adding
311         description += bootStageToMsgMap[byte3];
312 
313         switch (byte3)
314         {
315             case boot_stage::DDR_TRAINING:
316                 if (byte0 >= ddrTrainingMsg.size())
317                 {
318                     logLevel = log_level::BIOSFWPANIC;
319                     description += " unknown status";
320                 }
321                 else
322                 {
323                     description += ddrTrainingMsg[byte0];
324                 }
325                 if (0x01 == byte0)
326                 {
327                     // Add complete percentage
328                     description += " at " + std::to_string(byte1) + "%";
329                 }
330                 break;
331             case boot_stage::S0_DDR_TRAINING_FAILURE:
332             case boot_stage::S1_DDR_TRAINING_FAILURE:
333                 // ddr_training_status_msg()
334                 logLevel = log_level::BIOSFWPANIC;
335                 description += " at DIMMs:";
336                 // dimmIdxs = presentReading & 0x00ffffff;
337                 description += dimmIdxsToString(presentReading & 0x00ffffff);
338                 description += " of socket ";
339                 description +=
340                     (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
341                 break;
342             default:
343                 if (byte0 >= bootStatMsg.size())
344                 {
345                     logLevel = log_level::BIOSFWPANIC;
346                     description += " unknown status";
347                 }
348                 else
349                 {
350                     description += bootStatMsg[byte0];
351                 }
352                 break;
353         }
354 
355         // Sensor report action is fail
356         if (boot::status::BOOT_STATUS_FAILURE == byte2)
357         {
358             logLevel = log_level::BIOSFWPANIC;
359         }
360     }
361     else
362     {
363         if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
364         {
365             description +=
366                 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
367 
368             strStream
369                 << "Segment (0x" << std::setfill('0') << std::hex
370                 << std::setw(8) << static_cast<uint32_t>(presentReading)
371                 << "); Status Class (0x" << std::setw(2)
372                 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
373                 << std::setw(2) << static_cast<uint32_t>(byte2)
374                 << "); Operation Code (0x" << std::setw(4)
375                 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
376                 << ")" << std::dec;
377 
378             description += strStream.str();
379         }
380     }
381 
382     // Log to Redfish event
383     sendJournalRedfish(description, logLevel);
384 }
385 
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)386 int OemEventManager::processNumericSensorEvent(
387     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
388     size_t sensorDataLength)
389 {
390     uint8_t eventState = 0;
391     uint8_t previousEventState = 0;
392     uint8_t sensorDataSize = 0;
393     uint32_t presentReading;
394     auto rc = decode_numeric_sensor_data(
395         sensorData, sensorDataLength, &eventState, &previousEventState,
396         &sensorDataSize, &presentReading);
397     if (rc)
398     {
399         lg2::error(
400             "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
401             "TID", tid, "RC", rc);
402         return rc;
403     }
404 
405     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
406     if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
407     {
408         handleDIMMStatusEvent(tid, sensorId, presentReading);
409         return PLDM_SUCCESS;
410     }
411 
412     switch (sensorId)
413     {
414         case BOOT_OVERALL:
415             handleBootOverallEvent(tid, sensorId, presentReading);
416             break;
417         case PCIE_HOT_PLUG:
418             handlePCIeHotPlugEvent(tid, sensorId, presentReading);
419             break;
420         case DDR_STATUS:
421             handleDDRStatusEvent(tid, sensorId, presentReading);
422             break;
423         case PCP_VR_STATE:
424         case SOC_VR_STATE:
425         case DPHY_VR1_STATE:
426         case DPHY_VR2_STATE:
427         case D2D_VR_STATE:
428         case IOC_VR1_STATE:
429         case IOC_VR2_STATE:
430         case PCI_D_VR_STATE:
431         case PCI_A_VR_STATE:
432             handleVRDStatusEvent(tid, sensorId, presentReading);
433             break;
434         case WATCH_DOG:
435             handleNumericWatchdogEvent(tid, sensorId, presentReading);
436             break;
437         default:
438             std::string description;
439             std::stringstream strStream;
440 
441             description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
442             description += prefixMsgStrCreation(tid, sensorId);
443             strStream << std::setfill('0') << std::hex << "eventState 0x"
444                       << std::setw(2) << static_cast<uint32_t>(eventState)
445                       << " previousEventState 0x" << std::setw(2)
446                       << static_cast<uint32_t>(previousEventState)
447                       << " sensorDataSize 0x" << std::setw(2)
448                       << static_cast<uint32_t>(sensorDataSize)
449                       << " presentReading 0x" << std::setw(8)
450                       << static_cast<uint32_t>(presentReading) << std::dec;
451             description += strStream.str();
452             std::cout << description << "\n";
453     }
454     return PLDM_SUCCESS;
455 }
456 
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)457 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
458                                              const uint8_t* sensorData,
459                                              size_t sensorDataLength)
460 {
461     uint8_t sensorOffset = 0;
462     uint8_t eventState = 0;
463     uint8_t previousEventState = 0;
464 
465     auto rc =
466         decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
467                                  &eventState, &previousEventState);
468     if (rc)
469     {
470         lg2::error(
471             "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
472             "TID", tid, "RC", rc);
473         return rc;
474     }
475 
476     std::string description;
477 
478     if (stateSensorToMsgMap.contains(sensorId))
479     {
480         log_level logLevel = log_level::OK;
481 
482         description += prefixMsgStrCreation(tid, sensorId);
483         auto componentMap = stateSensorToMsgMap[sensorId];
484         if (sensorOffset < componentMap.size())
485         {
486             description += std::get<0>(componentMap[sensorOffset]);
487             auto stateMap = std::get<1>(componentMap[sensorOffset]);
488             if (stateMap.contains(eventState))
489             {
490                 logLevel = std::get<0>(stateMap[eventState]);
491                 description += " state : " + std::get<1>(stateMap[eventState]);
492                 if (stateMap.contains(previousEventState))
493                 {
494                     description += "; previous state: " +
495                                    std::get<1>(stateMap[previousEventState]);
496                 }
497             }
498             else
499             {
500                 description += " sends unsupported event state: " +
501                                std::to_string(eventState);
502                 if (stateMap.contains(previousEventState))
503                 {
504                     description += "; previous state: " +
505                                    std::get<1>(stateMap[previousEventState]);
506                 }
507             }
508         }
509         else
510         {
511             description += "sends unsupported component sensor offset " +
512                            std::to_string(sensorOffset);
513         }
514 
515         sendJournalRedfish(description, logLevel);
516     }
517     else
518     {
519         std::stringstream strStream;
520         description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
521         description += prefixMsgStrCreation(tid, sensorId);
522         strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
523                   << std::setw(2) << static_cast<uint32_t>(sensorOffset)
524                   << "eventState 0x" << std::setw(2)
525                   << static_cast<uint32_t>(eventState)
526                   << " previousEventState 0x" << std::setw(2)
527                   << static_cast<uint32_t>(previousEventState) << std::dec;
528         description += strStream.str();
529         std::cout << description << "\n";
530     }
531 
532     return PLDM_SUCCESS;
533 }
534 
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)535 int OemEventManager::processSensorOpStateEvent(
536     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
537     size_t sensorDataLength)
538 {
539     uint8_t present_op_state = 0;
540     uint8_t previous_op_state = 0;
541 
542     auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
543                                     &present_op_state, &previous_op_state);
544     if (rc)
545     {
546         lg2::error(
547             "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
548             "TID", tid, "RC", rc);
549         return rc;
550     }
551 
552     std::string description;
553     std::stringstream strStream;
554 
555     description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
556     description += prefixMsgStrCreation(tid, sensorId);
557     strStream << std::setfill('0') << std::hex << "present_op_state 0x"
558               << std::setw(2) << static_cast<uint32_t>(present_op_state)
559               << "previous_op_state 0x" << std::setw(2)
560               << static_cast<uint32_t>(previous_op_state) << std::dec;
561     description += strStream.str();
562     std::cout << description << "\n";
563 
564     return PLDM_SUCCESS;
565 }
566 
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)567 int OemEventManager::handleSensorEvent(
568     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
569     pldm_tid_t tid, size_t eventDataOffset)
570 {
571     /* This OEM event handler is only used for SoC terminus*/
572     if (!tidToSocketNameMap.contains(tid))
573     {
574         return PLDM_SUCCESS;
575     }
576     auto eventData =
577         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
578     auto eventDataSize = payloadLength - eventDataOffset;
579 
580     uint16_t sensorId = 0;
581     uint8_t sensorEventClassType = 0;
582     size_t eventClassDataOffset = 0;
583     auto rc =
584         decode_sensor_event_data(eventData, eventDataSize, &sensorId,
585                                  &sensorEventClassType, &eventClassDataOffset);
586     if (rc)
587     {
588         lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
589                    rc);
590         return rc;
591     }
592     const uint8_t* sensorData = eventData + eventClassDataOffset;
593     size_t sensorDataLength = eventDataSize - eventClassDataOffset;
594 
595     switch (sensorEventClassType)
596     {
597         case PLDM_NUMERIC_SENSOR_STATE:
598         {
599             return processNumericSensorEvent(tid, sensorId, sensorData,
600                                              sensorDataLength);
601         }
602         case PLDM_STATE_SENSOR_STATE:
603         {
604             return processStateSensorEvent(tid, sensorId, sensorData,
605                                            sensorDataLength);
606         }
607         case PLDM_SENSOR_OP_STATE:
608         {
609             return processSensorOpStateEvent(tid, sensorId, sensorData,
610                                              sensorDataLength);
611         }
612         default:
613             std::string description;
614             std::stringstream strStream;
615 
616             description += "SENSOR_EVENT : Unsupported Sensor Class " +
617                            std::to_string(sensorEventClassType) + ": ";
618             description += prefixMsgStrCreation(tid, sensorId);
619             strStream << std::setfill('0') << std::hex
620                       << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
621 
622             auto dataPtr = sensorData;
623             for ([[maybe_unused]] const auto& i :
624                  std::views::iota(0, (int)sensorDataLength))
625             {
626                 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
627                 dataPtr += sizeof(sensorData);
628             }
629 
630             description += strStream.str();
631             std::cout << description << "\n";
632     }
633 
634     return PLDM_ERROR;
635 }
636 
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)637 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
638                                              uint32_t presentReading)
639 {
640     std::string description;
641     std::stringstream strStream;
642     PCIeHotPlugEventRecord_t record{presentReading};
643 
644     std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
645     std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
646     log_level logLevel =
647         (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
648 
649     description += prefixMsgStrCreation(tid, sensorId);
650 
651     strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
652               << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
653               << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
654               << "); Device (0x" << std::setw(2)
655               << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
656               << std::setw(2) << static_cast<uint32_t>(record.bits.function)
657               << "); Action (" << sAction << "); Operation status ("
658               << sOpStatus << "); Media slot number (" << std::dec
659               << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
660 
661     description += strStream.str();
662 
663     // Log to Redfish event
664     sendJournalRedfish(description, logLevel);
665 }
666 
dimmTrainingFailureToMsg(uint32_t failureInfo)667 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
668 {
669     std::string description;
670     DIMMTrainingFailure_t failure{failureInfo};
671 
672     if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
673     {
674         auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
675 
676         description += std::get<0>(failureInfoMap);
677 
678         description += "; MCU rank index " +
679                        std::to_string(failure.bits.mcuRankIdx);
680 
681         description += "; Slice number " +
682                        std::to_string(failure.bits.sliceNum);
683 
684         description += "; Upper nibble error status: ";
685         description += (!failure.bits.upperNibbStatErr)
686                            ? "No error"
687                            : "Found no rising edge";
688 
689         description += "; Lower nibble error status: ";
690         description += (!failure.bits.lowerNibbStatErr)
691                            ? "No error"
692                            : "Found no rising edge";
693 
694         description += "; Failure syndrome 0: ";
695 
696         auto& syndromeMap = std::get<1>(failureInfoMap);
697         if (syndromeMap.contains(failure.bits.syndrome))
698         {
699             description += syndromeMap[failure.bits.syndrome];
700         }
701         else
702         {
703             description += "(Unknown syndrome)";
704         }
705     }
706     else
707     {
708         description += "Unknown training failure type " +
709                        std::to_string(failure.bits.type);
710     }
711 
712     return description;
713 }
714 
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)715 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
716                                             uint32_t presentReading)
717 {
718     log_level logLevel{log_level::WARNING};
719     std::string description;
720     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
721     uint32_t byte012 = presentReading & 0xffffff;
722 
723     description += prefixMsgStrCreation(tid, sensorId);
724 
725     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
726     auto dimmIdx = sensorIdToDIMMIdx(sensorId);
727     if (dimmIdx >= maxDIMMIdxBitNum)
728     {
729         return;
730     }
731 
732     description += "DIMM " + std::to_string(dimmIdx) + " ";
733 
734     if (dimmStatusToMsgMap.contains(byte3))
735     {
736         if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
737             byte3 == dimm_status::INSTALLED_BUT_DISABLED)
738         {
739             logLevel = log_level::OK;
740         }
741 
742         description += dimmStatusToMsgMap[byte3];
743 
744         if (byte3 == dimm_status::TRAINING_FAILURE)
745         {
746             description += "; " + dimmTrainingFailureToMsg(byte012);
747         }
748         else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
749         {
750             uint8_t byte0 = (byte012 & 0xff);
751             if (byte0 < pmicTempAlertMsg.size())
752             {
753                 description += ": " + pmicTempAlertMsg[byte0];
754             }
755         }
756     }
757     else
758     {
759         switch (byte3)
760         {
761             case dimm_status::PMIC_HIGH_TEMP:
762                 if (byte012 == 0x01)
763                 {
764                     description += "has PMIC high temp condition";
765                 }
766                 break;
767             case dimm_status::TSx_HIGH_TEMP:
768                 switch (byte012)
769                 {
770                     case 0x01:
771                         description += "has TS0";
772                         break;
773                     case 0x02:
774                         description += "has TS1";
775                         break;
776                     case 0x03:
777                         description += "has TS0 and TS1";
778                         break;
779                 }
780                 description += " exceeding their high temperature threshold";
781                 break;
782             case dimm_status::SPD_HUB_HIGH_TEMP:
783                 if (byte012 == 0x01)
784                 {
785                     description += "has SPD/HUB high temp condition";
786                 }
787                 break;
788             default:
789                 description += "has unsupported status " +
790                                std::to_string(byte3);
791                 break;
792         }
793     }
794 
795     // Log to Redfish event
796     sendJournalRedfish(description, logLevel);
797 }
798 
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)799 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
800                                            uint32_t presentReading)
801 {
802     log_level logLevel{log_level::WARNING};
803     std::string description;
804     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
805     uint32_t byte012 = presentReading & 0xffffff;
806 
807     description += prefixMsgStrCreation(tid, sensorId);
808 
809     description += "DDR ";
810     if (ddrStatusToMsgMap.contains(byte3))
811     {
812         if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
813         {
814             logLevel = log_level::OK;
815         }
816 
817         description += ddrStatusToMsgMap[byte3];
818 
819         if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
820             byte3 == ddr_status::TRAINING_FAILURE)
821         {
822             // List out failed DIMMs
823             description += dimmIdxsToString(byte012);
824         }
825     }
826     else
827     {
828         description += "has unsupported status " + std::to_string(byte3);
829     }
830 
831     // Log to Redfish event
832     sendJournalRedfish(description, logLevel);
833 }
834 
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)835 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
836                                            uint32_t presentReading)
837 {
838     log_level logLevel{log_level::WARNING};
839     std::string description;
840     std::stringstream strStream;
841 
842     description += prefixMsgStrCreation(tid, sensorId);
843 
844     VRDStatus_t status{presentReading};
845 
846     if (status.bits.warning && status.bits.critical)
847     {
848         description += "A VR warning and a VR critical";
849         logLevel = log_level::CRITICAL;
850     }
851     else
852     {
853         if (status.bits.warning)
854         {
855             description += "A VR warning";
856         }
857         else if (status.bits.critical)
858         {
859             description += "A VR critical";
860             logLevel = log_level::CRITICAL;
861         }
862         else
863         {
864             description += "No VR warning or critical";
865             logLevel = log_level::OK;
866         }
867     }
868     description += " condition observed";
869 
870     strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
871               << std::setw(2)
872               << static_cast<uint32_t>(status.bits.vr_status_byte_high)
873               << "; VR status byte low is 0x" << std::setw(2)
874               << static_cast<uint32_t>(status.bits.vr_status_byte_low)
875               << "; Reading is 0x" << std::setw(2)
876               << static_cast<uint32_t>(presentReading) << ";";
877 
878     description += strStream.str();
879 
880     // Log to Redfish event
881     sendJournalRedfish(description, logLevel);
882 }
883 
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)884 void OemEventManager::handleNumericWatchdogEvent(
885     pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
886 {
887     std::string description;
888     log_level logLevel = log_level::CRITICAL;
889 
890     description += prefixMsgStrCreation(tid, sensorId);
891 
892     if (presentReading & 0x01)
893     {
894         description += "Global watchdog expired;";
895     }
896     if (presentReading & 0x02)
897     {
898         description += "Secure watchdog expired;";
899     }
900     if (presentReading & 0x04)
901     {
902         description += "Non-secure watchdog expired;";
903     }
904 
905     // Log to Redfish event
906     sendJournalRedfish(description, logLevel);
907 }
908 
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)909 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
910                                             const uint8_t* eventData,
911                                             size_t eventDataSize)
912 {
913     EFI_AMPERE_ERROR_DATA ampHdr;
914 
915     decodeCperRecord(eventData, eventDataSize, &ampHdr);
916 
917     addCperSELLog(tid, eventId, &ampHdr);
918 
919     /* isBert at bit 12 of TypeId */
920     if (ampHdr.TypeId & 0x0800)
921     {
922         lg2::info("Ampere SoC BERT is triggered.");
923         std::variant<std::string> value(
924             "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
925         try
926         {
927             auto& bus = pldm::utils::DBusHandler::getBus();
928             auto method =
929                 bus.new_method_call("com.ampere.CrashCapture.Trigger",
930                                     "/com/ampere/crashcapture/trigger",
931                                     pldm::utils::dbusProperties, "Set");
932             method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
933                           value);
934             bus.call_noreply(method);
935         }
936         catch (const std::exception& e)
937         {
938             lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
939         }
940     }
941 
942     return PLDM_SUCCESS;
943 }
944 
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)945 int OemEventManager::handlepldmMessagePollEvent(
946     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
947     pldm_tid_t tid, size_t eventDataOffset)
948 {
949     /* This OEM event handler is only used for SoC terminus*/
950     if (!tidToSocketNameMap.contains(tid))
951     {
952         return PLDM_SUCCESS;
953     }
954 
955     auto eventData =
956         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
957     auto eventDataSize = payloadLength - eventDataOffset;
958 
959     pldm_message_poll_event poll_event{};
960     auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
961                                                   &poll_event);
962     if (rc)
963     {
964         lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
965                    "RC", rc);
966         return rc;
967     }
968 
969     auto sensorID = poll_event.event_id;
970     /* The UE errors */
971     if (rasUESensorIDs.contains(sensorID))
972     {
973         pldm::utils::DBusMapping dbusMapping{
974             "/xyz/openbmc_project/led/groups/ras_ue_fault",
975             "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
976         try
977         {
978             pldm::utils::DBusHandler().setDbusProperty(
979                 dbusMapping, pldm::utils::PropertyValue{bool(true)});
980         }
981         catch (const std::exception& e)
982         {
983             lg2::error(
984                 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
985                 "TID", tid, "SENSORID", sensorID, "ERROR", e);
986         }
987     }
988 
989     return PLDM_SUCCESS;
990 }
991 
oemPollForPlatformEvent(pldm_tid_t tid)992 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
993 {
994     uint64_t t0 = 0;
995 
996     /* This OEM event handler is only used for SoC terminus */
997     if (!tidToSocketNameMap.contains(tid))
998     {
999         co_return PLDM_SUCCESS;
1000     }
1001 
1002     if (!timeStampMap.contains(tid))
1003     {
1004         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1005         timeStampMap.emplace(std::make_pair(tid, t0));
1006     }
1007     else
1008     {
1009         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1010         uint64_t elapsed = t0 - timeStampMap[tid];
1011         if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1012         {
1013             co_await manager->pollForPlatformEvent(tid, 0, 0);
1014             timeStampMap[tid] = t0;
1015         }
1016     }
1017 
1018     co_return PLDM_SUCCESS;
1019 }
1020 } // namespace oem_ampere
1021 } // namespace pldm
1022