xref: /openbmc/pldm/oem/ampere/event/oem_event_manager.cpp (revision 4a5038370b1513022b41e5d99a319f627c0084c8)
1 #include "oem_event_manager.hpp"
2 
3 #include "libcper/Cper.h"
4 
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8 
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13 
14 #include <phosphor-logging/lg2.hpp>
15 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
16 
17 #include <algorithm>
18 #include <map>
19 #include <set>
20 #include <sstream>
21 #include <string>
22 #include <unordered_map>
23 
24 namespace pldm
25 {
26 namespace oem_ampere
27 {
28 namespace fs = std::filesystem;
29 using namespace std::chrono;
30 
31 namespace boot_stage = boot::stage;
32 namespace ddr_status = ddr::status;
33 namespace dimm_status = dimm::status;
34 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
35 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
36 namespace training_failure = dimm::training_failure;
37 
/*
    Redfish message registry IDs attached to the journal entries, one per
    log level (see logLevelToRedfishMsgIdMap).
*/
constexpr auto ampereEventRegistry = "OpenBMC.0.1.AmpereEvent";
constexpr auto ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning";
constexpr auto ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical";
constexpr auto BIOSFWPanicRegistry = "OpenBMC.0.1.BIOSFirmwarePanicReason";

/* Number of low-order bits scanned when decoding a DIMM index bitmask. */
constexpr int maxDIMMIdxBitNum = 24;
/* Upper bound (exclusive) of the DIMM index derived from a sensor ID. */
constexpr int maxDIMMInstantNum = 24;
45 
46 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47 
/*
    Status suffixes appended to the name of a generic boot stage.
    The array index is byte 0 of the boot code.
*/
std::array<std::string, 3> bootStatMsg{" booting", " completed", " failed"};
53 
/*
    Status suffixes appended to the DDR training boot stage.
    The array index is byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started",
    " in-progress",
    " progress completed",
};
60 
/*
    Logging strings for the PMIC temperature alert levels.
    The array index is byte 0 of a DIMM status reading that reports a PMIC
    temperature alert.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C",
    "85°C",
    "95°C",
    "105°C",
    "115°C",
    "125°C",
    "135°C",
    "Equal or greater than 140°C",
};
67 
68 /*
69     In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
70     EPs through SMBus and PCIe. When host boots up, SMBUS interface
71     comes up first. In this interface, BMC is bus owner.
72 
73     mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
74     pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
75 */
76 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77 
78 /*
79     A map between sensor IDs and their names in string.
80     Using pldm::oem::sensor_ids
81 */
82 EventToMsgMap_t sensorIdToStrMap = {
83     {DDR_STATUS, "DDR_STATUS"},
84     {PCP_VR_STATE, "PCP_VR_STATE"},
85     {SOC_VR_STATE, "SOC_VR_STATE"},
86     {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
87     {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
88     {D2D_VR_STATE, "D2D_VR_STATE"},
89     {IOC_VR1_STATE, "IOC_VR1_STATE"},
90     {IOC_VR2_STATE, "IOC_VR2_STATE"},
91     {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
92     {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
93     {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
94     {BOOT_OVERALL, "BOOT_OVERALL"},
95     {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
96     {WATCH_DOG, "WATCH_DOG"}};
97 
98 /*
99     A map between the boot stages and logging strings.
100     Using pldm::oem::boot::stage::boot_stage
101 */
102 EventToMsgMap_t bootStageToMsgMap = {
103     {boot_stage::SECPRO, "SECpro"},
104     {boot_stage::MPRO, "Mpro"},
105     {boot_stage::ATF_BL1, "ATF BL1"},
106     {boot_stage::ATF_BL2, "ATF BL2"},
107     {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
108     {boot_stage::DDR_TRAINING, "DDR training"},
109     {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
110     {boot_stage::ATF_BL31, "ATF BL31"},
111     {boot_stage::ATF_BL32, "ATF BL32"},
112     {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
113     {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
114      "ATF BL33 (UEFI) booting status = "}};
115 
116 /*
117     A map between DDR status and logging strings.
118     Using pldm::oem::ddr::status::ddr_status
119 */
120 EventToMsgMap_t ddrStatusToMsgMap = {
121     {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
122     {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
123     {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
124     {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
125     {ddr_status::OTHER_FAILURE, "has other failure"},
126     {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
127      "has boot failure due to no configuration"},
128     {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
129      "failsafe activated but boot success with the next valid configuration"}};
130 
131 /*
132     A map between DIMM status and logging strings.
133     Using pldm::oem::dimm::status::dimm_status
134 */
135 EventToMsgMap_t dimmStatusToMsgMap = {
136     {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
137     {dimm_status::NOT_INSTALLED, "is not installed"},
138     {dimm_status::OTHER_FAILURE, "has other failure"},
139     {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
140     {dimm_status::TRAINING_FAILURE, "has training failure; "},
141     {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142 
143 /*
144     A map between PHY training failure syndrome and logging strings.
145     Using
146    pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
147 */
148 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
149     {phy_syndrome::NA, "(N/A)"},
150     {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
151     {phy_syndrome::CA_LEVELING, "(CA leveling)"},
152     {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
153      "(PHY write level failure - see syndrome 1)"},
154     {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
155      "(PHY read gate leveling failure)"},
156     {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
157     {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
158     {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159 
160 /*
161     A map between DIMM training failure syndrome and logging strings.
162     Using
163    pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
164 */
165 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
166     {dimm_syndrome::NA, "(N/A)"},
167     {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
168      "(DRAM VREFDQ training failure)"},
169     {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
170     {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
171      "(LRDRIMM DB SW training failure)"}};
172 
173 /*
174     A map between DIMM training failure type and a pair of <logging strings -
175    syndrome map>. Using
176    pldm::oem::dimm::training_faillure::dimm_training_failure_type
177 */
178 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
179     dimmTrainingFailureTypeMap = {
180         {training_failure::PHY_TRAINING_FAILURE_TYPE,
181          std::make_pair("PHY training failure",
182                         phyTrainingFailureSyndromeToMsgMap)},
183         {training_failure::DIMM_TRAINING_FAILURE_TYPE,
184          std::make_pair("DIMM training failure",
185                         dimmTrainingFailureSyndromeToMsgMap)}};
186 
187 /*
188     A map between log level and the registry used for Redfish SEL log
189     Using pldm::oem::log_level
190 */
191 std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
192     {log_level::OK, ampereEventRegistry},
193     {log_level::WARNING, ampereWarningRegistry},
194     {log_level::CRITICAL, ampereCriticalRegistry},
195     {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
196 
197 std::unordered_map<
198     uint16_t,
199     std::vector<std::pair<
200         std::string,
201         std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
202     stateSensorToMsgMap = {
203         {SOC_HEALTH_AVAILABILITY,
204          {{"SoC Health",
205            {{1, {log_level::OK, "Normal"}},
206             {2, {log_level::WARNING, "Non-Critical"}},
207             {3, {log_level::CRITICAL, "Critical"}},
208             {4, {log_level::CRITICAL, "Fatal"}}}},
209           {"SoC Availability",
210            {{1, {log_level::OK, "Enabled"}},
211             {2, {log_level::WARNING, "Disabled"}},
212             {3, {log_level::CRITICAL, "Shutdown"}}}}}},
213         {WATCH_DOG,
214          {{"Global Watch Dog",
215            {{1, {log_level::OK, "Normal"}},
216             {2, {log_level::CRITICAL, "Timer Expired"}}}},
217           {"Secure Watch Dog",
218            {{1, {log_level::OK, "Normal"}},
219             {2, {log_level::CRITICAL, "Timer Expired"}}}},
220           {"Non-secure Watch Dog",
221            {{1, {log_level::OK, "Normal"}},
222             {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
223 
224 std::string
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)225     OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId)
226 {
227     std::string description;
228     if (!tidToSocketNameMap.contains(tid))
229     {
230         description += "TID " + std::to_string(tid) + ": ";
231     }
232     else
233     {
234         description += tidToSocketNameMap[tid] + ": ";
235     }
236 
237     if (!sensorIdToStrMap.contains(sensorId))
238     {
239         description += "Sensor ID " + std::to_string(sensorId) + ": ";
240     }
241     else
242     {
243         description += sensorIdToStrMap[sensorId] + ": ";
244     }
245 
246     return description;
247 }
248 
sendJournalRedfish(const std::string & description,log_level & logLevel)249 void OemEventManager::sendJournalRedfish(const std::string& description,
250                                          log_level& logLevel)
251 {
252     if (description.empty())
253     {
254         return;
255     }
256 
257     if (!logLevelToRedfishMsgIdMap.contains(logLevel))
258     {
259         lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
260                    "DES", description);
261         return;
262     }
263     auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
264     lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265               redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
266 }
267 
dimmIdxsToString(uint32_t dimmIdxs)268 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
269 {
270     std::string description;
271     for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
272     {
273         if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
274         {
275             description += " #" + std::to_string(bitIdx);
276         }
277     }
278     return description;
279 }
280 
sensorIdToDIMMIdx(const uint16_t & sensorId)281 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
282 {
283     uint8_t dimmIdx = maxDIMMInstantNum;
284     int sensorId_Off = sensorId - 4;
285     if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
286         ((sensorId_Off / 2) < maxDIMMInstantNum))
287     {
288         dimmIdx = sensorId_Off / 2;
289     }
290     return dimmIdx;
291 }
292 
handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)293 void OemEventManager::handleBootOverallEvent(
294     pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
295 {
296     log_level logLevel{log_level::OK};
297     std::string description;
298     std::stringstream strStream;
299 
300     uint8_t byte0 = (presentReading & 0x000000ff);
301     uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
302     uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
303     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
304     /*
305      * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
306      * ATF BL32 and DDR initialization
307      */
308     if (bootStageToMsgMap.contains(byte3))
309     {
310         // Boot stage adding
311         description += bootStageToMsgMap[byte3];
312 
313         switch (byte3)
314         {
315             case boot_stage::DDR_TRAINING:
316                 if (byte0 >= ddrTrainingMsg.size())
317                 {
318                     logLevel = log_level::BIOSFWPANIC;
319                     description += " unknown status";
320                 }
321                 else
322                 {
323                     description += ddrTrainingMsg[byte0];
324                 }
325                 if (0x01 == byte0)
326                 {
327                     // Add complete percentage
328                     description += " at " + std::to_string(byte1) + "%";
329                 }
330                 break;
331             case boot_stage::S0_DDR_TRAINING_FAILURE:
332             case boot_stage::S1_DDR_TRAINING_FAILURE:
333                 // ddr_training_status_msg()
334                 logLevel = log_level::BIOSFWPANIC;
335                 description += " at DIMMs:";
336                 // dimmIdxs = presentReading & 0x00ffffff;
337                 description += dimmIdxsToString(presentReading & 0x00ffffff);
338                 description += " of socket ";
339                 description +=
340                     (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
341                 break;
342             default:
343                 if (byte0 >= bootStatMsg.size())
344                 {
345                     logLevel = log_level::BIOSFWPANIC;
346                     description += " unknown status";
347                 }
348                 else
349                 {
350                     description += bootStatMsg[byte0];
351                 }
352                 break;
353         }
354 
355         // Sensor report action is fail
356         if (boot::status::BOOT_STATUS_FAILURE == byte2)
357         {
358             logLevel = log_level::BIOSFWPANIC;
359         }
360     }
361     else
362     {
363         if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
364         {
365             description +=
366                 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
367 
368             strStream
369                 << "Segment (0x" << std::setfill('0') << std::hex
370                 << std::setw(8) << static_cast<uint32_t>(presentReading)
371                 << "); Status Class (0x" << std::setw(2)
372                 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
373                 << std::setw(2) << static_cast<uint32_t>(byte2)
374                 << "); Operation Code (0x" << std::setw(4)
375                 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
376                 << ")" << std::dec;
377 
378             description += strStream.str();
379         }
380     }
381 
382     // Log to Redfish event
383     sendJournalRedfish(description, logLevel);
384 }
385 
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)386 int OemEventManager::processNumericSensorEvent(
387     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
388     size_t sensorDataLength)
389 {
390     uint8_t eventState = 0;
391     uint8_t previousEventState = 0;
392     uint8_t sensorDataSize = 0;
393     uint32_t presentReading;
394     auto rc = decode_numeric_sensor_data(
395         sensorData, sensorDataLength, &eventState, &previousEventState,
396         &sensorDataSize, &presentReading);
397     if (rc)
398     {
399         lg2::error(
400             "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
401             "TID", tid, "RC", rc);
402         return rc;
403     }
404 
405     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
406     if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
407     {
408         handleDIMMStatusEvent(tid, sensorId, presentReading);
409         return PLDM_SUCCESS;
410     }
411 
412     switch (sensorId)
413     {
414         case BOOT_OVERALL:
415             handleBootOverallEvent(tid, sensorId, presentReading);
416             break;
417         case PCIE_HOT_PLUG:
418             handlePCIeHotPlugEvent(tid, sensorId, presentReading);
419             break;
420         case DDR_STATUS:
421             handleDDRStatusEvent(tid, sensorId, presentReading);
422             break;
423         case PCP_VR_STATE:
424         case SOC_VR_STATE:
425         case DPHY_VR1_STATE:
426         case DPHY_VR2_STATE:
427         case D2D_VR_STATE:
428         case IOC_VR1_STATE:
429         case IOC_VR2_STATE:
430         case PCI_D_VR_STATE:
431         case PCI_A_VR_STATE:
432             handleVRDStatusEvent(tid, sensorId, presentReading);
433             break;
434         case WATCH_DOG:
435             handleNumericWatchdogEvent(tid, sensorId, presentReading);
436             break;
437         default:
438             std::string description;
439             std::stringstream strStream;
440             log_level logLevel = log_level::OK;
441 
442             description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
443             description += prefixMsgStrCreation(tid, sensorId);
444             strStream << std::setfill('0') << std::hex << "eventState 0x"
445                       << std::setw(2) << static_cast<uint32_t>(eventState)
446                       << " previousEventState 0x" << std::setw(2)
447                       << static_cast<uint32_t>(previousEventState)
448                       << " sensorDataSize 0x" << std::setw(2)
449                       << static_cast<uint32_t>(sensorDataSize)
450                       << " presentReading 0x" << std::setw(8)
451                       << static_cast<uint32_t>(presentReading) << std::dec;
452             description += strStream.str();
453 
454             sendJournalRedfish(description, logLevel);
455             break;
456     }
457     return PLDM_SUCCESS;
458 }
459 
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)460 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
461                                              const uint8_t* sensorData,
462                                              size_t sensorDataLength)
463 {
464     uint8_t sensorOffset = 0;
465     uint8_t eventState = 0;
466     uint8_t previousEventState = 0;
467 
468     auto rc =
469         decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
470                                  &eventState, &previousEventState);
471     if (rc)
472     {
473         lg2::error(
474             "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
475             "TID", tid, "RC", rc);
476         return rc;
477     }
478 
479     std::string description;
480     log_level logLevel = log_level::OK;
481 
482     if (stateSensorToMsgMap.contains(sensorId))
483     {
484         description += prefixMsgStrCreation(tid, sensorId);
485         auto componentMap = stateSensorToMsgMap[sensorId];
486         if (sensorOffset < componentMap.size())
487         {
488             description += std::get<0>(componentMap[sensorOffset]);
489             auto stateMap = std::get<1>(componentMap[sensorOffset]);
490             if (stateMap.contains(eventState))
491             {
492                 logLevel = std::get<0>(stateMap[eventState]);
493                 description += " state : " + std::get<1>(stateMap[eventState]);
494                 if (stateMap.contains(previousEventState))
495                 {
496                     description += "; previous state: " +
497                                    std::get<1>(stateMap[previousEventState]);
498                 }
499             }
500             else
501             {
502                 description += " sends unsupported event state: " +
503                                std::to_string(eventState);
504                 if (stateMap.contains(previousEventState))
505                 {
506                     description += "; previous state: " +
507                                    std::get<1>(stateMap[previousEventState]);
508                 }
509             }
510         }
511         else
512         {
513             description += "sends unsupported component sensor offset " +
514                            std::to_string(sensorOffset);
515         }
516     }
517     else
518     {
519         std::stringstream strStream;
520         description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
521         description += prefixMsgStrCreation(tid, sensorId);
522         strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
523                   << std::setw(2) << static_cast<uint32_t>(sensorOffset)
524                   << "eventState 0x" << std::setw(2)
525                   << static_cast<uint32_t>(eventState)
526                   << " previousEventState 0x" << std::setw(2)
527                   << static_cast<uint32_t>(previousEventState) << std::dec;
528         description += strStream.str();
529     }
530 
531     sendJournalRedfish(description, logLevel);
532 
533     return PLDM_SUCCESS;
534 }
535 
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)536 int OemEventManager::processSensorOpStateEvent(
537     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
538     size_t sensorDataLength)
539 {
540     uint8_t present_op_state = 0;
541     uint8_t previous_op_state = 0;
542 
543     auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
544                                     &present_op_state, &previous_op_state);
545     if (rc)
546     {
547         lg2::error(
548             "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
549             "TID", tid, "RC", rc);
550         return rc;
551     }
552 
553     std::string description;
554     std::stringstream strStream;
555     log_level logLevel = log_level::OK;
556 
557     description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
558     description += prefixMsgStrCreation(tid, sensorId);
559     strStream << std::setfill('0') << std::hex << "present_op_state 0x"
560               << std::setw(2) << static_cast<uint32_t>(present_op_state)
561               << "previous_op_state 0x" << std::setw(2)
562               << static_cast<uint32_t>(previous_op_state) << std::dec;
563     description += strStream.str();
564 
565     sendJournalRedfish(description, logLevel);
566 
567     return PLDM_SUCCESS;
568 }
569 
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)570 int OemEventManager::handleSensorEvent(
571     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
572     pldm_tid_t tid, size_t eventDataOffset)
573 {
574     /* This OEM event handler is only used for SoC terminus*/
575     if (!tidToSocketNameMap.contains(tid))
576     {
577         return PLDM_SUCCESS;
578     }
579     auto eventData =
580         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
581     auto eventDataSize = payloadLength - eventDataOffset;
582 
583     uint16_t sensorId = 0;
584     uint8_t sensorEventClassType = 0;
585     size_t eventClassDataOffset = 0;
586     auto rc =
587         decode_sensor_event_data(eventData, eventDataSize, &sensorId,
588                                  &sensorEventClassType, &eventClassDataOffset);
589     if (rc)
590     {
591         lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
592                    rc);
593         return rc;
594     }
595     const uint8_t* sensorData = eventData + eventClassDataOffset;
596     size_t sensorDataLength = eventDataSize - eventClassDataOffset;
597 
598     switch (sensorEventClassType)
599     {
600         case PLDM_NUMERIC_SENSOR_STATE:
601         {
602             return processNumericSensorEvent(tid, sensorId, sensorData,
603                                              sensorDataLength);
604         }
605         case PLDM_STATE_SENSOR_STATE:
606         {
607             return processStateSensorEvent(tid, sensorId, sensorData,
608                                            sensorDataLength);
609         }
610         case PLDM_SENSOR_OP_STATE:
611         {
612             return processSensorOpStateEvent(tid, sensorId, sensorData,
613                                              sensorDataLength);
614         }
615         default:
616             std::string description;
617             std::stringstream strStream;
618             log_level logLevel = log_level::OK;
619 
620             description += "SENSOR_EVENT : Unsupported Sensor Class " +
621                            std::to_string(sensorEventClassType) + ": ";
622             description += prefixMsgStrCreation(tid, sensorId);
623             strStream << std::setfill('0') << std::hex
624                       << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
625 
626             auto dataPtr = sensorData;
627             for ([[maybe_unused]] const auto& i :
628                  std::views::iota(0, (int)sensorDataLength))
629             {
630                 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
631                 dataPtr += sizeof(sensorData);
632             }
633 
634             description += strStream.str();
635 
636             sendJournalRedfish(description, logLevel);
637     }
638     lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
639               sensorEventClassType);
640     return PLDM_ERROR;
641 }
642 
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)643 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
644                                              uint32_t presentReading)
645 {
646     std::string description;
647     std::stringstream strStream;
648     PCIeHotPlugEventRecord_t record{presentReading};
649 
650     std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
651     std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
652     log_level logLevel =
653         (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
654 
655     description += prefixMsgStrCreation(tid, sensorId);
656 
657     strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
658               << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
659               << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
660               << "); Device (0x" << std::setw(2)
661               << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
662               << std::setw(2) << static_cast<uint32_t>(record.bits.function)
663               << "); Action (" << sAction << "); Operation status ("
664               << sOpStatus << "); Media slot number (" << std::dec
665               << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
666 
667     description += strStream.str();
668 
669     // Log to Redfish event
670     sendJournalRedfish(description, logLevel);
671 }
672 
dimmTrainingFailureToMsg(uint32_t failureInfo)673 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
674 {
675     std::string description;
676     DIMMTrainingFailure_t failure{failureInfo};
677 
678     if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
679     {
680         auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
681 
682         description += std::get<0>(failureInfoMap);
683 
684         description += "; MCU rank index " +
685                        std::to_string(failure.bits.mcuRankIdx);
686 
687         description += "; Slice number " +
688                        std::to_string(failure.bits.sliceNum);
689 
690         description += "; Upper nibble error status: ";
691         description += (!failure.bits.upperNibbStatErr)
692                            ? "No error"
693                            : "Found no rising edge";
694 
695         description += "; Lower nibble error status: ";
696         description += (!failure.bits.lowerNibbStatErr)
697                            ? "No error"
698                            : "Found no rising edge";
699 
700         description += "; Failure syndrome 0: ";
701 
702         auto& syndromeMap = std::get<1>(failureInfoMap);
703         if (syndromeMap.contains(failure.bits.syndrome))
704         {
705             description += syndromeMap[failure.bits.syndrome];
706         }
707         else
708         {
709             description += "(Unknown syndrome)";
710         }
711     }
712     else
713     {
714         description += "Unknown training failure type " +
715                        std::to_string(failure.bits.type);
716     }
717 
718     return description;
719 }
720 
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)721 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
722                                             uint32_t presentReading)
723 {
724     log_level logLevel{log_level::WARNING};
725     std::string description;
726     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
727     uint32_t byte012 = presentReading & 0xffffff;
728 
729     description += prefixMsgStrCreation(tid, sensorId);
730 
731     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
732     auto dimmIdx = sensorIdToDIMMIdx(sensorId);
733     if (dimmIdx >= maxDIMMIdxBitNum)
734     {
735         return;
736     }
737 
738     description += "DIMM " + std::to_string(dimmIdx) + " ";
739 
740     if (dimmStatusToMsgMap.contains(byte3))
741     {
742         if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
743             byte3 == dimm_status::INSTALLED_BUT_DISABLED)
744         {
745             logLevel = log_level::OK;
746         }
747 
748         description += dimmStatusToMsgMap[byte3];
749 
750         if (byte3 == dimm_status::TRAINING_FAILURE)
751         {
752             description += "; " + dimmTrainingFailureToMsg(byte012);
753         }
754         else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
755         {
756             uint8_t byte0 = (byte012 & 0xff);
757             if (byte0 < pmicTempAlertMsg.size())
758             {
759                 description += ": " + pmicTempAlertMsg[byte0];
760             }
761         }
762     }
763     else
764     {
765         switch (byte3)
766         {
767             case dimm_status::PMIC_HIGH_TEMP:
768                 if (byte012 == 0x01)
769                 {
770                     description += "has PMIC high temp condition";
771                 }
772                 break;
773             case dimm_status::TSx_HIGH_TEMP:
774                 switch (byte012)
775                 {
776                     case 0x01:
777                         description += "has TS0";
778                         break;
779                     case 0x02:
780                         description += "has TS1";
781                         break;
782                     case 0x03:
783                         description += "has TS0 and TS1";
784                         break;
785                 }
786                 description += " exceeding their high temperature threshold";
787                 break;
788             case dimm_status::SPD_HUB_HIGH_TEMP:
789                 if (byte012 == 0x01)
790                 {
791                     description += "has SPD/HUB high temp condition";
792                 }
793                 break;
794             default:
795                 description += "has unsupported status " +
796                                std::to_string(byte3);
797                 break;
798         }
799     }
800 
801     // Log to Redfish event
802     sendJournalRedfish(description, logLevel);
803 }
804 
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)805 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
806                                            uint32_t presentReading)
807 {
808     log_level logLevel{log_level::WARNING};
809     std::string description;
810     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
811     uint32_t byte012 = presentReading & 0xffffff;
812 
813     description += prefixMsgStrCreation(tid, sensorId);
814 
815     description += "DDR ";
816     if (ddrStatusToMsgMap.contains(byte3))
817     {
818         if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
819         {
820             logLevel = log_level::OK;
821         }
822 
823         description += ddrStatusToMsgMap[byte3];
824 
825         if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
826             byte3 == ddr_status::TRAINING_FAILURE)
827         {
828             // List out failed DIMMs
829             description += dimmIdxsToString(byte012);
830         }
831     }
832     else
833     {
834         description += "has unsupported status " + std::to_string(byte3);
835     }
836 
837     // Log to Redfish event
838     sendJournalRedfish(description, logLevel);
839 }
840 
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)841 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
842                                            uint32_t presentReading)
843 {
844     log_level logLevel{log_level::WARNING};
845     std::string description;
846     std::stringstream strStream;
847 
848     description += prefixMsgStrCreation(tid, sensorId);
849 
850     VRDStatus_t status{presentReading};
851 
852     if (status.bits.warning && status.bits.critical)
853     {
854         description += "A VR warning and a VR critical";
855         logLevel = log_level::CRITICAL;
856     }
857     else
858     {
859         if (status.bits.warning)
860         {
861             description += "A VR warning";
862         }
863         else if (status.bits.critical)
864         {
865             description += "A VR critical";
866             logLevel = log_level::CRITICAL;
867         }
868         else
869         {
870             description += "No VR warning or critical";
871             logLevel = log_level::OK;
872         }
873     }
874     description += " condition observed";
875 
876     strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
877               << std::setw(2)
878               << static_cast<uint32_t>(status.bits.vr_status_byte_high)
879               << "; VR status byte low is 0x" << std::setw(2)
880               << static_cast<uint32_t>(status.bits.vr_status_byte_low)
881               << "; Reading is 0x" << std::setw(2)
882               << static_cast<uint32_t>(presentReading) << ";";
883 
884     description += strStream.str();
885 
886     // Log to Redfish event
887     sendJournalRedfish(description, logLevel);
888 }
889 
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)890 void OemEventManager::handleNumericWatchdogEvent(
891     pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
892 {
893     std::string description;
894     log_level logLevel = log_level::CRITICAL;
895 
896     description += prefixMsgStrCreation(tid, sensorId);
897 
898     if (presentReading & 0x01)
899     {
900         description += "Global watchdog expired;";
901     }
902     if (presentReading & 0x02)
903     {
904         description += "Secure watchdog expired;";
905     }
906     if (presentReading & 0x04)
907     {
908         description += "Non-secure watchdog expired;";
909     }
910 
911     // Log to Redfish event
912     sendJournalRedfish(description, logLevel);
913 }
914 
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)915 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
916                                             const uint8_t* eventData,
917                                             size_t eventDataSize)
918 {
919     EFI_AMPERE_ERROR_DATA ampHdr;
920 
921     decodeCperRecord(eventData, eventDataSize, &ampHdr);
922 
923     addCperSELLog(tid, eventId, &ampHdr);
924 
925     /* isBert at bit 12 of TypeId */
926     if (ampHdr.TypeId & 0x0800)
927     {
928         lg2::info("Ampere SoC BERT is triggered.");
929         std::variant<std::string> value(
930             "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
931         try
932         {
933             auto& bus = pldm::utils::DBusHandler::getBus();
934             auto method =
935                 bus.new_method_call("com.ampere.CrashCapture.Trigger",
936                                     "/com/ampere/crashcapture/trigger",
937                                     pldm::utils::dbusProperties, "Set");
938             method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
939                           value);
940             bus.call_noreply(method);
941         }
942         catch (const std::exception& e)
943         {
944             lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
945         }
946     }
947 
948     return PLDM_SUCCESS;
949 }
950 
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)951 int OemEventManager::handlepldmMessagePollEvent(
952     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
953     pldm_tid_t tid, size_t eventDataOffset)
954 {
955     /* This OEM event handler is only used for SoC terminus*/
956     if (!tidToSocketNameMap.contains(tid))
957     {
958         return PLDM_SUCCESS;
959     }
960 
961     auto eventData =
962         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
963     auto eventDataSize = payloadLength - eventDataOffset;
964 
965     pldm_message_poll_event poll_event{};
966     auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
967                                                   &poll_event);
968     if (rc)
969     {
970         lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
971                    "RC", rc);
972         return rc;
973     }
974 
975     auto sensorID = poll_event.event_id;
976     /* The UE errors */
977     if (rasUESensorIDs.contains(sensorID))
978     {
979         pldm::utils::DBusMapping dbusMapping{
980             "/xyz/openbmc_project/led/groups/ras_ue_fault",
981             "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
982         try
983         {
984             pldm::utils::DBusHandler().setDbusProperty(
985                 dbusMapping, pldm::utils::PropertyValue{bool(true)});
986         }
987         catch (const std::exception& e)
988         {
989             lg2::error(
990                 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
991                 "TID", tid, "SENSORID", sensorID, "ERROR", e);
992         }
993     }
994 
995     return PLDM_SUCCESS;
996 }
997 
oemPollForPlatformEvent(pldm_tid_t tid)998 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
999 {
1000     uint64_t t0 = 0;
1001 
1002     /* This OEM event handler is only used for SoC terminus */
1003     if (!tidToSocketNameMap.contains(tid))
1004     {
1005         co_return PLDM_SUCCESS;
1006     }
1007 
1008     if (!timeStampMap.contains(tid))
1009     {
1010         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1011         timeStampMap.emplace(std::make_pair(tid, t0));
1012     }
1013     else
1014     {
1015         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1016         uint64_t elapsed = t0 - timeStampMap[tid];
1017         if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1018         {
1019             co_await manager->pollForPlatformEvent(tid, 0, 0);
1020             timeStampMap[tid] = t0;
1021         }
1022     }
1023 
1024     co_return PLDM_SUCCESS;
1025 }
1026 } // namespace oem_ampere
1027 } // namespace pldm
1028