xref: /openbmc/pldm/oem/ampere/event/oem_event_manager.cpp (revision 2e2b4823c06508cb38509988b8962543a35ab8aa)
1 #include "oem_event_manager.hpp"
2 
3 #include "libcper/Cper.h"
4 
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8 
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13 
14 #include <com/ampere/Event/ReportedSEL/event.hpp>
15 #include <phosphor-logging/commit.hpp>
16 #include <phosphor-logging/lg2.hpp>
17 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
18 
19 #include <algorithm>
20 #include <map>
21 #include <set>
22 #include <sstream>
23 #include <string>
24 #include <unordered_map>
25 
26 namespace pldm
27 {
28 namespace oem_ampere
29 {
30 namespace fs = std::filesystem;
31 using namespace std::chrono;
32 namespace ReportedErrorSEL = sdbusplus::error::com::ampere::event::ReportedSEL;
33 namespace ReportedEventSEL = sdbusplus::event::com::ampere::event::ReportedSEL;
34 
35 namespace boot_stage = boot::stage;
36 namespace ddr_status = ddr::status;
37 namespace dimm_status = dimm::status;
38 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
39 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
40 namespace training_failure = dimm::training_failure;
41 
/*
    Redfish message registry ID that sendJournalRedfish() attaches (as
    REDFISH_MESSAGE_ID) to journal entries logged with log_level::BIOSFWPANIC.
*/
constexpr const char* BIOSFWPanicRegistry =
    "OpenBMC.0.1.BIOSFirmwarePanicReason";
/*
    Number of low-order bits that dimmIdxsToString() scans in a DIMM bit
    mask; bit N set means DIMM index N is listed.
*/
constexpr auto maxDIMMIdxBitNum = 24;
/*
    Exclusive upper bound of valid DIMM indexes. sensorIdToDIMMIdx() returns
    this value when a sensor ID is not a DIMM status sensor.
*/
constexpr auto maxDIMMInstantNum = 24;
46 
/*
    Sensor IDs of the CORE, MCU, PCIE and SOC "UE" sensors.
    NOTE(review): "UE" presumably stands for RAS uncorrectable error - confirm
    against the sensor ID definitions.
*/
const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
48 
/*
    Status suffixes appended to a boot stage name by handleBootOverallEvent().
    The array index is byte 0 of the boot code (sensor reading); values
    outside the array are reported as " unknown status".
*/
std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"};
54 
/*
    Status suffixes used for the DDR training boot stage.
    The array index is byte 0 of the boot code (sensor reading). For index 1
    (" in-progress") handleBootOverallEvent() also appends the completion
    percentage taken from byte 1.
*/
std::array<std::string, 3> ddrTrainingMsg = {
    " progress started", " in-progress", " progress completed"};
61 
/*
    PMIC temperature alert strings.
    The array index is byte 0 of the DIMM status reading when the DIMM status
    (byte 3) is dimm_status::PMIC_TEMP_ALERT; see handleDIMMStatusEvent().
*/
std::array<std::string, 8> pmicTempAlertMsg = {
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
68 
/*
    A map between terminus IDs (TID) and socket names used in log messages.

    In Ampere systems, the BMC only communicates directly with the MCTP/PLDM
    SoC endpoints through SMBus and PCIe. When the host boots up, the SMBus
    interface comes up first. On this interface, the BMC is the bus owner.

    mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
    pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).

    handleSensorEvent() ignores events from any TID that is not in this map.
*/
EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
78 
/*
    A map between sensor IDs and their names as strings.
    Using pldm::oem::sensor_ids.
    prefixMsgStrCreation() falls back to the numeric sensor ID for any sensor
    that is not listed here.
*/
EventToMsgMap_t sensorIdToStrMap = {
    {DDR_STATUS, "DDR_STATUS"},
    {PCP_VR_STATE, "PCP_VR_STATE"},
    {SOC_VR_STATE, "SOC_VR_STATE"},
    {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
    {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
    {D2D_VR_STATE, "D2D_VR_STATE"},
    {IOC_VR1_STATE, "IOC_VR1_STATE"},
    {IOC_VR2_STATE, "IOC_VR2_STATE"},
    {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
    {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
    {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
    {BOOT_OVERALL, "BOOT_OVERALL"},
    {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
    {WATCH_DOG, "WATCH_DOG"}};
98 
/*
    A map between the boot stages and logging strings.
    Using pldm::oem::boot::stage::boot_stage.
    The key is compared against byte 3 of the BOOT_OVERALL sensor reading.
    The UEFI_STATUS_CLASS_CODE_MIN entry is not a stage name of its own: its
    string is the common prefix handleBootOverallEvent() uses for every UEFI
    status class code up to UEFI_STATUS_CLASS_CODE_MAX.
*/
EventToMsgMap_t bootStageToMsgMap = {
    {boot_stage::SECPRO, "SECpro"},
    {boot_stage::MPRO, "Mpro"},
    {boot_stage::ATF_BL1, "ATF BL1"},
    {boot_stage::ATF_BL2, "ATF BL2"},
    {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
    {boot_stage::DDR_TRAINING, "DDR training"},
    {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::ATF_BL31, "ATF BL31"},
    {boot_stage::ATF_BL32, "ATF BL32"},
    {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
     "ATF BL33 (UEFI) booting status = "}};
116 
/*
    A map between DDR status and logging strings.
    Using pldm::oem::ddr::status::ddr_status.
    The key is byte 3 of the DDR_STATUS sensor reading. The two strings that
    end with "at DIMMs:" are followed by the DIMM index list built by
    dimmIdxsToString() in handleDDRStatusEvent().
*/
EventToMsgMap_t ddrStatusToMsgMap = {
    {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
    {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
    {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
    {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
    {ddr_status::OTHER_FAILURE, "has other failure"},
    {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
     "has boot failure due to no configuration"},
    {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
     "failsafe activated but boot success with the next valid configuration"}};
131 
/*
    A map between DIMM status and logging strings.
    Using pldm::oem::dimm::status::dimm_status.
    The key is byte 3 of a DIMM status sensor reading. The PMIC_HIGH_TEMP,
    TSx_HIGH_TEMP and SPD_HUB_HIGH_TEMP statuses are intentionally absent:
    handleDIMMStatusEvent() builds their messages in a separate switch.
    Note that the TRAINING_FAILURE string already ends with a "; " separator.
*/
EventToMsgMap_t dimmStatusToMsgMap = {
    {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
    {dimm_status::NOT_INSTALLED, "is not installed"},
    {dimm_status::OTHER_FAILURE, "has other failure"},
    {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
    {dimm_status::TRAINING_FAILURE, "has training failure; "},
    {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
143 
/*
    A map between PHY training failure syndromes and logging strings.
    Using
    pldm::oem::dimm::training_failure::phy_syndrome::phy_training_failure_syndrome
*/
EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
    {phy_syndrome::NA, "(N/A)"},
    {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
    {phy_syndrome::CA_LEVELING, "(CA leveling)"},
    {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
     "(PHY write level failure - see syndrome 1)"},
    {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
     "(PHY read gate leveling failure)"},
    {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
    {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
    {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
160 
/*
    A map between DIMM training failure syndromes and logging strings.
    Using
    pldm::oem::dimm::training_failure::dimm_syndrome::dimm_training_failure_syndrome
*/
EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
    {dimm_syndrome::NA, "(N/A)"},
    {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
     "(DRAM VREFDQ training failure)"},
    {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
    {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
     "(LRDRIMM DB SW training failure)"}};
173 
/*
    A map between a DIMM training failure type and a pair of
    <logging string, syndrome map>.
    Using pldm::oem::dimm::training_failure::dimm_training_failure_type.
    std::make_pair() takes the syndrome maps by value, so each entry holds
    its own copy of the corresponding syndrome map defined above.
*/
std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
    dimmTrainingFailureTypeMap = {
        {training_failure::PHY_TRAINING_FAILURE_TYPE,
         std::make_pair("PHY training failure",
                        phyTrainingFailureSyndromeToMsgMap)},
        {training_failure::DIMM_TRAINING_FAILURE_TYPE,
         std::make_pair("DIMM training failure",
                        dimmTrainingFailureSyndromeToMsgMap)}};
187 
/*
    A map between a state sensor ID and the description of its components.
    The vector is indexed by the sensor offset of the state sensor event.
    Each component is a pair of <component name, state map>, where the state
    map translates an event state value into a pair of
    <log level, state string>. Used by processStateSensorEvent().
*/
std::unordered_map<
    uint16_t,
    std::vector<std::pair<
        std::string,
        std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
    stateSensorToMsgMap = {
        {SOC_HEALTH_AVAILABILITY,
         {{"SoC Health",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::WARNING, "Non-Critical"}},
            {3, {log_level::CRITICAL, "Critical"}},
            {4, {log_level::CRITICAL, "Fatal"}}}},
          {"SoC Availability",
           {{1, {log_level::OK, "Enabled"}},
            {2, {log_level::WARNING, "Disabled"}},
            {3, {log_level::CRITICAL, "Shutdown"}}}}}},
        {WATCH_DOG,
         {{"Global Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Non-secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
214 
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)215 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
216                                                   uint16_t sensorId)
217 {
218     std::string description;
219 
220     if (!sensorIdToStrMap.contains(sensorId))
221     {
222         description += "Sensor ID " + std::to_string(sensorId) + " of ";
223     }
224     else
225     {
226         description += "Sensor " + sensorIdToStrMap[sensorId] + " of ";
227     }
228 
229     if (!tidToSocketNameMap.contains(tid))
230     {
231         description += "TID " + std::to_string(tid);
232     }
233     else
234     {
235         description += tidToSocketNameMap[tid];
236     }
237 
238     return description;
239 }
240 
sendJournalRedfish(const std::string & source,const std::string & description,log_level & logLevel)241 void OemEventManager::sendJournalRedfish(const std::string& source,
242                                          const std::string& description,
243                                          log_level& logLevel)
244 {
245     if (description.empty())
246     {
247         return;
248     }
249 
250     switch (logLevel)
251     {
252         case log_level::OK:
253             lg2::commit(ReportedEventSEL::ReportedSELInfo(
254                 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
255             break;
256         case log_level::WARNING:
257             lg2::commit(ReportedErrorSEL::ReportedSELWarning(
258                 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
259             break;
260         case log_level::CRITICAL:
261             lg2::commit(ReportedErrorSEL::ReportedSELCritical(
262                 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
263             break;
264         case log_level::BIOSFWPANIC:
265             lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
266                       BIOSFWPanicRegistry, "REDFISH_MESSAGE_ARGS", description);
267             break;
268         default:
269         {
270             lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
271                        "DES", description);
272             return;
273         }
274     }
275 }
276 
dimmIdxsToString(uint32_t dimmIdxs)277 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
278 {
279     std::string description;
280     for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
281     {
282         if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
283         {
284             description += " #" + std::to_string(bitIdx);
285         }
286     }
287     return description;
288 }
289 
sensorIdToDIMMIdx(const uint16_t & sensorId)290 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
291 {
292     uint8_t dimmIdx = maxDIMMInstantNum;
293     int sensorId_Off = sensorId - 4;
294     if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
295         ((sensorId_Off / 2) < maxDIMMInstantNum))
296     {
297         dimmIdx = sensorId_Off / 2;
298     }
299     return dimmIdx;
300 }
301 
handleBootOverallEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)302 void OemEventManager::handleBootOverallEvent(pldm_tid_t tid, uint16_t sensorId,
303                                              uint32_t presentReading)
304 {
305     log_level logLevel{log_level::OK};
306     std::string description;
307     std::string source;
308     std::stringstream strStream;
309 
310     uint8_t byte0 = (presentReading & 0x000000ff);
311     uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
312     uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
313     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
314     /*
315      * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
316      * ATF BL32 and DDR initialization
317      */
318     if (bootStageToMsgMap.contains(byte3))
319     {
320         // Boot stage adding
321         description += bootStageToMsgMap[byte3];
322 
323         switch (byte3)
324         {
325             case boot_stage::DDR_TRAINING:
326                 if (byte0 >= ddrTrainingMsg.size())
327                 {
328                     logLevel = log_level::BIOSFWPANIC;
329                     description += " unknown status";
330                 }
331                 else
332                 {
333                     description += ddrTrainingMsg[byte0];
334                 }
335                 if (0x01 == byte0)
336                 {
337                     // Add complete percentage
338                     description += " at " + std::to_string(byte1) + "%";
339                 }
340                 break;
341             case boot_stage::S0_DDR_TRAINING_FAILURE:
342             case boot_stage::S1_DDR_TRAINING_FAILURE:
343                 // ddr_training_status_msg()
344                 logLevel = log_level::BIOSFWPANIC;
345                 description += " at DIMMs:";
346                 // dimmIdxs = presentReading & 0x00ffffff;
347                 description += dimmIdxsToString(presentReading & 0x00ffffff);
348                 description += " of socket ";
349                 description +=
350                     (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
351                 break;
352             default:
353                 if (byte0 >= bootStatMsg.size())
354                 {
355                     logLevel = log_level::BIOSFWPANIC;
356                     description += " unknown status";
357                 }
358                 else
359                 {
360                     description += bootStatMsg[byte0];
361                 }
362                 break;
363         }
364 
365         // Sensor report action is fail
366         if (boot::status::BOOT_STATUS_FAILURE == byte2)
367         {
368             logLevel = log_level::BIOSFWPANIC;
369         }
370     }
371     else
372     {
373         if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
374         {
375             description +=
376                 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
377 
378             strStream
379                 << "Segment (0x" << std::setfill('0') << std::hex
380                 << std::setw(8) << static_cast<uint32_t>(presentReading)
381                 << "); Status Class (0x" << std::setw(2)
382                 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
383                 << std::setw(2) << static_cast<uint32_t>(byte2)
384                 << "); Operation Code (0x" << std::setw(4)
385                 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
386                 << ")" << std::dec;
387 
388             description += strStream.str();
389         }
390     }
391 
392     source = prefixMsgStrCreation(tid, sensorId);
393     // Log to Redfish event
394     sendJournalRedfish(source, description, logLevel);
395 }
396 
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)397 int OemEventManager::processNumericSensorEvent(
398     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
399     size_t sensorDataLength)
400 {
401     uint8_t eventState = 0;
402     uint8_t previousEventState = 0;
403     uint8_t sensorDataSize = 0;
404     uint32_t presentReading;
405     auto rc = decode_numeric_sensor_data(
406         sensorData, sensorDataLength, &eventState, &previousEventState,
407         &sensorDataSize, &presentReading);
408     if (rc)
409     {
410         lg2::error(
411             "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
412             "TID", tid, "RC", rc);
413         return rc;
414     }
415 
416     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
417     if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
418     {
419         handleDIMMStatusEvent(tid, sensorId, presentReading);
420         return PLDM_SUCCESS;
421     }
422 
423     switch (sensorId)
424     {
425         case BOOT_OVERALL:
426             handleBootOverallEvent(tid, sensorId, presentReading);
427             break;
428         case PCIE_HOT_PLUG:
429             handlePCIeHotPlugEvent(tid, sensorId, presentReading);
430             break;
431         case DDR_STATUS:
432             handleDDRStatusEvent(tid, sensorId, presentReading);
433             break;
434         case PCP_VR_STATE:
435         case SOC_VR_STATE:
436         case DPHY_VR1_STATE:
437         case DPHY_VR2_STATE:
438         case D2D_VR_STATE:
439         case IOC_VR1_STATE:
440         case IOC_VR2_STATE:
441         case PCI_D_VR_STATE:
442         case PCI_A_VR_STATE:
443             handleVRDStatusEvent(tid, sensorId, presentReading);
444             break;
445         case WATCH_DOG:
446             handleNumericWatchdogEvent(tid, sensorId, presentReading);
447             break;
448         default:
449             std::string description;
450             std::stringstream strStream;
451 
452             description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
453             description += prefixMsgStrCreation(tid, sensorId);
454             strStream << std::setfill('0') << std::hex << "eventState 0x"
455                       << std::setw(2) << static_cast<uint32_t>(eventState)
456                       << " previousEventState 0x" << std::setw(2)
457                       << static_cast<uint32_t>(previousEventState)
458                       << " sensorDataSize 0x" << std::setw(2)
459                       << static_cast<uint32_t>(sensorDataSize)
460                       << " presentReading 0x" << std::setw(8)
461                       << static_cast<uint32_t>(presentReading) << std::dec;
462             description += strStream.str();
463             std::cout << description << "\n";
464     }
465     return PLDM_SUCCESS;
466 }
467 
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)468 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
469                                              const uint8_t* sensorData,
470                                              size_t sensorDataLength)
471 {
472     uint8_t sensorOffset = 0;
473     uint8_t eventState = 0;
474     uint8_t previousEventState = 0;
475 
476     auto rc =
477         decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
478                                  &eventState, &previousEventState);
479     if (rc)
480     {
481         lg2::error(
482             "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
483             "TID", tid, "RC", rc);
484         return rc;
485     }
486 
487     std::string description;
488     std::string source = prefixMsgStrCreation(tid, sensorId);
489 
490     if (stateSensorToMsgMap.contains(sensorId))
491     {
492         log_level logLevel = log_level::OK;
493 
494         auto componentMap = stateSensorToMsgMap[sensorId];
495         if (sensorOffset < componentMap.size())
496         {
497             description += std::get<0>(componentMap[sensorOffset]);
498             auto stateMap = std::get<1>(componentMap[sensorOffset]);
499             if (stateMap.contains(eventState))
500             {
501                 logLevel = std::get<0>(stateMap[eventState]);
502                 description += " state : " + std::get<1>(stateMap[eventState]);
503                 if (stateMap.contains(previousEventState))
504                 {
505                     description += "; previous state: " +
506                                    std::get<1>(stateMap[previousEventState]);
507                 }
508             }
509             else
510             {
511                 description += " sends unsupported event state: " +
512                                std::to_string(eventState);
513                 if (stateMap.contains(previousEventState))
514                 {
515                     description += "; previous state: " +
516                                    std::get<1>(stateMap[previousEventState]);
517                 }
518             }
519         }
520         else
521         {
522             description += "sends unsupported component sensor offset " +
523                            std::to_string(sensorOffset);
524         }
525 
526         sendJournalRedfish(source, description, logLevel);
527     }
528     else
529     {
530         std::stringstream strStream;
531         description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
532         description += prefixMsgStrCreation(tid, sensorId);
533         strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
534                   << std::setw(2) << static_cast<uint32_t>(sensorOffset)
535                   << "eventState 0x" << std::setw(2)
536                   << static_cast<uint32_t>(eventState)
537                   << " previousEventState 0x" << std::setw(2)
538                   << static_cast<uint32_t>(previousEventState) << std::dec;
539         description += strStream.str();
540         std::cout << description << "\n";
541     }
542 
543     return PLDM_SUCCESS;
544 }
545 
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)546 int OemEventManager::processSensorOpStateEvent(
547     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
548     size_t sensorDataLength)
549 {
550     uint8_t present_op_state = 0;
551     uint8_t previous_op_state = 0;
552 
553     auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
554                                     &present_op_state, &previous_op_state);
555     if (rc)
556     {
557         lg2::error(
558             "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
559             "TID", tid, "RC", rc);
560         return rc;
561     }
562 
563     std::string description;
564     std::stringstream strStream;
565 
566     description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
567     description += prefixMsgStrCreation(tid, sensorId);
568     strStream << std::setfill('0') << std::hex << "present_op_state 0x"
569               << std::setw(2) << static_cast<uint32_t>(present_op_state)
570               << "previous_op_state 0x" << std::setw(2)
571               << static_cast<uint32_t>(previous_op_state) << std::dec;
572     description += strStream.str();
573     std::cout << description << "\n";
574 
575     return PLDM_SUCCESS;
576 }
577 
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)578 int OemEventManager::handleSensorEvent(
579     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
580     pldm_tid_t tid, size_t eventDataOffset)
581 {
582     /* This OEM event handler is only used for SoC terminus*/
583     if (!tidToSocketNameMap.contains(tid))
584     {
585         return PLDM_SUCCESS;
586     }
587     auto eventData =
588         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
589     auto eventDataSize = payloadLength - eventDataOffset;
590 
591     uint16_t sensorId = 0;
592     uint8_t sensorEventClassType = 0;
593     size_t eventClassDataOffset = 0;
594     auto rc =
595         decode_sensor_event_data(eventData, eventDataSize, &sensorId,
596                                  &sensorEventClassType, &eventClassDataOffset);
597     if (rc)
598     {
599         lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
600                    rc);
601         return rc;
602     }
603     const uint8_t* sensorData = eventData + eventClassDataOffset;
604     size_t sensorDataLength = eventDataSize - eventClassDataOffset;
605 
606     switch (sensorEventClassType)
607     {
608         case PLDM_NUMERIC_SENSOR_STATE:
609         {
610             return processNumericSensorEvent(tid, sensorId, sensorData,
611                                              sensorDataLength);
612         }
613         case PLDM_STATE_SENSOR_STATE:
614         {
615             return processStateSensorEvent(tid, sensorId, sensorData,
616                                            sensorDataLength);
617         }
618         case PLDM_SENSOR_OP_STATE:
619         {
620             return processSensorOpStateEvent(tid, sensorId, sensorData,
621                                              sensorDataLength);
622         }
623         default:
624             std::string description;
625             std::stringstream strStream;
626 
627             description += "SENSOR_EVENT : Unsupported Sensor Class " +
628                            std::to_string(sensorEventClassType) + ": ";
629             description += prefixMsgStrCreation(tid, sensorId);
630             strStream << std::setfill('0') << std::hex
631                       << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
632 
633             auto dataPtr = sensorData;
634             for ([[maybe_unused]] const auto& i :
635                  std::views::iota(0, (int)sensorDataLength))
636             {
637                 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
638                 dataPtr += sizeof(sensorData);
639             }
640 
641             description += strStream.str();
642             std::cout << description << "\n";
643     }
644 
645     return PLDM_ERROR;
646 }
647 
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)648 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
649                                              uint32_t presentReading)
650 {
651     std::string description;
652     std::string source;
653     std::stringstream strStream;
654     PCIeHotPlugEventRecord_t record{presentReading};
655 
656     std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
657     std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
658     log_level logLevel =
659         (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
660 
661     source = prefixMsgStrCreation(tid, sensorId);
662 
663     strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
664               << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
665               << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
666               << "); Device (0x" << std::setw(2)
667               << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
668               << std::setw(2) << static_cast<uint32_t>(record.bits.function)
669               << "); Action (" << sAction << "); Operation status ("
670               << sOpStatus << "); Media slot number (" << std::dec
671               << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
672 
673     description += strStream.str();
674 
675     // Log to Redfish event
676     sendJournalRedfish(source, description, logLevel);
677 }
678 
dimmTrainingFailureToMsg(uint32_t failureInfo)679 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
680 {
681     std::string description;
682     DIMMTrainingFailure_t failure{failureInfo};
683 
684     if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
685     {
686         auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
687 
688         description += std::get<0>(failureInfoMap);
689 
690         description += "; MCU rank index " +
691                        std::to_string(failure.bits.mcuRankIdx);
692 
693         description += "; Slice number " +
694                        std::to_string(failure.bits.sliceNum);
695 
696         description += "; Upper nibble error status: ";
697         description += (!failure.bits.upperNibbStatErr)
698                            ? "No error"
699                            : "Found no rising edge";
700 
701         description += "; Lower nibble error status: ";
702         description += (!failure.bits.lowerNibbStatErr)
703                            ? "No error"
704                            : "Found no rising edge";
705 
706         description += "; Failure syndrome 0: ";
707 
708         auto& syndromeMap = std::get<1>(failureInfoMap);
709         if (syndromeMap.contains(failure.bits.syndrome))
710         {
711             description += syndromeMap[failure.bits.syndrome];
712         }
713         else
714         {
715             description += "(Unknown syndrome)";
716         }
717     }
718     else
719     {
720         description += "Unknown training failure type " +
721                        std::to_string(failure.bits.type);
722     }
723 
724     return description;
725 }
726 
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)727 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
728                                             uint32_t presentReading)
729 {
730     log_level logLevel{log_level::WARNING};
731     std::string description;
732     std::string source;
733     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
734     uint32_t byte012 = presentReading & 0xffffff;
735 
736     source = prefixMsgStrCreation(tid, sensorId);
737 
738     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
739     auto dimmIdx = sensorIdToDIMMIdx(sensorId);
740     if (dimmIdx >= maxDIMMIdxBitNum)
741     {
742         return;
743     }
744 
745     description += "DIMM " + std::to_string(dimmIdx) + " ";
746 
747     if (dimmStatusToMsgMap.contains(byte3))
748     {
749         if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
750             byte3 == dimm_status::INSTALLED_BUT_DISABLED)
751         {
752             logLevel = log_level::OK;
753         }
754 
755         description += dimmStatusToMsgMap[byte3];
756 
757         if (byte3 == dimm_status::TRAINING_FAILURE)
758         {
759             description += "; " + dimmTrainingFailureToMsg(byte012);
760         }
761         else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
762         {
763             uint8_t byte0 = (byte012 & 0xff);
764             if (byte0 < pmicTempAlertMsg.size())
765             {
766                 description += ": " + pmicTempAlertMsg[byte0];
767             }
768         }
769     }
770     else
771     {
772         switch (byte3)
773         {
774             case dimm_status::PMIC_HIGH_TEMP:
775                 if (byte012 == 0x01)
776                 {
777                     description += "has PMIC high temp condition";
778                 }
779                 break;
780             case dimm_status::TSx_HIGH_TEMP:
781                 switch (byte012)
782                 {
783                     case 0x01:
784                         description += "has TS0";
785                         break;
786                     case 0x02:
787                         description += "has TS1";
788                         break;
789                     case 0x03:
790                         description += "has TS0 and TS1";
791                         break;
792                 }
793                 description += " exceeding their high temperature threshold";
794                 break;
795             case dimm_status::SPD_HUB_HIGH_TEMP:
796                 if (byte012 == 0x01)
797                 {
798                     description += "has SPD/HUB high temp condition";
799                 }
800                 break;
801             default:
802                 description += "has unsupported status " +
803                                std::to_string(byte3);
804                 break;
805         }
806     }
807 
808     // Log to Redfish event
809     sendJournalRedfish(source, description, logLevel);
810 }
811 
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)812 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
813                                            uint32_t presentReading)
814 {
815     log_level logLevel{log_level::WARNING};
816     std::string description;
817     std::string source;
818     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
819     uint32_t byte012 = presentReading & 0xffffff;
820 
821     source = prefixMsgStrCreation(tid, sensorId);
822 
823     description += "DDR ";
824     if (ddrStatusToMsgMap.contains(byte3))
825     {
826         if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
827         {
828             logLevel = log_level::OK;
829         }
830 
831         description += ddrStatusToMsgMap[byte3];
832 
833         if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
834             byte3 == ddr_status::TRAINING_FAILURE)
835         {
836             // List out failed DIMMs
837             description += dimmIdxsToString(byte012);
838         }
839     }
840     else
841     {
842         description += "has unsupported status " + std::to_string(byte3);
843     }
844 
845     // Log to Redfish event
846     sendJournalRedfish(source, description, logLevel);
847 }
848 
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)849 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
850                                            uint32_t presentReading)
851 {
852     log_level logLevel{log_level::WARNING};
853     std::string description;
854     std::string source;
855     std::stringstream strStream;
856 
857     source = prefixMsgStrCreation(tid, sensorId);
858 
859     VRDStatus_t status{presentReading};
860 
861     if (status.bits.warning && status.bits.critical)
862     {
863         description += "A VR warning and a VR critical";
864         logLevel = log_level::CRITICAL;
865     }
866     else
867     {
868         if (status.bits.warning)
869         {
870             description += "A VR warning";
871         }
872         else if (status.bits.critical)
873         {
874             description += "A VR critical";
875             logLevel = log_level::CRITICAL;
876         }
877         else
878         {
879             description += "No VR warning or critical";
880             logLevel = log_level::OK;
881         }
882     }
883     description += " condition observed";
884 
885     strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
886               << std::setw(2)
887               << static_cast<uint32_t>(status.bits.vr_status_byte_high)
888               << "; VR status byte low is 0x" << std::setw(2)
889               << static_cast<uint32_t>(status.bits.vr_status_byte_low)
890               << "; Reading is 0x" << std::setw(2)
891               << static_cast<uint32_t>(presentReading) << ";";
892 
893     description += strStream.str();
894 
895     // Log to Redfish event
896     sendJournalRedfish(source, description, logLevel);
897 }
898 
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)899 void OemEventManager::handleNumericWatchdogEvent(
900     pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
901 {
902     std::string description;
903     std::string source;
904     log_level logLevel = log_level::CRITICAL;
905 
906     source = prefixMsgStrCreation(tid, sensorId);
907 
908     if (presentReading & 0x01)
909     {
910         description += "Global watchdog expired;";
911     }
912     if (presentReading & 0x02)
913     {
914         description += "Secure watchdog expired;";
915     }
916     if (presentReading & 0x04)
917     {
918         description += "Non-secure watchdog expired;";
919     }
920 
921     // Log to Redfish event
922     sendJournalRedfish(source, description, logLevel);
923 }
924 
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)925 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
926                                             const uint8_t* eventData,
927                                             size_t eventDataSize)
928 {
929     EFI_AMPERE_ERROR_DATA ampHdr;
930 
931     decodeCperRecord(eventData, eventDataSize, &ampHdr);
932 
933     addCperSELLog(tid, eventId, &ampHdr);
934 
935     /* isBert at bit 12 of TypeId */
936     if (ampHdr.TypeId & 0x0800)
937     {
938         lg2::info("Ampere SoC BERT is triggered.");
939         std::variant<std::string> value(
940             "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
941         try
942         {
943             auto& bus = pldm::utils::DBusHandler::getBus();
944             auto method =
945                 bus.new_method_call("com.ampere.CrashCapture.Trigger",
946                                     "/com/ampere/crashcapture/trigger",
947                                     pldm::utils::dbusProperties, "Set");
948             method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
949                           value);
950             bus.call_noreply(method);
951         }
952         catch (const std::exception& e)
953         {
954             lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
955         }
956     }
957 
958     return PLDM_SUCCESS;
959 }
960 
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)961 int OemEventManager::handlepldmMessagePollEvent(
962     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
963     pldm_tid_t tid, size_t eventDataOffset)
964 {
965     /* This OEM event handler is only used for SoC terminus*/
966     if (!tidToSocketNameMap.contains(tid))
967     {
968         return PLDM_SUCCESS;
969     }
970 
971     auto eventData =
972         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
973     auto eventDataSize = payloadLength - eventDataOffset;
974 
975     pldm_message_poll_event poll_event{};
976     auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
977                                                   &poll_event);
978     if (rc)
979     {
980         lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
981                    "RC", rc);
982         return rc;
983     }
984 
985     auto sensorID = poll_event.event_id;
986     /* The UE errors */
987     if (rasUESensorIDs.contains(sensorID))
988     {
989         pldm::utils::DBusMapping dbusMapping{
990             "/xyz/openbmc_project/led/groups/ras_ue_fault",
991             "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
992         try
993         {
994             pldm::utils::DBusHandler().setDbusProperty(
995                 dbusMapping, pldm::utils::PropertyValue{bool(true)});
996         }
997         catch (const std::exception& e)
998         {
999             lg2::error(
1000                 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
1001                 "TID", tid, "SENSORID", sensorID, "ERROR", e);
1002         }
1003     }
1004 
1005     return PLDM_SUCCESS;
1006 }
1007 
oemPollForPlatformEvent(pldm_tid_t tid)1008 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
1009 {
1010     uint64_t t0 = 0;
1011 
1012     /* This OEM event handler is only used for SoC terminus */
1013     if (!tidToSocketNameMap.contains(tid))
1014     {
1015         co_return PLDM_SUCCESS;
1016     }
1017 
1018     if (!timeStampMap.contains(tid))
1019     {
1020         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1021         timeStampMap.emplace(std::make_pair(tid, t0));
1022     }
1023     else
1024     {
1025         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1026         uint64_t elapsed = t0 - timeStampMap[tid];
1027         if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1028         {
1029             co_await manager->pollForPlatformEvent(tid, 0, 0);
1030             timeStampMap[tid] = t0;
1031         }
1032     }
1033 
1034     co_return PLDM_SUCCESS;
1035 }
1036 } // namespace oem_ampere
1037 } // namespace pldm
1038