xref: /openbmc/pldm/oem/ampere/event/oem_event_manager.cpp (revision 9c1455c067c2f11b85d52ce217a795db6857507e)
1 #include "oem_event_manager.hpp"
2 
3 #include "libcper/Cper.h"
4 
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8 
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <systemd/sd-journal.h>
12 
13 #include <com/ampere/Event/ReportedSEL/event.hpp>
14 #include <phosphor-logging/commit.hpp>
15 #include <phosphor-logging/lg2.hpp>
16 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
17 
18 #include <algorithm>
19 #include <map>
20 #include <set>
21 #include <sstream>
22 #include <string>
23 #include <unordered_map>
24 
25 namespace pldm
26 {
27 namespace oem_ampere
28 {
29 namespace fs = std::filesystem;
30 using namespace std::chrono;
31 namespace ReportedErrorSEL = sdbusplus::error::com::ampere::event::ReportedSEL;
32 namespace ReportedEventSEL = sdbusplus::event::com::ampere::event::ReportedSEL;
33 
34 namespace boot_stage = boot::stage;
35 namespace ddr_status = ddr::status;
36 namespace dimm_status = dimm::status;
37 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
38 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
39 namespace training_failure = dimm::training_failure;
40 
/* Redfish message registry ID attached to BIOS firmware panic log entries. */
constexpr const char* BIOSFWPanicRegistry{
    "OpenBMC.0.1.BIOSFirmwarePanicReason"};
/* Number of bits scanned in a DIMM index bitmask (bit N stands for DIMM #N). */
constexpr int maxDIMMIdxBitNum{24};
/* Number of DIMM instances; also returned by sensorIdToDIMMIdx() as "none". */
constexpr int maxDIMMInstantNum{24};
45 
46 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47 
/*
    Status text of a generic boot stage.
    Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> bootStatMsg{" booting", " completed", " failed"};

/*
    Status text of the DDR training boot stage.
    Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started", " in-progress", " progress completed"};

/*
    Text of a PMIC temperature alert.
    Indexed by byte 0 of a DIMM status reading that reports a PMIC
    temperature alert.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
67 
68 /*
69     In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
70     EPs through SMBus and PCIe. When host boots up, SMBUS interface
71     comes up first. In this interface, BMC is bus owner.
72 
73     mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
74     pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
75 */
76 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77 
78 /*
79     A map between sensor IDs and their names in string.
80     Using pldm::oem::sensor_ids
81 */
82 EventToMsgMap_t sensorIdToStrMap = {
83     {DDR_STATUS, "DDR_STATUS"},
84     {PCP_VR_STATE, "PCP_VR_STATE"},
85     {SOC_VR_STATE, "SOC_VR_STATE"},
86     {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
87     {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
88     {D2D_VR_STATE, "D2D_VR_STATE"},
89     {IOC_VR1_STATE, "IOC_VR1_STATE"},
90     {IOC_VR2_STATE, "IOC_VR2_STATE"},
91     {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
92     {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
93     {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
94     {BOOT_OVERALL, "BOOT_OVERALL"},
95     {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
96     {WATCH_DOG, "WATCH_DOG"}};
97 
98 /*
99     A map between the boot stages and logging strings.
100     Using pldm::oem::boot::stage::boot_stage
101 */
102 EventToMsgMap_t bootStageToMsgMap = {
103     {boot_stage::SECPRO, "SECpro"},
104     {boot_stage::MPRO, "Mpro"},
105     {boot_stage::ATF_BL1, "ATF BL1"},
106     {boot_stage::ATF_BL2, "ATF BL2"},
107     {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
108     {boot_stage::DDR_TRAINING, "DDR training"},
109     {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
110     {boot_stage::ATF_BL31, "ATF BL31"},
111     {boot_stage::ATF_BL32, "ATF BL32"},
112     {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
113     {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
114      "ATF BL33 (UEFI) booting status = "}};
115 
116 /*
117     A map between DDR status and logging strings.
118     Using pldm::oem::ddr::status::ddr_status
119 */
120 EventToMsgMap_t ddrStatusToMsgMap = {
121     {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
122     {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
123     {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
124     {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
125     {ddr_status::OTHER_FAILURE, "has other failure"},
126     {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
127      "has boot failure due to no configuration"},
128     {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
129      "failsafe activated but boot success with the next valid configuration"}};
130 
131 /*
132     A map between DIMM status and logging strings.
133     Using pldm::oem::dimm::status::dimm_status
134 */
135 EventToMsgMap_t dimmStatusToMsgMap = {
136     {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
137     {dimm_status::NOT_INSTALLED, "is not installed"},
138     {dimm_status::OTHER_FAILURE, "has other failure"},
139     {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
140     {dimm_status::TRAINING_FAILURE, "has training failure; "},
141     {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142 
143 /*
144     A map between PHY training failure syndrome and logging strings.
145     Using
146    pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
147 */
148 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
149     {phy_syndrome::NA, "(N/A)"},
150     {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
151     {phy_syndrome::CA_LEVELING, "(CA leveling)"},
152     {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
153      "(PHY write level failure - see syndrome 1)"},
154     {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
155      "(PHY read gate leveling failure)"},
156     {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
157     {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
158     {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159 
160 /*
161     A map between DIMM training failure syndrome and logging strings.
162     Using
163    pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
164 */
165 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
166     {dimm_syndrome::NA, "(N/A)"},
167     {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
168      "(DRAM VREFDQ training failure)"},
169     {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
170     {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
171      "(LRDRIMM DB SW training failure)"}};
172 
173 /*
174     A map between DIMM training failure type and a pair of <logging strings -
175    syndrome map>. Using
176    pldm::oem::dimm::training_faillure::dimm_training_failure_type
177 */
178 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
179     dimmTrainingFailureTypeMap = {
180         {training_failure::PHY_TRAINING_FAILURE_TYPE,
181          std::make_pair("PHY training failure",
182                         phyTrainingFailureSyndromeToMsgMap)},
183         {training_failure::DIMM_TRAINING_FAILURE_TYPE,
184          std::make_pair("DIMM training failure",
185                         dimmTrainingFailureSyndromeToMsgMap)}};
186 
187 std::unordered_map<
188     uint16_t,
189     std::vector<std::pair<
190         std::string,
191         std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
192     stateSensorToMsgMap = {
193         {SOC_HEALTH_AVAILABILITY,
194          {{"SoC Health",
195            {{1, {log_level::OK, "Normal"}},
196             {2, {log_level::WARNING, "Non-Critical"}},
197             {3, {log_level::CRITICAL, "Critical"}},
198             {4, {log_level::CRITICAL, "Fatal"}}}},
199           {"SoC Availability",
200            {{1, {log_level::OK, "Enabled"}},
201             {2, {log_level::WARNING, "Disabled"}},
202             {3, {log_level::CRITICAL, "Shutdown"}}}}}},
203         {WATCH_DOG,
204          {{"Global Watch Dog",
205            {{1, {log_level::OK, "Normal"}},
206             {2, {log_level::CRITICAL, "Timer Expired"}}}},
207           {"Secure Watch Dog",
208            {{1, {log_level::OK, "Normal"}},
209             {2, {log_level::CRITICAL, "Timer Expired"}}}},
210           {"Non-secure Watch Dog",
211            {{1, {log_level::OK, "Normal"}},
212             {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
213 
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)214 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
215                                                   uint16_t sensorId)
216 {
217     std::string description;
218 
219     if (!sensorIdToStrMap.contains(sensorId))
220     {
221         description += "Sensor ID " + std::to_string(sensorId) + " of ";
222     }
223     else
224     {
225         description += "Sensor " + sensorIdToStrMap[sensorId] + " of ";
226     }
227 
228     if (!tidToSocketNameMap.contains(tid))
229     {
230         description += "TID " + std::to_string(tid);
231     }
232     else
233     {
234         description += tidToSocketNameMap[tid];
235     }
236 
237     return description;
238 }
239 
sendJournalRedfish(const std::string & source,const std::string & description,log_level & logLevel)240 void OemEventManager::sendJournalRedfish(const std::string& source,
241                                          const std::string& description,
242                                          log_level& logLevel)
243 {
244     if (description.empty())
245     {
246         return;
247     }
248 
249     switch (logLevel)
250     {
251         case log_level::OK:
252             lg2::commit(ReportedEventSEL::ReportedSELInfo(
253                 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
254             break;
255         case log_level::WARNING:
256             lg2::commit(ReportedErrorSEL::ReportedSELWarning(
257                 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
258             break;
259         case log_level::CRITICAL:
260             lg2::commit(ReportedErrorSEL::ReportedSELCritical(
261                 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
262             break;
263         case log_level::BIOSFWPANIC:
264             lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265                       BIOSFWPanicRegistry, "REDFISH_MESSAGE_ARGS", description);
266             break;
267         default:
268         {
269             lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
270                        "DES", description);
271             return;
272         }
273     }
274 }
275 
dimmIdxsToString(uint32_t dimmIdxs)276 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
277 {
278     std::string description;
279     for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
280     {
281         if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
282         {
283             description += " #" + std::to_string(bitIdx);
284         }
285     }
286     return description;
287 }
288 
sensorIdToDIMMIdx(const uint16_t & sensorId)289 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
290 {
291     uint8_t dimmIdx = maxDIMMInstantNum;
292     int sensorId_Off = sensorId - 4;
293     if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
294         ((sensorId_Off / 2) < maxDIMMInstantNum))
295     {
296         dimmIdx = sensorId_Off / 2;
297     }
298     return dimmIdx;
299 }
300 
handleBootOverallEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)301 void OemEventManager::handleBootOverallEvent(pldm_tid_t tid, uint16_t sensorId,
302                                              uint32_t presentReading)
303 {
304     log_level logLevel{log_level::OK};
305     std::string description;
306     std::string source;
307     std::stringstream strStream;
308 
309     uint8_t byte0 = (presentReading & 0x000000ff);
310     uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
311     uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
312     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
313     /*
314      * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
315      * ATF BL32 and DDR initialization
316      */
317     if (bootStageToMsgMap.contains(byte3))
318     {
319         // Boot stage adding
320         description += bootStageToMsgMap[byte3];
321 
322         switch (byte3)
323         {
324             case boot_stage::DDR_TRAINING:
325                 if (byte0 >= ddrTrainingMsg.size())
326                 {
327                     logLevel = log_level::BIOSFWPANIC;
328                     description += " unknown status";
329                 }
330                 else
331                 {
332                     description += ddrTrainingMsg[byte0];
333                 }
334                 if (0x01 == byte0)
335                 {
336                     // Add complete percentage
337                     description += " at " + std::to_string(byte1) + "%";
338                 }
339                 break;
340             case boot_stage::S0_DDR_TRAINING_FAILURE:
341             case boot_stage::S1_DDR_TRAINING_FAILURE:
342                 // ddr_training_status_msg()
343                 logLevel = log_level::BIOSFWPANIC;
344                 description += " at DIMMs:";
345                 // dimmIdxs = presentReading & 0x00ffffff;
346                 description += dimmIdxsToString(presentReading & 0x00ffffff);
347                 description += " of socket ";
348                 description +=
349                     (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
350                 break;
351             default:
352                 if (byte0 >= bootStatMsg.size())
353                 {
354                     logLevel = log_level::BIOSFWPANIC;
355                     description += " unknown status";
356                 }
357                 else
358                 {
359                     description += bootStatMsg[byte0];
360                 }
361                 break;
362         }
363 
364         // Sensor report action is fail
365         if (boot::status::BOOT_STATUS_FAILURE == byte2)
366         {
367             logLevel = log_level::BIOSFWPANIC;
368         }
369     }
370     else
371     {
372         if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
373         {
374             description +=
375                 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
376 
377             strStream
378                 << "Segment (0x" << std::setfill('0') << std::hex
379                 << std::setw(8) << static_cast<uint32_t>(presentReading)
380                 << "); Status Class (0x" << std::setw(2)
381                 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
382                 << std::setw(2) << static_cast<uint32_t>(byte2)
383                 << "); Operation Code (0x" << std::setw(4)
384                 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
385                 << ")" << std::dec;
386 
387             description += strStream.str();
388         }
389     }
390 
391     source = prefixMsgStrCreation(tid, sensorId);
392     // Log to Redfish event
393     sendJournalRedfish(source, description, logLevel);
394 }
395 
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)396 int OemEventManager::processNumericSensorEvent(
397     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
398     size_t sensorDataLength)
399 {
400     uint8_t eventState = 0;
401     uint8_t previousEventState = 0;
402     uint8_t sensorDataSize = 0;
403     uint32_t presentReading;
404     auto rc = decode_numeric_sensor_data(
405         sensorData, sensorDataLength, &eventState, &previousEventState,
406         &sensorDataSize, &presentReading);
407     if (rc)
408     {
409         lg2::error(
410             "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
411             "TID", tid, "RC", rc);
412         return rc;
413     }
414 
415     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
416     if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
417     {
418         handleDIMMStatusEvent(tid, sensorId, presentReading);
419         return PLDM_SUCCESS;
420     }
421 
422     switch (sensorId)
423     {
424         case BOOT_OVERALL:
425             handleBootOverallEvent(tid, sensorId, presentReading);
426             break;
427         case PCIE_HOT_PLUG:
428             handlePCIeHotPlugEvent(tid, sensorId, presentReading);
429             break;
430         case DDR_STATUS:
431             handleDDRStatusEvent(tid, sensorId, presentReading);
432             break;
433         case PCP_VR_STATE:
434         case SOC_VR_STATE:
435         case DPHY_VR1_STATE:
436         case DPHY_VR2_STATE:
437         case D2D_VR_STATE:
438         case IOC_VR1_STATE:
439         case IOC_VR2_STATE:
440         case PCI_D_VR_STATE:
441         case PCI_A_VR_STATE:
442             handleVRDStatusEvent(tid, sensorId, presentReading);
443             break;
444         case WATCH_DOG:
445             handleNumericWatchdogEvent(tid, sensorId, presentReading);
446             break;
447         default:
448             std::string description;
449             std::stringstream strStream;
450 
451             description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
452             description += prefixMsgStrCreation(tid, sensorId);
453             strStream << std::setfill('0') << std::hex << "eventState 0x"
454                       << std::setw(2) << static_cast<uint32_t>(eventState)
455                       << " previousEventState 0x" << std::setw(2)
456                       << static_cast<uint32_t>(previousEventState)
457                       << " sensorDataSize 0x" << std::setw(2)
458                       << static_cast<uint32_t>(sensorDataSize)
459                       << " presentReading 0x" << std::setw(8)
460                       << static_cast<uint32_t>(presentReading) << std::dec;
461             description += strStream.str();
462             std::cout << description << "\n";
463     }
464     return PLDM_SUCCESS;
465 }
466 
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)467 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
468                                              const uint8_t* sensorData,
469                                              size_t sensorDataLength)
470 {
471     uint8_t sensorOffset = 0;
472     uint8_t eventState = 0;
473     uint8_t previousEventState = 0;
474 
475     auto rc =
476         decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
477                                  &eventState, &previousEventState);
478     if (rc)
479     {
480         lg2::error(
481             "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
482             "TID", tid, "RC", rc);
483         return rc;
484     }
485 
486     std::string description;
487     std::string source = prefixMsgStrCreation(tid, sensorId);
488 
489     if (stateSensorToMsgMap.contains(sensorId))
490     {
491         log_level logLevel = log_level::OK;
492 
493         auto componentMap = stateSensorToMsgMap[sensorId];
494         if (sensorOffset < componentMap.size())
495         {
496             description += std::get<0>(componentMap[sensorOffset]);
497             auto stateMap = std::get<1>(componentMap[sensorOffset]);
498             if (stateMap.contains(eventState))
499             {
500                 logLevel = std::get<0>(stateMap[eventState]);
501                 description += " state : " + std::get<1>(stateMap[eventState]);
502                 if (stateMap.contains(previousEventState))
503                 {
504                     description += "; previous state: " +
505                                    std::get<1>(stateMap[previousEventState]);
506                 }
507             }
508             else
509             {
510                 description += " sends unsupported event state: " +
511                                std::to_string(eventState);
512                 if (stateMap.contains(previousEventState))
513                 {
514                     description += "; previous state: " +
515                                    std::get<1>(stateMap[previousEventState]);
516                 }
517             }
518         }
519         else
520         {
521             description += "sends unsupported component sensor offset " +
522                            std::to_string(sensorOffset);
523         }
524 
525         sendJournalRedfish(source, description, logLevel);
526     }
527     else
528     {
529         std::stringstream strStream;
530         description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
531         description += prefixMsgStrCreation(tid, sensorId);
532         strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
533                   << std::setw(2) << static_cast<uint32_t>(sensorOffset)
534                   << "eventState 0x" << std::setw(2)
535                   << static_cast<uint32_t>(eventState)
536                   << " previousEventState 0x" << std::setw(2)
537                   << static_cast<uint32_t>(previousEventState) << std::dec;
538         description += strStream.str();
539         std::cout << description << "\n";
540     }
541 
542     return PLDM_SUCCESS;
543 }
544 
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)545 int OemEventManager::processSensorOpStateEvent(
546     pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
547     size_t sensorDataLength)
548 {
549     uint8_t present_op_state = 0;
550     uint8_t previous_op_state = 0;
551 
552     auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
553                                     &present_op_state, &previous_op_state);
554     if (rc)
555     {
556         lg2::error(
557             "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
558             "TID", tid, "RC", rc);
559         return rc;
560     }
561 
562     std::string description;
563     std::stringstream strStream;
564 
565     description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
566     description += prefixMsgStrCreation(tid, sensorId);
567     strStream << std::setfill('0') << std::hex << "present_op_state 0x"
568               << std::setw(2) << static_cast<uint32_t>(present_op_state)
569               << "previous_op_state 0x" << std::setw(2)
570               << static_cast<uint32_t>(previous_op_state) << std::dec;
571     description += strStream.str();
572     std::cout << description << "\n";
573 
574     return PLDM_SUCCESS;
575 }
576 
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)577 int OemEventManager::handleSensorEvent(
578     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
579     pldm_tid_t tid, size_t eventDataOffset)
580 {
581     /* This OEM event handler is only used for SoC terminus*/
582     if (!tidToSocketNameMap.contains(tid))
583     {
584         return PLDM_SUCCESS;
585     }
586     auto eventData =
587         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
588     auto eventDataSize = payloadLength - eventDataOffset;
589 
590     uint16_t sensorId = 0;
591     uint8_t sensorEventClassType = 0;
592     size_t eventClassDataOffset = 0;
593     auto rc =
594         decode_sensor_event_data(eventData, eventDataSize, &sensorId,
595                                  &sensorEventClassType, &eventClassDataOffset);
596     if (rc)
597     {
598         lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
599                    rc);
600         return rc;
601     }
602     const uint8_t* sensorData = eventData + eventClassDataOffset;
603     size_t sensorDataLength = eventDataSize - eventClassDataOffset;
604 
605     switch (sensorEventClassType)
606     {
607         case PLDM_NUMERIC_SENSOR_STATE:
608         {
609             return processNumericSensorEvent(tid, sensorId, sensorData,
610                                              sensorDataLength);
611         }
612         case PLDM_STATE_SENSOR_STATE:
613         {
614             return processStateSensorEvent(tid, sensorId, sensorData,
615                                            sensorDataLength);
616         }
617         case PLDM_SENSOR_OP_STATE:
618         {
619             return processSensorOpStateEvent(tid, sensorId, sensorData,
620                                              sensorDataLength);
621         }
622         default:
623             std::string description;
624             std::stringstream strStream;
625 
626             description += "SENSOR_EVENT : Unsupported Sensor Class " +
627                            std::to_string(sensorEventClassType) + ": ";
628             description += prefixMsgStrCreation(tid, sensorId);
629             strStream << std::setfill('0') << std::hex
630                       << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
631 
632             auto dataPtr = sensorData;
633             for ([[maybe_unused]] const auto& i :
634                  std::views::iota(0, (int)sensorDataLength))
635             {
636                 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
637                 dataPtr += sizeof(sensorData);
638             }
639 
640             description += strStream.str();
641             std::cout << description << "\n";
642     }
643 
644     return PLDM_ERROR;
645 }
646 
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)647 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
648                                              uint32_t presentReading)
649 {
650     std::string description;
651     std::string source;
652     std::stringstream strStream;
653     PCIeHotPlugEventRecord_t record{presentReading};
654 
655     std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
656     std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
657     log_level logLevel =
658         (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
659 
660     source = prefixMsgStrCreation(tid, sensorId);
661 
662     strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
663               << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
664               << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
665               << "); Device (0x" << std::setw(2)
666               << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
667               << std::setw(2) << static_cast<uint32_t>(record.bits.function)
668               << "); Action (" << sAction << "); Operation status ("
669               << sOpStatus << "); Media slot number (" << std::dec
670               << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
671 
672     description += strStream.str();
673 
674     // Log to Redfish event
675     sendJournalRedfish(source, description, logLevel);
676 }
677 
dimmTrainingFailureToMsg(uint32_t failureInfo)678 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
679 {
680     std::string description;
681     DIMMTrainingFailure_t failure{failureInfo};
682 
683     if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
684     {
685         auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
686 
687         description += std::get<0>(failureInfoMap);
688 
689         description += "; MCU rank index " +
690                        std::to_string(failure.bits.mcuRankIdx);
691 
692         description += "; Slice number " +
693                        std::to_string(failure.bits.sliceNum);
694 
695         description += "; Upper nibble error status: ";
696         description += (!failure.bits.upperNibbStatErr)
697                            ? "No error"
698                            : "Found no rising edge";
699 
700         description += "; Lower nibble error status: ";
701         description += (!failure.bits.lowerNibbStatErr)
702                            ? "No error"
703                            : "Found no rising edge";
704 
705         description += "; Failure syndrome 0: ";
706 
707         auto& syndromeMap = std::get<1>(failureInfoMap);
708         if (syndromeMap.contains(failure.bits.syndrome))
709         {
710             description += syndromeMap[failure.bits.syndrome];
711         }
712         else
713         {
714             description += "(Unknown syndrome)";
715         }
716     }
717     else
718     {
719         description += "Unknown training failure type " +
720                        std::to_string(failure.bits.type);
721     }
722 
723     return description;
724 }
725 
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)726 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
727                                             uint32_t presentReading)
728 {
729     log_level logLevel{log_level::WARNING};
730     std::string description;
731     std::string source;
732     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
733     uint32_t byte012 = presentReading & 0xffffff;
734 
735     source = prefixMsgStrCreation(tid, sensorId);
736 
737     // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
738     auto dimmIdx = sensorIdToDIMMIdx(sensorId);
739     if (dimmIdx >= maxDIMMIdxBitNum)
740     {
741         return;
742     }
743 
744     description += "DIMM " + std::to_string(dimmIdx) + " ";
745 
746     if (dimmStatusToMsgMap.contains(byte3))
747     {
748         if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
749             byte3 == dimm_status::INSTALLED_BUT_DISABLED)
750         {
751             logLevel = log_level::OK;
752         }
753 
754         description += dimmStatusToMsgMap[byte3];
755 
756         if (byte3 == dimm_status::TRAINING_FAILURE)
757         {
758             description += "; " + dimmTrainingFailureToMsg(byte012);
759         }
760         else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
761         {
762             uint8_t byte0 = (byte012 & 0xff);
763             if (byte0 < pmicTempAlertMsg.size())
764             {
765                 description += ": " + pmicTempAlertMsg[byte0];
766             }
767         }
768     }
769     else
770     {
771         switch (byte3)
772         {
773             case dimm_status::PMIC_HIGH_TEMP:
774                 if (byte012 == 0x01)
775                 {
776                     description += "has PMIC high temp condition";
777                 }
778                 break;
779             case dimm_status::TSx_HIGH_TEMP:
780                 switch (byte012)
781                 {
782                     case 0x01:
783                         description += "has TS0";
784                         break;
785                     case 0x02:
786                         description += "has TS1";
787                         break;
788                     case 0x03:
789                         description += "has TS0 and TS1";
790                         break;
791                 }
792                 description += " exceeding their high temperature threshold";
793                 break;
794             case dimm_status::SPD_HUB_HIGH_TEMP:
795                 if (byte012 == 0x01)
796                 {
797                     description += "has SPD/HUB high temp condition";
798                 }
799                 break;
800             default:
801                 description += "has unsupported status " +
802                                std::to_string(byte3);
803                 break;
804         }
805     }
806 
807     // Log to Redfish event
808     sendJournalRedfish(source, description, logLevel);
809 }
810 
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)811 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
812                                            uint32_t presentReading)
813 {
814     log_level logLevel{log_level::WARNING};
815     std::string description;
816     std::string source;
817     uint8_t byte3 = (presentReading & 0xff000000) >> 24;
818     uint32_t byte012 = presentReading & 0xffffff;
819 
820     source = prefixMsgStrCreation(tid, sensorId);
821 
822     description += "DDR ";
823     if (ddrStatusToMsgMap.contains(byte3))
824     {
825         if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
826         {
827             logLevel = log_level::OK;
828         }
829 
830         description += ddrStatusToMsgMap[byte3];
831 
832         if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
833             byte3 == ddr_status::TRAINING_FAILURE)
834         {
835             // List out failed DIMMs
836             description += dimmIdxsToString(byte012);
837         }
838     }
839     else
840     {
841         description += "has unsupported status " + std::to_string(byte3);
842     }
843 
844     // Log to Redfish event
845     sendJournalRedfish(source, description, logLevel);
846 }
847 
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)848 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
849                                            uint32_t presentReading)
850 {
851     log_level logLevel{log_level::WARNING};
852     std::string description;
853     std::string source;
854     std::stringstream strStream;
855 
856     source = prefixMsgStrCreation(tid, sensorId);
857 
858     VRDStatus_t status{presentReading};
859 
860     if (status.bits.warning && status.bits.critical)
861     {
862         description += "A VR warning and a VR critical";
863         logLevel = log_level::CRITICAL;
864     }
865     else
866     {
867         if (status.bits.warning)
868         {
869             description += "A VR warning";
870         }
871         else if (status.bits.critical)
872         {
873             description += "A VR critical";
874             logLevel = log_level::CRITICAL;
875         }
876         else
877         {
878             description += "No VR warning or critical";
879             logLevel = log_level::OK;
880         }
881     }
882     description += " condition observed";
883 
884     strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
885               << std::setw(2)
886               << static_cast<uint32_t>(status.bits.vr_status_byte_high)
887               << "; VR status byte low is 0x" << std::setw(2)
888               << static_cast<uint32_t>(status.bits.vr_status_byte_low)
889               << "; Reading is 0x" << std::setw(2)
890               << static_cast<uint32_t>(presentReading) << ";";
891 
892     description += strStream.str();
893 
894     // Log to Redfish event
895     sendJournalRedfish(source, description, logLevel);
896 }
897 
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)898 void OemEventManager::handleNumericWatchdogEvent(
899     pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
900 {
901     std::string description;
902     std::string source;
903     log_level logLevel = log_level::CRITICAL;
904 
905     source = prefixMsgStrCreation(tid, sensorId);
906 
907     if (presentReading & 0x01)
908     {
909         description += "Global watchdog expired;";
910     }
911     if (presentReading & 0x02)
912     {
913         description += "Secure watchdog expired;";
914     }
915     if (presentReading & 0x04)
916     {
917         description += "Non-secure watchdog expired;";
918     }
919 
920     // Log to Redfish event
921     sendJournalRedfish(source, description, logLevel);
922 }
923 
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)924 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
925                                             const uint8_t* eventData,
926                                             size_t eventDataSize)
927 {
928     EFI_AMPERE_ERROR_DATA ampHdr;
929 
930     decodeCperRecord(eventData, eventDataSize, &ampHdr);
931 
932     addCperSELLog(tid, eventId, &ampHdr);
933 
934     /* isBert at bit 12 of TypeId */
935     if (ampHdr.TypeId & 0x0800)
936     {
937         lg2::info("Ampere SoC BERT is triggered.");
938         std::variant<std::string> value(
939             "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
940         try
941         {
942             auto& bus = pldm::utils::DBusHandler::getBus();
943             auto method =
944                 bus.new_method_call("com.ampere.CrashCapture.Trigger",
945                                     "/com/ampere/crashcapture/trigger",
946                                     pldm::utils::dbusProperties, "Set");
947             method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
948                           value);
949             bus.call_noreply(method);
950         }
951         catch (const std::exception& e)
952         {
953             lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
954         }
955     }
956 
957     return PLDM_SUCCESS;
958 }
959 
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)960 int OemEventManager::handlepldmMessagePollEvent(
961     const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
962     pldm_tid_t tid, size_t eventDataOffset)
963 {
964     /* This OEM event handler is only used for SoC terminus*/
965     if (!tidToSocketNameMap.contains(tid))
966     {
967         return PLDM_SUCCESS;
968     }
969 
970     auto eventData =
971         reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
972     auto eventDataSize = payloadLength - eventDataOffset;
973 
974     pldm_message_poll_event poll_event{};
975     auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
976                                                   &poll_event);
977     if (rc)
978     {
979         lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
980                    "RC", rc);
981         return rc;
982     }
983 
984     auto sensorID = poll_event.event_id;
985     /* The UE errors */
986     if (rasUESensorIDs.contains(sensorID))
987     {
988         pldm::utils::DBusMapping dbusMapping{
989             "/xyz/openbmc_project/led/groups/ras_ue_fault",
990             "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
991         try
992         {
993             pldm::utils::DBusHandler().setDbusProperty(
994                 dbusMapping, pldm::utils::PropertyValue{bool(true)});
995         }
996         catch (const std::exception& e)
997         {
998             lg2::error(
999                 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
1000                 "TID", tid, "SENSORID", sensorID, "ERROR", e);
1001         }
1002     }
1003 
1004     return PLDM_SUCCESS;
1005 }
1006 
oemPollForPlatformEvent(pldm_tid_t tid)1007 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
1008 {
1009     uint64_t t0 = 0;
1010 
1011     /* This OEM event handler is only used for SoC terminus */
1012     if (!tidToSocketNameMap.contains(tid))
1013     {
1014         co_return PLDM_SUCCESS;
1015     }
1016 
1017     if (!timeStampMap.contains(tid))
1018     {
1019         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1020         timeStampMap.emplace(std::make_pair(tid, t0));
1021     }
1022     else
1023     {
1024         sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1025         uint64_t elapsed = t0 - timeStampMap[tid];
1026         if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1027         {
1028             co_await manager->pollForPlatformEvent(tid, 0, 0);
1029             timeStampMap[tid] = t0;
1030         }
1031     }
1032 
1033     co_return PLDM_SUCCESS;
1034 }
1035 } // namespace oem_ampere
1036 } // namespace pldm
1037